package com.yunzu.parsewarc;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
/**
 * Command-line tool that walks a WARC archive and dumps every record whose
 * {@code WARC-Type} header equals {@code response}.
 */
public class ParseWarc {

    /**
     * Opens the hard-coded WARC file, iterates over its records and, for each
     * {@code response} record, prints the WARC-Type value, the header fields,
     * the record payload decoded as UTF-8 and the {@code CONTENT_TYPE} constant.
     *
     * @param args command-line arguments (unused)
     * @throws MalformedURLException if the archive location cannot be parsed as a URL
     * @throws IOException if the archive cannot be opened or read, or if a record
     *         reports a length that cannot be held in a single byte array
     */
    public static void main(String[] args) throws MalformedURLException, IOException {
        WARCReader warcreader = WARCReaderFactory
                .get("file:tmp/WEB-20130424144540112-00000-6676~localhost~8443.warc.gz");
        try {
            for (final Iterator<?> i = warcreader.iterator(); i.hasNext();) {
                WARCRecord ar = (WARCRecord) i.next();
                ArchiveRecordHeader arh = ar.getHeader();

                // Constant-first comparison: a record without a WARC-Type header is
                // skipped instead of triggering a NullPointerException.
                if (!"response".equals(arh.getHeaderValue("WARC-Type"))) {
                    continue;
                }

                System.out.println(arh.getHeaderValue("WARC-Type"));
                System.out.println(arh.getHeaderFields());

                long length = arh.getLength();
                // A Java array is indexed by int; refuse lengths the cast below would mangle.
                if (length < 0 || length > Integer.MAX_VALUE) {
                    throw new IOException(
                            "Record length " + length + " cannot be buffered in a byte array");
                }

                byte[] content = new byte[(int) length];
                // A single read() call may deliver fewer bytes than requested, so keep
                // reading until the buffer is full or the record ends.
                int filled = readFully(ar, content);

                // Decode only the bytes that were actually read; the untouched tail of
                // the buffer would otherwise show up as NUL characters.
                String strContent = new String(content, 0, filled, "UTF-8");
                System.out.println(strContent);
                System.out.println(WARCRecord.CONTENT_TYPE);
            }
        } finally {
            // Release the underlying file handle even when a record fails to parse.
            warcreader.close();
        }
    }

    /**
     * Reads from {@code record} until {@code buffer} is full or the record reports end of data.
     *
     * @param record the record to read from
     * @param buffer destination for the bytes read
     * @return the number of bytes stored in {@code buffer}
     * @throws IOException if reading from the record fails
     */
    private static int readFully(WARCRecord record, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int count = record.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return filled;
    }
}
// Encoding detection
// Guess the character encoding of the raw bytes held in `content` and print its name.
// NOTE(review): CharsetDetector / CharsetMatch are not imported in the visible file;
// they look like ICU4J's com.ibm.icu.text classes — confirm and add the imports.
CharsetDetector detector = new CharsetDetector();
detector.enableInputFilter(true);
detector.setText(content);
// detect() yields the single best candidate; use detectAll() instead when every
// candidate needs to be inspected.
CharsetMatch matchEncoding = detector.detect();
// detect() may come back with no match at all; only print when there is one.
if (matchEncoding != null) {
    System.out.println(matchEncoding.getName());
}
// Extracting the HTTP content (body)
// Split the decoded record in `strContent` into HTTP headers and body by locating
// the blank line that terminates the header block. Both line-ending styles are
// looked up and the one that occurs FIRST wins: searching for "\n\n" alone first
// would split inside the body whenever the headers use CRLF but the body happens
// to contain an LF-only blank line.
int lfSeparator = strContent.indexOf("\n\n");
int crlfSeparator = strContent.indexOf("\n\r\n");
boolean hasLfSeparator = lfSeparator > 0;
boolean hasCrlfSeparator = crlfSeparator > 0;

// Body text after the header block; stays null when no separator is found.
String strBody = null;
if (hasLfSeparator && (!hasCrlfSeparator || lfSeparator < crlfSeparator)) {
    // Skip the two '\n' characters of the separator.
    strBody = strContent.substring(lfSeparator + 2);
    System.out.println("------------ body ------------");
} else if (hasCrlfSeparator) {
    // Skip the "\n\r\n" separator (the tail of "\r\n\r\n").
    strBody = strContent.substring(crlfSeparator + 3);
    System.out.println("------------ body \\r\\n------------");
} else {
    // No header/body separator found: dump the search result and the raw text.
    System.out.println(crlfSeparator);
    System.out.println(strContent);
}