WARCReader 学习

package com.yunzu.parsewarc;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;

import org.archive.io.ArchiveRecordHeader;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;

/**
 * Small command-line demo that walks a WARC file and prints every
 * {@code response} record it contains.
 */
public class ParseWarc {

	/** WARC file that is read when no location is given on the command line. */
	private static final String DEFAULT_WARC =
			"file:tmp/WEB-20130424144540112-00000-6676~localhost~8443.warc.gz";

	/**
	 * Iterates over the records of a WARC file and, for each record whose
	 * {@code WARC-Type} header is {@code response}, prints the type, the
	 * header fields and the record content decoded as UTF-8.
	 *
	 * @param args optional; when present, {@code args[0]} is passed to
	 *             {@code WARCReaderFactory.get} instead of the built-in
	 *             default location
	 * @throws MalformedURLException propagated from {@code WARCReaderFactory.get}
	 * @throws IOException if the file cannot be opened or a record cannot be read
	 */
	public static void main(String[] args) throws MalformedURLException, IOException {
		String location = (args != null && args.length > 0) ? args[0] : DEFAULT_WARC;
		WARCReader warcreader = WARCReaderFactory.get(location);
		try {
			for (final Iterator<?> i = warcreader.iterator(); i.hasNext();) {
				WARCRecord ar = (WARCRecord) i.next();
				ArchiveRecordHeader arh = ar.getHeader();

				// Constant-first comparison: a record without a WARC-Type
				// header is skipped instead of raising a NullPointerException.
				Object warcType = arh.getHeaderValue("WARC-Type");
				if (!"response".equals(warcType)) {
					continue;
				}

				System.out.println(warcType);
				System.out.println(arh.getHeaderFields());

				long length = arh.getLength();
				if (length < 0 || length > Integer.MAX_VALUE) {
					// A Java array cannot hold this record; skip it rather
					// than fail on a negative array size after the int cast.
					System.err.println("Skipping record with unsupported length: " + length);
					continue;
				}

				byte[] content = new byte[(int) length];
				// A single read() may deliver only part of the record, so keep
				// reading and decode only the bytes that were actually filled.
				int filled = readFully(ar, content);
				String strContent = new String(content, 0, filled, "UTF-8");
				System.out.println(strContent);
				System.out.println(WARCRecord.CONTENT_TYPE);
			}
		} finally {
			// Release the underlying file handle even if a record fails to parse.
			warcreader.close();
		}
	}

	/**
	 * Reads from the record until the buffer is full or the record ends.
	 *
	 * @param record the record to read from
	 * @param buffer destination buffer
	 * @return the number of bytes stored in {@code buffer}
	 * @throws IOException if reading from the record fails
	 */
	private static int readFully(WARCRecord record, byte[] buffer) throws IOException {
		int filled = 0;
		while (filled < buffer.length) {
			int n = record.read(buffer, filled, buffer.length - filled);
			if (n < 0) {
				break; // end of record reached before the buffer was full
			}
			filled += n;
		}
		return filled;
	}

}

编码探测

// Guess the character encoding of the raw record bytes held in `content`
// using ICU4J's CharsetDetector.
CharsetDetector detector = new CharsetDetector();
// Per the ICU4J documentation, input filtering strips text inside angle
// brackets before detection, which suits markup such as HTML.
detector.enableInputFilter(true);
detector.setText(content);

// detect() yields the single best candidate; detectAll() is available when
// the full ranked list of candidates is wanted. detect() is documented to
// return null when nothing matches, so guard before dereferencing.
CharsetMatch matchEncoding = detector.detect();
if (matchEncoding != null) {
    System.out.println(matchEncoding.getName());
} else {
    System.err.println("Unable to detect charset");
}

获取 HTTP 内容

// Split the raw HTTP response in `strContent` into headers and body at the
// first blank line. Both separators are located up front and the EARLIEST
// one wins; otherwise a CRLF-delimited header followed by a body that
// happens to contain an LF-only blank line would be split inside the body.
String strBody = null; // stays null when no header/body separator exists
int lfBreak = strContent.indexOf("\n\n");
int crlfBreak = strContent.indexOf("\n\r\n");

if (lfBreak >= 0 && (crlfBreak < 0 || lfBreak < crlfBreak)) {
    // LF-only blank line: the body starts after the two newline characters.
    strBody = strContent.substring(lfBreak + 2);
    System.out.println("------------ body ------------");
    //System.out.println( strBody );
} else if (crlfBreak >= 0) {
    // "\n\r\n" is the tail of a CRLF blank line: skip those three characters.
    strBody = strContent.substring(crlfBreak + 3);
    System.out.println("------------ body \\r\\n------------");
    //System.out.println( strBody );
} else {
    // No separator found: dump the index (-1) and the content for inspection.
    System.out.println(crlfBreak);
    System.out.println(strContent);
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值