关于定制Heritrix1.14爬取

最新推荐文章于 2024-07-22 22:58:19 发布

jyjsjd

最新推荐文章于 2024-07-22 22:58:19 发布

阅读量113

点赞数

分类专栏：研究记录文章标签： java

本文链接：https://blog.csdn.net/jyjsjd/article/details/84227504

版权

研究记录专栏收录该内容

9 篇文章 0 订阅

订阅专栏

在网上参考了不少文章，都说可以通过继承FrontierScheduler来定制自己的爬取规则，但我自己试了一下，好像行不通。

下面是我自己写的一个正则式，用于爬取门户网站中的教育新闻。

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;

/**
 * A {@link FrontierScheduler} that refuses to schedule URIs whose text ends
 * with one of a fixed set of static-resource file extensions (images, Flash,
 * office documents, stylesheets and scripts). Every other candidate URI is
 * handed to the frontier.
 */
public class FrontierSchedulerForEduNews extends FrontierScheduler {

    private static final long serialVersionUID = -5178775477602250542L;

    /*
     * Original author's note: this was the intended matching rule, but it
     * did not seem to take effect when used here.
     *
     * String regex="^(https?://)?(edu|learning)\\.(sina)\\.(com)(\\.cn)?([\\/\\w \\. -]*)\\.(s?html)\\/?$";
     * Pattern pattern = Pattern.compile(regex);
     */

    /**
     * Lower-case extensions, including the leading dot, of resources that
     * must not be scheduled. The dot matters: without it a URI that merely
     * ends in the letters "js" or "doc" would be dropped as well.
     */
    private static final String[] SKIPPED_EXTENSIONS = {
        ".jpg", ".gif", ".png", ".bmp", ".jpeg", ".ico", ".swf",
        ".pdf", ".doc", ".ppt", ".xls", ".css", ".js"
    };

    /**
     * Creates the scheduler.
     *
     * @param name processor name, passed through to {@link FrontierScheduler}
     */
    public FrontierSchedulerForEduNews(String name) {
        super(name);
    }

    /**
     * Schedules {@code caUri} with the frontier unless its string form ends
     * with one of {@link #SKIPPED_EXTENSIONS}.
     *
     * @param caUri the candidate URI to consider
     */
    protected void schedule(CandidateURI caUri) {
        // Fixed locale so that lower-casing does not depend on the JVM default
        // (e.g. the Turkish dotless i would break the ".gif"/".ico" checks).
        String uri = caUri.toString().toLowerCase(java.util.Locale.ENGLISH);

        // Original author's note: startsWith() was not usable here.
        for (String extension : SKIPPED_EXTENSIONS) {
            if (uri.endsWith(extension)) {
                return;
            }
        }
        getController().getFrontier().schedule(caUri);
    }
}

后来我发现，继承Extractor是可以实现定制爬取的。

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.Link;
import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;

/**
 * An {@link Extractor} that scans the fetched content of a URI for
 * {@code <a href="...">} tags and adds every link whose target matches
 * {@link #PATTERN_SOHU_NEWS} to the {@link CrawlURI} as a navigation link.
 */
public class SohuNewsExtractor extends Extractor {

    private static final long serialVersionUID = 1L;

    private static final Logger logger =
            Logger.getLogger(SohuNewsExtractor.class.getName());

    /**
     * Regular expression describing the SOHU news article URL format.
     * NOTE(review): the dots are unescaped and therefore match any character;
     * left unchanged because the constant is public.
     */
    public static final String PATTERN_SOHU_NEWS =
                                    "http://news.sohu.com/[\\d]+/n[\\d]+.shtml";

    /**
     * Regular expression matching {@code <a href=...>} tags. Group 2 holds the
     * quoted href value; it is {@code null} when the unquoted alternative of
     * the expression matched instead.
     */
    public static final String PATTERN_A_HREF =
                                    "<a\\s+href\\s*=\\s*(\"([^\"]*)\"|[^\\s>])\\s*>";

    /** Compiled once; {@link Pattern} instances are immutable and reusable. */
    private static final Pattern SOHU_NEWS = Pattern.compile(PATTERN_SOHU_NEWS);

    /** Compiled once, case-insensitive so that {@code <A HREF=...>} matches too. */
    private static final Pattern A_HREF =
            Pattern.compile(PATTERN_A_HREF, Pattern.CASE_INSENSITIVE);

    /**
     * Creates the extractor with the default description.
     *
     * @param name processor name
     */
    public SohuNewsExtractor(String name) {
        this(name, "Sohu News Extractor");
    }

    /**
     * Creates the extractor.
     *
     * @param name processor name
     * @param description human-readable description of the processor
     */
    public SohuNewsExtractor(String name, String description) {
        super(name, description);
    }

    /**
     * Reads the recorded response of {@code curi}, finds all anchor tags in it
     * and adds those pointing at SOHU news articles as outgoing links.
     * Failures are recorded or logged and never propagated.
     *
     * @param curi the URI whose fetched content is analysed
     */
    protected void extract(CrawlURI curi) {
        // Obtain the recorded response body of the current URI.
        ReplayCharSequence cs = null;
        try {
            HttpRecorder hr = curi.getHttpRecorder();
            if (hr == null) {
                throw new IOException("Why is recorder null here?");
            }
            cs = hr.getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                    "Failed get of replay char sequence " + curi.toString()
                            + " " + e.getMessage());
            logger.log(Level.SEVERE, "Failed get of replay char sequence in "
                    + Thread.currentThread().getName(), e);
        }
        // Nothing was fetched, so there is nothing to analyse.
        if (cs == null) {
            return;
        }

        try {
            Matcher matcher = A_HREF.matcher(cs.toString());
            while (matcher.find()) {
                String newUrl = matcher.group(2);
                // group(2) is null for unquoted hrefs; skipping them keeps one
                // such tag from aborting extraction of the remaining links.
                if (newUrl != null && SOHU_NEWS.matcher(newUrl).matches()) {
                    addLinkFromString(curi, newUrl, "", Link.NAVLINK_HOP);
                }
            }
        } catch (RuntimeException e) {
            // Best effort: a failure on one page must not stop the crawl.
            logger.log(Level.WARNING,
                    "Failed extracting links from " + curi, e);
        } finally {
            // Release the resources backing the replay sequence.
            try {
                cs.close();
            } catch (IOException e) {
                logger.log(Level.WARNING,
                        "Failed close of ReplayCharSequence.", e);
            }
        }
    }

    /**
     * Records {@code uri} as an outgoing link of {@code curi}, resolved
     * relative to the base of {@code curi}.
     *
     * @param curi the URI the link was found in
     * @param uri the link target as found in the content
     * @param context the context the link was found in
     * @param hopType the hop type to record for the link
     */
    private void addLinkFromString(CrawlURI curi, String uri,
            CharSequence context, char hopType) {
        try {
            curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType);
        } catch (URIException e) {
            if (getController() != null) {
                getController().logUriError(e, curi.getUURI(), uri);
            } else {
                logger.info("Failed createAndAddLinkRelativeToBase "
                + curi + ", " + uri + ", " + context + ", "
                + hopType + ": " + e);
            }
        }
    }
}

使用之前要先把这个类写入Extractor规则里，爬取时再选用这个Extractor。

jyjsjd

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
关于定制Heritrix1.14爬取

在网上参考了不少文章说可以继承FrontierScheduler，定制自己的爬取规则。我自己试了一下好像是不行的。如下是我自己写了一个正则式，爬取门户网站中教育新闻。import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.postprocessor.FrontierScheduler...
复制链接

扫一扫