3. The Frontier: a link factory
An example Frontier from the official Heritrix documentation can be found at heritrix-1.12.1/docs/articles/developer_manual/frontier.html:
/**
 * A simple Frontier implementation for tutorial purposes
 */
public class MyFrontier extends ModuleType implements Frontier, FetchStatusCodes {
    // A list of the discovered URIs that should be crawled.
    // Holds the links that have not been fetched yet
    List pendingURIs = new ArrayList();

    // A list of prerequisites that need to be met before any other URI is
    // allowed to be crawled, e.g. DNS lookups.
    // Links in this list take priority over every link in pendingURIs;
    // they represent preconditions that must be satisfied first
    List prerequisites = new ArrayList();

    // A hash of already crawled URIs so that every URI is crawled only once.
    // A HashMap holding the links that have already been fetched
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController.
    CrawlController controller;

    // Flag to note if a URI is being processed.
    boolean uriInProcess = false;

    // top-level stats
    long successCount = 0;          // number of successful downloads
    long failedCount = 0;           // number of failures
    long disregardedCount = 0;      // number of disregarded links
    long totalProcessedBytes = 0;   // total number of bytes downloaded

    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        // Injected by the framework
        this.controller = controller;

        // Initialize the pending queue with the seeds
        // Add the links from the seed file to pendingURIs
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    // Called by the threads in the thread pool to obtain the next link to be processed
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (!uriInProcess && !isEmpty()) {
            uriInProcess = true;
            CrawlURI curi;
            /*
             * First check whether the prerequisites queue has links waiting;
             * if so, handle those first, otherwise look at the pendingURIs
             * queue. In either case the first link in the queue is taken.
             */
            if (!prerequisites.isEmpty()) {
                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
            } else {
                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
            }
            curi.setServer(controller.getServerCache().getServerFor(curi));
            return curi;
        } else {
            wait(timeout);
            return null;
        }
    }

    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    // Adds a new link to the pendingURIs queue to await processing
    public synchronized void schedule(CandidateURI caURI) {
        // Schedule a uri for crawling if it is not already crawled
        /*
         * First check whether the link has already been fetched: if it is
         * already present in the alreadyIncluded HashMap it has been handled
         * before and can be dropped.
         */
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if (caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            // The HashMap uses the URL string as the key and the actual
            // CandidateURI object as the value
            alreadyIncluded.put(caURI.getURIString(), caURI);
        }
    }

    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    public void batchFlush() {
    }

    // Invoked by a thread from the thread pool once a fetch has finished
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        if (cURI.isSuccess()) {
            // Successful download
            successCount++;
            // Accumulate the total number of bytes downloaded
            totalProcessedBytes += cURI.getContentSize();
            // Fire a success event, e.g. so that new URLs extracted by the
            // Extractors get scheduled into the queue
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            // The download has to be deferred
            cURI.processingCleanup();
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            // Other statuses: disregard the current URI
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        cURI.processingCleanup();
    }

    // Returns the number of links discovered so far
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    // Returns the number of links still waiting to be processed
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    // Returns the number of links whose processing has finished
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    // Returns the number of successfully processed links
    public long successfullyFetchedCount() {
        return successCount;
    }

    // Returns the number of failed links
    public long failedFetchCount() {
        return failedCount;
    }

    // Returns the number of disregarded links
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    // Returns the total number of bytes downloaded
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    public String report() {
        return "This frontier does not return a report.";
    }

    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    public FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly) {
        return null;
    }

    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    public long deleteURIs(String match) {
        return 0;
    }
}
Note: the code above is only the most basic implementation; its purpose is to reveal, structurally, what a Frontier does.
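To make that structural role more concrete, the toy sketch below strips the same idea down to plain Java: a pending queue, an "already included" set, and a worker loop that repeatedly asks the frontier for the next link, "fetches" it, and reports the out-links back. None of this is Heritrix API; the class and method names (ToyFrontier, nextOrNull, ToyCrawl) are invented for illustration, the prerequisites queue and all statistics are omitted, and real fetching and extraction are replaced by placeholders.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** A stripped-down frontier: same structure as MyFrontier, minus the Heritrix types. */
class ToyFrontier {
    private final Deque<String> pending = new ArrayDeque<>();    // URIs waiting to be fetched
    private final Set<String> alreadyIncluded = new HashSet<>();  // URIs seen at least once

    /** Schedule a URI unless it has been seen before (cf. MyFrontier.schedule). */
    synchronized void schedule(String uri) {
        if (alreadyIncluded.add(uri)) {
            pending.addLast(uri);
        }
    }

    /** Hand the next URI to a worker, or null if nothing is queued (cf. MyFrontier.next). */
    synchronized String nextOrNull() {
        return pending.pollFirst();
    }

    /** A worker reports a finished fetch; discovered out-links get scheduled (cf. MyFrontier.finished). */
    synchronized void finished(String uri, Iterable<String> outlinks) {
        for (String link : outlinks) {
            schedule(link);
        }
    }

    synchronized boolean isEmpty() {
        return pending.isEmpty();
    }
}

public class ToyCrawl {
    public static void main(String[] args) {
        ToyFrontier frontier = new ToyFrontier();
        frontier.schedule("http://example.com/");   // seed

        // A single worker thread in miniature: take a URI, pretend to fetch it, report back.
        while (!frontier.isEmpty()) {
            String uri = frontier.nextOrNull();
            System.out.println("fetching " + uri);
            // Pretend the seed page links to two more pages; everything else links to nothing.
            Iterable<String> fakeOutlinks = uri.equals("http://example.com/")
                    ? List.of("http://example.com/a", "http://example.com/b")
                    : List.of();
            frontier.finished(uri, fakeOutlinks);
        }
    }
}

The real Frontier above differs mainly in that next() blocks with wait(timeout) when nothing is available, keeps a separate prerequisites queue, and does bookkeeping in finished(); the schedule/next/finished cycle driven by the worker threads is the same.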