3. The Frontier: a link factory
An example Frontier from the official Heritrix documentation can be found at heritrix-1.12.1/docs/articles/developer_manual/frontier.html:
/**
 * A simple Frontier implementation for tutorial purposes
 */
public class MyFrontier extends ModuleType implements Frontier, FetchStatusCodes {
    // A list of the discovered URIs that should be crawled.
    // Holds the links that have not been fetched yet
    List pendingURIs = new ArrayList();

    // A list of prerequisites that need to be met before any other URI is
    // allowed to be crawled, e.g. DNS lookups.
    // Links in this list take priority over every link in pendingURIs;
    // they represent preconditions that must be satisfied first
    List prerequisites = new ArrayList();

    // A hash of already crawled URIs so that every URI is crawled only once.
    // A HashMap holding the links that have already been fetched
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController.
    CrawlController controller;

    // Flag to note if a URI is being processed.
    boolean uriInProcess = false;

    // top-level stats
    long successCount = 0;          // number of successful downloads
    long failedCount = 0;           // number of failures
    long disregardedCount = 0;      // number of disregarded links
    long totalProcessedBytes = 0;   // total number of bytes downloaded

    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        // Injected by the framework
        this.controller = controller;

        // Initialize the pending queue with the seeds
        // Add the links from the seed file to pendingURIs
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    // Called by the threads in the thread pool to obtain the next link to be processed
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (!uriInProcess && !isEmpty()) {
            uriInProcess = true;
            CrawlURI curi;
            /*
             * First check whether the prerequisites queue has links waiting;
             * if so, handle those first, otherwise look at the pendingURIs
             * queue. In either case the first link in the queue is taken.
             */
            if (!prerequisites.isEmpty()) {
                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
            } else {
                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
            }
            curi.setServer(controller.getServerCache().getServerFor(curi));
            return curi;
        } else {
            wait(timeout);
            return null;
        }
    }

    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    // Adds a new link to the pendingURIs queue to await processing
    public synchronized void schedule(CandidateURI caURI) {
        // Schedule a uri for crawling if it is not already crawled
        /*
         * First check whether the link has already been fetched: if it is
         * already present in the alreadyIncluded HashMap it has been handled
         * before and can be dropped.
         */
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if (caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            // The HashMap uses the URL string as the key and the actual
            // CandidateURI object as the value
            alreadyIncluded.put(caURI.getURIString(), caURI);
        }
    }

    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    public void batchFlush() {
    }

    // Invoked by a thread from the thread pool once a fetch has finished
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        if (cURI.isSuccess()) {
            // Successful download
            successCount++;
            // Accumulate the total number of bytes downloaded
            totalProcessedBytes += cURI.getContentSize();
            // Fire a success event, e.g. so that new URLs extracted by the
            // Extractors get scheduled into the queue
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            // The download has to be deferred
            cURI.processingCleanup();
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            // Other statuses: disregard the current URI
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        cURI.processingCleanup();
    }

    // Returns the number of links discovered so far
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    // Returns the number of links still waiting to be processed
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    // Returns the number of links whose processing has finished
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    // Returns the number of successfully processed links
    public long successfullyFetchedCount() {
        return successCount;
    }

    // Returns the number of failed links
    public long failedFetchCount() {
        return failedCount;
    }

    // Returns the number of disregarded links
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    // Returns the total number of bytes downloaded
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    public String report() {
        return "This frontier does not return a report.";
    }

    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    public FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly) {
        return null;
    }

    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    public long deleteURIs(String match) {
        return 0;
    }
}
Note: the code above is only the most basic implementation; its purpose is to reveal, structurally, what a Frontier does.
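To make that structural role more concrete, the toy sketch below strips the same idea down to plain Java: a pending queue, an "already included" set, and a worker loop that repeatedly asks the frontier for the next link, "fetches" it, and reports the out-links back. None of this is Heritrix API; the class and method names (ToyFrontier, nextOrNull, ToyCrawl) are invented for illustration, the prerequisites queue and all statistics are omitted, and real fetching and extraction are replaced by placeholders.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/** A stripped-down frontier: same structure as MyFrontier, minus the Heritrix types. */
class ToyFrontier {
    private final Deque<String> pending = new ArrayDeque<>();    // URIs waiting to be fetched
    private final Set<String> alreadyIncluded = new HashSet<>();  // URIs seen at least once

    /** Schedule a URI unless it has been seen before (cf. MyFrontier.schedule). */
    synchronized void schedule(String uri) {
        if (alreadyIncluded.add(uri)) {
            pending.addLast(uri);
        }
    }

    /** Hand the next URI to a worker, or null if nothing is queued (cf. MyFrontier.next). */
    synchronized String nextOrNull() {
        return pending.pollFirst();
    }

    /** A worker reports a finished fetch; discovered out-links get scheduled (cf. MyFrontier.finished). */
    synchronized void finished(String uri, Iterable<String> outlinks) {
        for (String link : outlinks) {
            schedule(link);
        }
    }

    synchronized boolean isEmpty() {
        return pending.isEmpty();
    }
}

public class ToyCrawl {
    public static void main(String[] args) {
        ToyFrontier frontier = new ToyFrontier();
        frontier.schedule("http://example.com/");   // seed

        // A single worker thread in miniature: take a URI, pretend to fetch it, report back.
        while (!frontier.isEmpty()) {
            String uri = frontier.nextOrNull();
            System.out.println("fetching " + uri);
            // Pretend the seed page links to two more pages; everything else links to nothing.
            Iterable<String> fakeOutlinks = uri.equals("http://example.com/")
                    ? List.of("http://example.com/a", "http://example.com/b")
                    : List.of();
            frontier.finished(uri, fakeOutlinks);
        }
    }
}

The real Frontier above differs mainly in that next() blocks with wait(timeout) when nothing is available, keeps a separate prerequisites queue, and does bookkeeping in finished(); the schedule/next/finished cycle driven by the worker threads is the same.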