Heritrix 文档上的一个例子,放在这里备用。


package mypackage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;


/**
 * A simple Frontier implementation for tutorial purposes.
 *
 * <p>Discovered URIs are held in two in-memory FIFO lists (prerequisites
 * first, then ordinary pending URIs) and are handed out strictly one at a
 * time: {@link #next(int)} will not return another URI until
 * {@link #finished(CrawlURI)} has been called for the previous one.
 *
 * <p>All state-changing methods synchronize on this instance. The read-only
 * counters are intentionally left unsynchronized so that callers holding
 * other locks can query them without taking the frontier's monitor.
 */
public class MyFrontier extends ModuleType implements Frontier,
        FetchStatusCodes {
    // A list of the discovered URIs that should be crawled.
    List pendingURIs = new ArrayList();

    // A list of prerequisites that needs to be met before any other URI is
    // allowed to be crawled, e.g. DNS-lookups
    List prerequisites = new ArrayList();

    // A hash of already crawled URIs so that every URI is crawled only once.
    // Keyed by the URI string; the value is the scheduled CandidateURI.
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController.
    CrawlController controller;

    // Flag to note if a URI is being processed.
    boolean uriInProcess = false;

    // top-level stats
    long successCount = 0;
    long failedCount = 0;
    long disregardedCount = 0;
    long totalProcessedBytes = 0;

    /**
     * Creates the frontier module.
     *
     * @param name ignored; the module is always registered under
     *             {@code Frontier.ATTR_NAME}
     */
    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    /**
     * Stores the controller and schedules every seed of the crawl scope.
     *
     * @param controller the controller this frontier belongs to
     * @throws FatalConfigurationException declared by the Frontier contract
     * @throws IOException declared by the Frontier contract
     */
    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        this.controller = controller;

        // Initialize the pending queue with the seeds
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        // Hold the seed list's monitor while iterating over it.
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    /**
     * Hands out the next URI to crawl, preferring prerequisites over
     * ordinary pending URIs.
     *
     * <p>If a URI is already being processed, or nothing is queued, the
     * caller is parked for up to {@code timeout} milliseconds (or until
     * {@link #schedule(CandidateURI)} / {@link #finished(CrawlURI)} signals
     * a state change) and {@code null} is returned; the caller is expected
     * to call again.
     *
     * @param timeout maximum time to wait in milliseconds, passed to
     *                {@link Object#wait(long)}
     * @return the next URI to process, or {@code null} if none could be
     *         handed out on this call
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (!uriInProcess && !isEmpty()) {
            uriInProcess = true;
            CrawlURI curi;
            if (!prerequisites.isEmpty()) {
                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
            } else {
                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
            }
            curi.setServer(controller.getServerCache().getServerFor(curi));
            return curi;
        } else {
            wait(timeout);
            return null;
        }
    }

    /**
     * @return {@code true} if neither list holds a URI waiting to be crawled
     */
    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    /**
     * Queues a URI for crawling unless a URI with the same string has
     * already been included.
     *
     * @param caURI the candidate to schedule
     */
    public synchronized void schedule(CandidateURI caURI) {
        // Schedule a uri for crawling if it is not already crawled
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if (caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            alreadyIncluded.put(caURI.getURIString(), caURI);
            // Wake threads parked in next(): without this they would sleep
            // for the whole timeout even though work is now available.
            notifyAll();
        }
    }

    /**
     * Schedules the URI immediately; this frontier does no batching.
     *
     * @param caURI the candidate to schedule
     */
    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    /**
     * No-op, since {@link #batchSchedule(CandidateURI)} never defers work.
     */
    public void batchFlush() {
    }

    /**
     * Records the outcome of a processed URI and frees the frontier to hand
     * out the next one.
     *
     * <p>Successful, disregarded and failed URIs update the matching counter
     * and fire the matching controller event; deferred URIs are put back on
     * the queue.
     *
     * @param cURI the URI whose processing has completed
     */
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        // Signal threads parked in next() that a URI may be handed out again.
        // They cannot proceed until this method releases the monitor, so the
        // bookkeeping below is still applied first.
        notifyAll();
        if (cURI.isSuccess()) {
            successCount++;
            totalProcessedBytes += cURI.getContentSize();
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            // Clean up before re-queuing, then forget the URI so that
            // schedule() accepts it again.
            cURI.processingCleanup();
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        // NOTE(review): runs a second time for deferred URIs; kept as in the
        // original -- presumably harmless, confirm against CrawlURI.
        cURI.processingCleanup();
    }

    /**
     * @return the number of distinct URI strings currently tracked as
     *         already included
     */
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    /**
     * @return the number of URIs waiting in either queue
     */
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    /**
     * @return the sum of the success, failure and disregard counters
     */
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    /**
     * @return the number of URIs finished successfully
     */
    public long successfullyFetchedCount() {
        return successCount;
    }

    /**
     * @return the number of URIs finished as failures
     */
    public long failedFetchCount() {
        return failedCount;
    }

    /**
     * @return the number of URIs finished as disregarded
     */
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    /**
     * @return the accumulated content size of all successfully finished URIs
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * @return a fixed placeholder text; this frontier produces no report
     */
    public String report() {
        return "This frontier does not return a report.";
    }

    /**
     * Not supported by this frontier.
     *
     * @param pathToLog unused
     * @throws IOException never thrown by this implementation
     * @throws UnsupportedOperationException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented.
     *
     * @param regexpr unused
     * @param inCacheOnly unused
     * @return always {@code null}
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @param marker unused
     * @param numberOfMatches unused
     * @param verbose unused
     * @return always {@code null}
     * @throws InvalidFrontierMarkerException never thrown by this
     *         implementation
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    /**
     * Not implemented; nothing is ever deleted.
     *
     * @param match unused
     * @return always {@code 0}
     */
    public long deleteURIs(String match) {
        return 0;
    }

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值