同样的:
package org.apache.nutch.fetcher;
Job
先看它的Job是个什么情况:
// Build the "fetch" job on top of the current configuration.
currentJob = new NutchJob(getConf(), "fetch");

// For politeness, don't permit parallel execution of a single task:
// speculative execution would let the framework start a duplicate of a slow
// reduce task and kill whichever copy finishes last, which would mean the
// same pages being fetched by two tasks at once. So it is switched off.
currentJob.setReduceSpeculativeExecution(false);

// Mapper side: read the required WebPage fields, restricted by the batch-id
// filter, and emit <IntWritable, FetchEntry> pairs routed by
// FetchEntryPartitioner.
Collection<WebPage.Field> fields = getFields(currentJob);
MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId);
StorageUtils.initMapperJob(
    currentJob,
    fields,
    IntWritable.class,
    FetchEntry.class,
    FetcherMapper.class,
    FetchEntryPartitioner.class,
    batchIdFilter,
    false);

// Reducer side: FetcherReducer does the actual fetching.
StorageUtils.initReducerJob(currentJob, FetcherReducer.class);

// An explicit, positive numTasks wins; otherwise fall back to the configured
// "mapred.map.tasks" value (defaulting to the job's current reduce count).
if (numTasks != null && numTasks >= 1) {
  currentJob.setNumReduceTasks(numTasks);
} else {
  currentJob.setNumReduceTasks(
      currentJob.getConfiguration().getInt(
          "mapred.map.tasks", currentJob.getNumReduceTasks()));
}

// Run the job and block until it completes.
currentJob.waitForCompletion(true);
FetchEntry
/**
 * Map-output value of the fetch job: a row key together with its WebPage.
 * NOTE(review): this excerpt shows only the two fields. The constructor,
 * the accessors (getKey / getWebPage) and the Writable serialization methods
 * that other parts of the job rely on are omitted here.
 */
public class FetchEntry extends Configured implements Writable {
// Row key of the page; consumers pass it to TableUtil.unreverseUrl, so it is
// presumably the reversed URL - confirm against the full class.
private String key;
// The page record that travels with the key.
private WebPage page;
}
其中的key就是reversedUrl,没啥说的。
FetchEntryPartitioner
/**
 * Routes each {@code FetchEntry} to a reduce task based on its URL.
 * The map-output key is ignored; only the URL recovered from the entry's
 * row key is handed to the wrapped {@code URLPartitioner}.
 */
public static class FetchEntryPartitioner extends
    Partitioner<IntWritable, FetchEntry> implements Configurable {

  private URLPartitioner partitioner = new URLPartitioner();

  /**
   * @param intWritable map-output key (not used for partitioning)
   * @param fetchEntry  entry whose row key is un-reversed into a URL
   * @param numReduces  number of reduce partitions
   * @return the partition chosen by the URL partitioner
   */
  @Override
  public int getPartition(IntWritable intWritable, FetchEntry fetchEntry,
      int numReduces) {
    return partitioner.getPartition(
        TableUtil.unreverseUrl(fetchEntry.getKey()), numReduces);
  }
}
getPartition先用TableUtil.unreverseUrl把key还原成url,再把分区计算委托给URLPartitioner.getPartition;GeneratorJob中的分区同样是借助URLPartitioner完成的。
FetcherMapper
下面我们看一看它的Mapper类:
/**
 * Mapper of the fetch job. Selects the pages that still have to be fetched
 * and emits each one under a random integer key.
 */
public static class FetcherMapper extends
    GoraMapper<String, WebPage, IntWritable, FetchEntry> {

  /** Resume flag read from RESUME_KEY: when set, already-fetched pages are skipped. */
  private boolean shouldContinue;
  /** Batch id read from the configuration (GeneratorJob.BATCH_ID). */
  private Utf8 batchId;
  /** Source of the random map-output keys. */
  private Random random = new Random();

  /** Reads the batch id and the resume flag from the job configuration. */
  @Override
  protected void setup(Context context) {
    final Configuration jobConf = context.getConfiguration();
    final String configuredBatch =
        jobConf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR);
    batchId = new Utf8(configuredBatch);
    shouldContinue = jobConf.getBoolean(RESUME_KEY, false);
  }

  /**
   * Emits a {@code FetchEntry} for every page that carries a generate mark
   * and, when resuming, does not yet carry a fetch mark.
   *
   * @param key  row key of the page
   * @param page the page record
   */
  @Override
  protected void map(String key, WebPage page, Context context)
      throws IOException, InterruptedException {
    // Skip pages the generator step did not select, and - when resuming -
    // pages that were already fetched.
    final boolean notGenerated = Mark.GENERATE_MARK.checkMark(page) == null;
    if (notGenerated
        || (shouldContinue && Mark.FETCH_MARK.checkMark(page) != null)) {
      return;
    }

    // The key is a random number in [0, 65535], which spreads the mapper
    // output roughly evenly before it reaches the reducers.
    final IntWritable shuffleKey = new IntWritable(random.nextInt(65536));
    final FetchEntry entry =
        new FetchEntry(context.getConfiguration(), key, page);
    context.write(shuffleKey, entry);
  }
}
FetcherReducer
这个Job中最重要的大概就属它的Reducer类了,其中用到了生产者/消费者模型。一个生产者对应于多个消费者。对于该模型,这里是其理论部分,这里是其实践部分。
好了,大致了解了该模型之后,我们来看看该Reducer类中对应的生产者和消费者究竟是谁。
首先,该Reducer类中一共含有五个内部类,分别是FetchItem、FetchItemQueue、FetchItemQueues、FetcherThread和QueueFeeder。其中,FetchItem就是要被抓取的对象;FetchItemQueue里装的是来自同一个host Id(即queueID,可能是proto-hostname、proto-domainname或proto-IP对)的FetchItem;FetchItemQueues则把来自不同host Id的FetchItem分别装进各自的FetchItemQueue,并对这些队列进行统一管理,所以FetchItemQueues的对象就是临界区。QueueFeeder,顾名思义,是生产者,而FetcherThread则是其中的消费者。
下面我们来具体看一看。
FetchItem
/**
 * A single unit of fetch work: the page, its URL (as text and as a parsed
 * {@code URL}) and the id of the queue it belongs to.
 */
private static class FetchItem {
  WebPage page;
  String queueID;
  String url;
  /** Parsed form of {@link #url}. */
  URL u;

  public FetchItem(String url, WebPage page, URL u, String queueID) {
    this.url = url;
    this.u = u;
    this.page = page;
    this.queueID = queueID;
  }

  /**
   * Create an item. Queue id will be created based on <code>queueMode</code>
   * argument, either as a protocol + hostname pair, protocol + IP address
   * pair or protocol+domain pair.
   *
   * @param url       URL of the page, as text
   * @param page      page record to fetch
   * @param queueMode queue mode; compared case-insensitively against
   *                  {@code FetchItemQueues.QUEUE_MODE_IP} and
   *                  {@code FetchItemQueues.QUEUE_MODE_DOMAIN}, anything else
   *                  is treated as "by host"
   * @return the new item, or {@code null} when the URL cannot be parsed or
   *         (in IP mode) its host cannot be resolved
   */
  public static FetchItem create(String url, WebPage page, String queueMode) {
    final URL parsed;
    try {
      parsed = new URL(url);
    } catch (final Exception e) {
      return null;
    }

    final String scheme = parsed.getProtocol().toLowerCase();

    String hostPart;
    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
      // by IP: the queue is keyed by the resolved address
      try {
        hostPart = InetAddress.getByName(parsed.getHost()).getHostAddress();
      } catch (final UnknownHostException e) {
        return null;
      }
    } else {
      // by domain, or by host for every other mode; both fall back to the
      // full external form of the URL when no name is available
      hostPart = FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)
          ? URLUtil.getDomainName(parsed)
          : parsed.getHost();
      if (hostPart == null) {
        hostPart = parsed.toExternalForm();
      }
    }

    // the queue id is the protocol + host-id pair
    return new FetchItem(url, page, parsed,
        scheme + "://" + hostPart.toLowerCase());
  }
}
有了url、WebPage对象和queueMode,就可以通过FetchItem.create创建一个FetchItem对象了;如果url无法解析,或者在byIP模式下主机名无法解析,create会返回null。
FetchItemQueue
/**
 * This class handles FetchItems which come from the same host ID (be it a
 * proto/hostname or proto/IP pair). It also keeps track of requests in
 * progress and elapsed time between requests.
 */
private static class FetchItemQueue {
  /** Items waiting to be fetched; all of them share this queue's id. */
  List<FetchItem> queue =
      Collections.synchronizedList(new LinkedList<FetchItem>());
  /** Items that have been handed out and are currently being fetched. */
  Set<FetchItem> inProgress =
      Collections.synchronizedSet(new HashSet<FetchItem>());
  /** Earliest time (epoch millis) at which the next item may be handed out. */
  AtomicLong nextFetchTime = new AtomicLong();
  /** Delay between requests; applied when maxThreads is 1 or less. */
  long crawlDelay;
  /** Minimum delay between requests; applied when maxThreads is above 1. */
  long minCrawlDelay;
  /** Maximum number of items that may be in progress at the same time. */
  int maxThreads;

  public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay,
      long minCrawlDelay) {
    this.maxThreads = maxThreads;
    this.crawlDelay = crawlDelay;
    this.minCrawlDelay = minCrawlDelay;
    // ready to start
    setEndTime(System.currentTimeMillis() - crawlDelay);
  }

  /** @return number of items still waiting in the queue */
  public int getQueueSize() {
    return queue.size();
  }

  /** @return number of items currently being fetched */
  public int getInProgressSize() {
    return inProgress.size();
  }

  /**
   * Marks an item as done: removes it from the in-progress set and records
   * the end time of the request.
   *
   * @param it   the finished item; {@code null} is ignored
   * @param asap if true, the next fetch is allowed immediately (no delay)
   */
  public void finishFetchItem(FetchItem it, boolean asap) {
    if (it == null) {
      return;
    }
    inProgress.remove(it);
    setEndTime(System.currentTimeMillis(), asap);
  }

  /** Appends an item to the waiting queue; {@code null} is ignored. */
  public void addFetchItem(FetchItem it) {
    if (it != null) {
      queue.add(it);
    }
  }

  /** Adds an item directly to the in-progress set; {@code null} is ignored. */
  @SuppressWarnings("unused")
  public void addInProgressFetchItem(FetchItem it) {
    if (it != null) {
      inProgress.add(it);
    }
  }

  /**
   * Hands out the next waiting item and moves it to the in-progress set.
   *
   * @return the item, or {@code null} when the in-progress set has reached
   *         maxThreads, the next fetch time has not been reached yet, the
   *         queue is empty, or removing the item failed
   */
  public FetchItem getFetchItem() {
    if (inProgress.size() >= maxThreads) {
      return null;
    }
    if (System.currentTimeMillis() < nextFetchTime.get()) {
      return null;
    }
    if (queue.isEmpty()) {
      return null;
    }
    FetchItem next = null;
    try {
      next = queue.remove(0);
      inProgress.add(next);
    } catch (final Exception e) {
    }
    return next;
  }

  /** Writes every waiting item's URL to the log; the queue is not modified. */
  public synchronized void dump() {
    int index = 0;
    while (index < queue.size()) {
      LOG.info(" " + index + ". " + queue.get(index).url);
      index++;
    }
  }

  /** Records an end time with the regular delay applied. */
  private void setEndTime(long endTime) {
    setEndTime(endTime, false);
  }

  /**
   * Computes the next permitted fetch time from the end of a request.
   *
   * @param endTime end of the request, epoch millis
   * @param asap    if true the next fetch time is the end time itself,
   *                otherwise the applicable delay is added
   */
  private void setEndTime(long endTime, boolean asap) {
    if (asap) {
      nextFetchTime.set(endTime);
      return;
    }
    final long delay = maxThreads > 1 ? minCrawlDelay : crawlDelay;
    nextFetchTime.set(endTime + delay);
  }

  /**
   * Drops every waiting item.
   *
   * @return the number of items that were waiting before the queue was cleared
   */
  public synchronized int emptyQueue() {
    final int sizeBefore = queue.size();
    queue.clear();
    return sizeBefore;
  }
}
FetchItemQueues
/**
 * Container for all FetchItemQueue instances, keyed by a String id.
 * NOTE(review): only the fields are shown in this excerpt. The methods and
 * the QUEUE_MODE_* constants that other classes call on this type are
 * omitted here.
 */
private static class FetchItemQueues {
// One FetchItemQueue per String key - presumably the queue id; confirm.
Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
// Item counter, starts at 0 - presumably the total across all queues; confirm.
AtomicInteger totalSize = new AtomicInteger(0);
// The following four presumably mirror the FetchItemQueue constructor
// arguments and the queueMode used by FetchItem.create - confirm.
int maxThreads;
String queueMode;
long crawlDelay;
long minCrawlDelay;
Configuration conf;
// Initialised to -1; presumably "no time limit", matching the -1 check in
// QueueFeeder - confirm.
long timelimit = -1;
}
FetchItemQueues主要就是对FetchItemQueue的一个封装:它用一个Map<String, FetchItemQueue>统一管理所有的FetchItemQueue(这里只列出了它的字段)。
FetcherThread
/**
* This class picks items from queues and fetches the pages.
*/
private class FetcherThread extends Thread {
private final URLFilters urlFilters;
private final URLNormalizers normalizers;
private final ProtocolFactory protocolFactory; /**插件Protocol*/
private final long maxCrawlDelay; /**如果robots.txt中的Crawl-Delay比此值大,则跳过此页面;若设置为-1,则不论robots中延迟为多少都一直等*/
@SuppressWarnings("unused")
private final boolean byIP; /**是否通过byIP方式*/
private String reprUrl;
private final Context context;
private final boolean ignoreExternalLinks;
public FetcherThread(Context context, int num) {
this.setDaemon(true); // don't hang JVM on exit
this.setName("FetcherThread" + num); // use an informative name
this.context = context;
Configuration conf = context.getConfiguration();
this.urlFilters = new URLFilters(conf);
this.protocolFactory = new ProtocolFactory(conf); /**默认使用protocol-http,在nutch-default中的Plugin.includes中设置*/
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000; /**默认30s*/
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true); /**默认是byIP*/
/**
* If true, outlinks leading from a page to external hosts
* will be ignored. This is an effective way to limit the crawl to include
* only initially injected hosts, without creating complex URLFilters.
*/
this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links",
false);
}
@Override
@SuppressWarnings("fallthrough")
public void run() {
activeThreads.incrementAndGet(); // count threads
FetchItem fit = null;
try {
while (true) {
fit = fetchQueues.getFetchItem();
if (fit == null) { /**如果没有获取到FetchItem,则*/
if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) { /**如果不是线程非alive并且不是因为队列为空了,那么就是该旋转等待(spin-wait)*/
}
// spin-wait.
spinWaiting.incrementAndGet();
try {
Thread.sleep(500);
} catch (final Exception e) {
}
spinWaiting.decrementAndGet();
continue;
} else { /**要不然就是全都做完了*/
// all done, finish this thread
return;
}
}
lastRequestStart.set(System.currentTimeMillis()); /**设置上次请求开始时间*/
if (fit.page.getReprUrl() == null) {
reprUrl = fit.url;
} else {
reprUrl = TableUtil.toString(fit.page.getReprUrl());
}
try {
// fetch the page
final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
final BaseRobotRules rules = protocol.getRobotRules(fit.url,
fit.page);
if (!rules.isAllowed(fit.u.toString())) { /**访问被拒*/
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Denied by robots.txt: " + fit.url);
}
output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
CrawlStatus.STATUS_GONE);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) { /**等待时间太久,跳过*/
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.debug("Crawl-Delay for " + fit.url + " too long ("
+ rules.getCrawlDelay() + "), skipping");
output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
CrawlStatus.STATUS_GONE);/**记录*/
continue;
} else { /**重新设置抓取延迟*/
final FetchItemQueue fiq = fetchQueues
.getFetchItemQueue(fit.queueID);
fiq.crawlDelay = rules.getCrawlDelay();
if (LOG.isDebugEnabled()) {
LOG.info("Crawl delay for queue: " + fit.queueID
+ " is set to " + fiq.crawlDelay
+ " as per robots.txt. url: " + fit.url);
}
}
}
final ProtocolOutput output = protocol.getProtocolOutput(fit.url,
fit.page);
final ProtocolStatus status = output.getStatus();
final Content content = output.getContent();
// unblock queue
fetchQueues.finishFetchItem(fit);
context.getCounter("FetcherStatus",
ProtocolStatusUtils.getName(status.getCode())).increment(1);
int length = 0;
if (content != null && content.getContent() != null)
length = content.getContent().length;
updateStatus(length);
/************/
switch (status.getCode()) {
case ProtocolStatusCodes.WOULDBLOCK:
// retry ?
fetchQueues.addFetchItem(fit);
break;
case ProtocolStatusCodes.SUCCESS: // got a page
output(fit, content, status, CrawlStatus.STATUS_FETCHED);
break;
case ProtocolStatusCodes.MOVED: // redirect
case ProtocolStatusCodes.TEMP_MOVED:
byte code;
boolean temp;
if (status.getCode() == ProtocolStatusCodes.MOVED) {
code = CrawlStatus.STATUS_REDIR_PERM;
temp = false;
} else {
code = CrawlStatus.STATUS_REDIR_TEMP;
temp = true;
}
final String newUrl = ProtocolStatusUtils.getMessage(status);
handleRedirect(fit.url, newUrl, temp, FetcherJob.PROTOCOL_REDIR,
fit.page);
output(fit, content, status, code);
break;
case ProtocolStatusCodes.EXCEPTION:
logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
/* FALLTHROUGH */
case ProtocolStatusCodes.RETRY: // retry
case ProtocolStatusCodes.BLOCKED:
output(fit, null, status, CrawlStatus.STATUS_RETRY);
break;
case ProtocolStatusCodes.GONE: // gone
case ProtocolStatusCodes.NOTFOUND:
case ProtocolStatusCodes.ACCESS_DENIED:
case ProtocolStatusCodes.ROBOTS_DENIED:
output(fit, null, status, CrawlStatus.STATUS_GONE);
break;
case ProtocolStatusCodes.NOTMODIFIED:
output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
output(fit, null, status, CrawlStatus.STATUS_RETRY);
}
/************/
} catch (final Throwable t) { // unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
LOG.error("Unexpected error for " + fit.url, t);
output(fit, null, ProtocolStatusUtils.STATUS_FAILED,
CrawlStatus.STATUS_RETRY);
}
}/**end of while(true)*/
} catch (final Throwable e) {
LOG.error("fetcher throwable caught", e);
} finally { /**整个进程结束*/
if (fit != null)
fetchQueues.finishFetchItem(fit);
activeThreads.decrementAndGet(); // count threads
LOG.info("-finishing thread " + getName() + ", activeThreads="
+ activeThreads);
}
}
private void output(FetchItem fit, Content content, ProtocolStatus pstatus,
byte status) throws IOException, InterruptedException {
fit.page.setStatus((int) status);
final long prevFetchTime = fit.page.getFetchTime();
fit.page.setPrevFetchTime(prevFetchTime);
fit.page.setFetchTime(System.currentTimeMillis());
if (pstatus != null) {
fit.page.setProtocolStatus(pstatus);
}
if (content != null) {
fit.page.setContent(ByteBuffer.wrap(content.getContent()));
fit.page.setContentType(new Utf8(content.getContentType()));
fit.page.setBaseUrl(new Utf8(content.getBaseUrl()));
}
Mark.FETCH_MARK.putMark(fit.page, Mark.GENERATE_MARK.checkMark(fit.page));
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
if (!skipTruncated
|| (skipTruncated && !ParserJob.isTruncated(fit.url, fit.page))) {
parseUtil.process(key, fit.page);
}
}
// remove content if storingContent is false. Content is added to fit.page
// above
// for ParseUtil be able to parse it.
if (content != null && !storingContent) {
fit.page.setContent(ByteBuffer.wrap(new byte[0]));
}
context.write(key, fit.page);/**整个Reducer的输出*/
}
}
具体有关程序中 switch (status.getCode())部分各种status标识的含义参见这里。
QueueFeeder
/**
* This class feeds the queues with input items, and re-fills them as items
* are consumed by FetcherThread-s.
*/
private static class QueueFeeder extends Thread {
private final Context context;
private final FetchItemQueues queues;
private final int size;
private Iterator<FetchEntry> currentIter;
boolean hasMore;
private long timelimit = -1;
public QueueFeeder(Context context, FetchItemQueues queues, int size)
throws IOException, InterruptedException {
this.context = context;
this.queues = queues;
this.size = size;
this.setDaemon(true);
this.setName("QueueFeeder");
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
// the value of the time limit is either -1 or the time where it should
// finish
timelimit = context.getConfiguration().getLong("fetcher.timelimit", -1);
}
@Override
public void run() {
int cnt = 0;
int timelimitcount = 0;
try {
while (hasMore) {
if (System.currentTimeMillis() >= timelimit && timelimit != -1) {/**到了时间限制,跳过未处理的*/
// enough .. lets' simply
// read all the entries from the input without processing them
while (currentIter.hasNext()) {
currentIter.next();
timelimitcount++;
}
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
continue;
}
int feed = size - queues.getTotalSize();
if (feed <= 0) {
// queues are full - spin-wait until they have some free space
try {
Thread.sleep(1000);
} catch (final Exception e) {
}
;
continue;
}
if (LOG.isDebugEnabled()) {
LOG.debug("-feeding " + feed + " input urls ...");
}
while (feed > 0 && currentIter.hasNext()) {
FetchEntry entry = currentIter.next();
final String url = TableUtil.unreverseUrl(entry.getKey());
queues.addFetchItem(url, entry.getWebPage());
feed--;
cnt++;
}
if (currentIter.hasNext()) { /**一个list处理完处理另一个list*/
continue; // finish items in current list before reading next key
}
hasMore = context.nextKey();
if (hasMore) {
currentIter = context.getValues().iterator();
}
}
} catch (Exception e) {
return;
}
}
}
FetcherReducer
中的run方法
/**
 * Reducer entry point: starts one QueueFeeder (producer) and threadCount
 * FetcherThread-s (consumers), then loops until no fetcher thread is active.
 * NOTE(review): the body of the wait loop is elided ("...") in this excerpt,
 * so what happens on each iteration is not visible here.
 */
@Override
public void run(Context context) throws IOException, InterruptedException {
// Per-thread queue depth ("fetcher.queue.depth.multiplier", default 50).
int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50);
// The feeder keeps at most threadCount * maxFeedPerThread items queued.
feeder = new QueueFeeder(context, fetchQueues, threadCount
* maxFeedPerThread);
feeder.start();
for (int i = 0; i < threadCount; i++) { // spawn threads
FetcherThread ft = new FetcherThread(context, i);
fetcherThreads.add(ft);
ft.start();
}
do { // wait for threads to exit
...
} while (activeThreads.get() > 0);
}
总之,FetcherJob是nutch中相对来说最核心的一个部分,要想完全吃透它需要大量的实践经验方可。纸上谈兵要不得。
References
Nutch 1.3 学习笔记 5 Fetcher流程
Nutch 1.3 学习笔记 5-1 FetchThread
Nutch 2.0 之 抓取流程简单分析