As before, the package:
package org.apache.nutch.crawl;
Job
Let's start by looking at its Job setup:
currentJob = new NutchJob(getConf(), "generate: " + getConf().get(BATCH_ID));
Collection<WebPage.Field> fields = getFields(currentJob);
/**
* public static <K, V> void initMapperJob(
* Job job,
* Collection<WebPage.Field> fields,
* Class<K> outKeyClass,
* Class<V> outValueClass,
* Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass,
* Class<? extends Partitioner<K, V>> partitionerClass,
* boolean reuseObjects)
*/
StorageUtils.initMapperJob(
currentJob,
fields,
SelectorEntry.class,
WebPage.class,
GeneratorMapper.class,
SelectorEntryPartitioner.class,
true);
StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
currentJob.waitForCompletion(true);
From this we can see that the map output type is <SelectorEntry, WebPage>. WebPage, as covered earlier, is a wrapper around a page's own content plus associated metadata (such as its fetch time). So what does the SelectorEntry class look like?
SelectorEntry
public static class SelectorEntry implements
WritableComparable<SelectorEntry> {
String url;
float score;
/**
* Sort by page score first, higher scores up front; break ties by the
* lexicographic order of the url.
*/
public int compareTo(SelectorEntry se) {
if (se.score > score)
return 1;
else if (se.score == score)
return url.compareTo(se.url);
return -1;
}
/**
* hashCode and equals decide whether two entries are identical; this is
* what deduplication relies on.
*/
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + url.hashCode();
result = prime * result + Float.floatToIntBits(score);
return result;
}
@Override
public boolean equals(Object obj) {
SelectorEntry other = (SelectorEntry) obj;
if (!url.equals(other.url))
return false;
if (Float.floatToIntBits(score) != Float.floatToIntBits(other.score))
return false;
return true;
}
}
First of all, since SelectorEntry implements WritableComparable, it must exhibit both of those traits: Writable means the readFields and write methods, and Comparable means the compareTo method. Moreover, because SelectorEntry serves as the key, and a composite key at that, it has to allow the reduce phase to deduplicate entries, which is exactly what the overridden hashCode and equals methods provide. If this is unfamiliar, look at how Hadoop MapReduce lets you customize its Partitioner, SortComparator, and GroupingComparator classes; the SecondarySort example that ships with MapReduce makes it clear.
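The excerpt above leaves out the Writable half. As a minimal sketch (my own illustration, not the Nutch source), a stand-alone key with the same (url, score) shape would serialize like this:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

// Hypothetical stand-alone key mirroring SelectorEntry's (url, score) pair.
// readFields() must consume fields in exactly the order write() emits them.
public class UrlScoreKey implements WritableComparable<UrlScoreKey> {
  private String url;
  private float score;

  @Override
  public void write(DataOutput out) throws IOException {
    Text.writeString(out, url); // url first...
    out.writeFloat(score);      // ...then score
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    url = Text.readString(in);  // same order as write()
    score = in.readFloat();
  }

  @Override
  public int compareTo(UrlScoreKey other) {
    int byScore = Float.compare(other.score, score); // higher score first
    return byScore != 0 ? byScore : url.compareTo(other.url); // then url order
  }
}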
SelectorEntryPartitioner
Next, let's look at its SelectorEntryPartitioner class, along with the URLPartitioner.getPartition() method it delegates to:
public static class SelectorEntryPartitioner extends
Partitioner<SelectorEntry, WebPage> implements Configurable {
private URLPartitioner partitioner = new URLPartitioner();
@Override
public int getPartition(SelectorEntry selectorEntry, WebPage page,
int numReduces) {
return partitioner.getPartition(selectorEntry.url, numReduces);
}
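// note: the Configurable plumbing (setConf/getConf) is omitted from this
// excerpt; setConf() also passes the configuration on to the embedded
// URLPartitioner.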
}
public int getPartition(String urlString, int numReduceTasks) {
if (numReduceTasks == 1) {
// this check can be removed when we use Hadoop with MAPREDUCE-1287
return 0;
}
int hashCode;
URL url = null;
try {
urlString = normalizers.normalize(urlString,
URLNormalizers.SCOPE_PARTITION);
hashCode = urlString.hashCode(); /** by default, hash the normalized url string */
url = new URL(urlString);
} catch (MalformedURLException e) {
LOG.warn("Malformed URL: '" + urlString + "'");
hashCode = urlString.hashCode();
}
if (url != null) {
if (mode.equals(PARTITION_MODE_HOST)) {
hashCode = url.getHost().hashCode(); /** host mode: hash the url's host */
} else if (mode.equals(PARTITION_MODE_DOMAIN)) {
hashCode = URLUtil.getDomainName(url).hashCode(); /** domain mode: hash the url's domain name */
} else { // MODE IP
try {
InetAddress address = InetAddress.getByName(url.getHost());
hashCode = address.getHostAddress().hashCode(); /** ip mode: hash the resolved host address */
} catch (UnknownHostException e) {
GeneratorJob.LOG.info("Couldn't find IP for host: " + url.getHost());
}
}
}
// make hosts wind up in different partitions on different runs
hashCode ^= seed; /** bitwise xor with a per-run seed, a cheap re-hash */
return (hashCode & Integer.MAX_VALUE) % numReduceTasks; /** assign the SelectorEntry to a reduce task in [0..numReduceTasks-1] */
}
This Partitioner distributes urls across the reduce tasks according to the configured partition mode. If the mode is host, for example, all urls from the same host land in the same task.
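To make the arithmetic concrete, here is a toy reproduction of the host-mode path (my own demo; the seed and task count are made-up values and the normalization step is skipped):

import java.net.URL;

// Toy reproduction of the host-mode hashing above, not the Nutch class itself.
public class PartitionDemo {
  public static void main(String[] args) throws Exception {
    int seed = 42;          // hypothetical per-run seed
    int numReduceTasks = 4; // hypothetical reducer count
    String[] urls = { "http://example.com/a", "http://example.com/b",
        "http://other.org/c" };
    for (String u : urls) {
      int hashCode = new URL(u).getHost().hashCode() ^ seed;
      int partition = (hashCode & Integer.MAX_VALUE) % numReduceTasks;
      // both example.com urls print the same partition number
      System.out.println(u + " -> partition " + partition);
    }
  }
}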
GeneratorMapper
public class GeneratorMapper extends
GoraMapper<String, WebPage, SelectorEntry, WebPage> {
private URLFilters filters; /** plugin */
private URLNormalizers normalizers; /** plugin */
private boolean filter; /** whether to filter urls */
private boolean normalise; /** whether to normalize urls */
private FetchSchedule schedule; /** another plugin */
private ScoringFilters scoringFilters; /** yet another plugin */
private long curTime; /** time of this generate run */
private SelectorEntry entry = new SelectorEntry();
private int maxDistance; /** max allowed shortest-path distance from a seed url */
@Override
public void map(String reversedUrl, WebPage page, Context context)
throws IOException, InterruptedException {
String url = TableUtil.unreverseUrl(reversedUrl);
if (Mark.GENERATE_MARK.checkMark(page) != null) { /** already generated */
return;
}
// filter on distance
if (maxDistance > -1) { /**-1 if unlimited.*/
CharSequence distanceUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
if (distanceUtf8 != null) {
int distance = Integer.parseInt(distanceUtf8.toString());
if (distance > maxDistance) { /** too far from the seed pages, i.e. too loosely related */
return;
}
}
}
// If filtering is on don't generate URLs that don't pass URLFilters
try {
if (normalise) {
url = normalizers.normalize(url,
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
}
if (filter && filters.filter(url) == null)
return;
} catch (URLFilterException e) {
return;
} catch (MalformedURLException e) {
return;
}
// check fetch schedule
if (!schedule.shouldFetch(url, page, curTime)) { /** defaults to org.apache.nutch.crawl.DefaultFetchSchedule */
return;
}
float score = page.getScore();
try {
/**
* Computes a sort value used to rank pages and select the top N.
*/
score = scoringFilters.generatorSortValue(url, page, score);
} catch (ScoringFilterException e) {
// ignore
}
entry.set(url, score);
context.write(entry, page); /** map output: <SelectorEntry(entry), WebPage(page)> */
}
}
This Mapper takes the urls injected during the Inject step as its input, applies the simple screening above, and emits <SelectorEntry, WebPage> pairs as the Reducer's input.
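A side note on the reversed key the map method receives: Nutch keys rows by host-reversed url so that pages from the same domain sort together in the datastore. A quick illustration (the exact reversed form in the comment is my recollection of the format, so treat it as approximate):

import java.net.MalformedURLException;
import org.apache.nutch.util.TableUtil;

public class ReverseUrlDemo {
  public static void main(String[] args) throws MalformedURLException {
    String reversed = TableUtil.reverseUrl("http://www.example.com/news");
    System.out.println(reversed); // approximately "com.example.www:http/news"
    System.out.println(TableUtil.unreverseUrl(reversed)); // the original url again
  }
}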
Back in the Job, notice that StorageUtils.initMapperJob() in turn calls GoraMapper.initMapperJob(), and that method contains one key line:
GoraMapper
//set the input via GoraInputFormat
GoraInputFormat.setInput(job, query, dataStore, reuseObjects);
That is, the job's input comes in through GoraInputFormat.
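Piecing together the signature quoted at the top, StorageUtils.initMapperJob() itself does roughly the following with the currentJob and fields variables from the Job snippet (a paraphrased sketch, not the verbatim Nutch source):

// Paraphrased sketch: open the web table, build a Gora query restricted to
// the requested WebPage fields, then let GoraMapper wire it into the job.
DataStore<String, WebPage> store = StorageUtils.createWebStore(
    currentJob.getConfiguration(), String.class, WebPage.class);
Query<String, WebPage> query = store.newQuery();
query.setFields(StorageUtils.toStringArray(fields)); // read only the needed columns
GoraMapper.initMapperJob(currentJob, query, store,
    SelectorEntry.class, WebPage.class, GeneratorMapper.class,
    SelectorEntryPartitioner.class, true); // this is where GoraInputFormat.setInput(...) runs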
GeneratorReducer
Next, let's take a look at the GeneratorReducer class:
public class GeneratorReducer extends
GoraReducer<SelectorEntry, WebPage, String, WebPage> {
private long limit;
private long maxCount;
protected static long count = 0;
private boolean byDomain = false;
private Map<String, Integer> hostCountMap = new HashMap<String, Integer>();
private Utf8 batchId;
@Override
protected void reduce(SelectorEntry key, Iterable<WebPage> values,
Context context) throws IOException, InterruptedException {
for (WebPage page : values) {
if (count >= limit) { /** stop once the per-reducer limit is reached */
return;
}
if (maxCount > 0) { /** a per-host/domain cap is configured */
String hostordomain;
if (byDomain) {
hostordomain = URLUtil.getDomainName(key.url);
} else {
hostordomain = URLUtil.getHost(key.url);
}
Integer hostCount = hostCountMap.get(hostordomain);
if (hostCount == null) {
hostCountMap.put(hostordomain, 0);
hostCount = 0;
}
if (hostCount >= maxCount) { /** this host/domain has already reached the cap */
return;
}
hostCountMap.put(hostordomain, hostCount + 1);
}
Mark.GENERATE_MARK.putMark(page, batchId); /** mark the page as generated */
page.setBatchId(batchId);
try {
context.write(TableUtil.reverseUrl(key.url), page); /** write <String(reversedUrl), WebPage(page)> to the datastore, e.g. HBase */
} catch (MalformedURLException e) {
context.getCounter("Generator", "MALFORMED_URL").increment(1);
continue;
}
context.getCounter("Generator", "GENERATE_MARK").increment(1);
count++;
}
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
long totalLimit = conf
.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE); /** total number of top urls to generate */
if (totalLimit == Long.MAX_VALUE) {
limit = Long.MAX_VALUE;
} else {
limit = totalLimit / context.getNumReduceTasks(); /** split the total evenly across the reduce tasks */
}
maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2); /** max number of urls per host/domain on one fetchlist */
batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID));
String countMode = conf.get(GeneratorJob.GENERATOR_COUNT_MODE,
GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
if (countMode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) { /** byDomain mode? */
byDomain = true;
}
}
}
We can see that each reduce task emits at most limit <String(reversedUrl), WebPage(page)> pairs; with GENERATOR_TOP_N set to 1000 and four reduce tasks, for instance, each reducer emits at most 250 pages. Dividing the work this way is not necessarily perfectly even, but it is good enough.
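As a hedged sketch of how those knobs are set (the constant names come from the quoted code; the values and the fragment itself are made up for illustration):

import org.apache.hadoop.conf.Configuration;

// Hypothetical tuning of the settings read in GeneratorReducer.setup().
Configuration conf = new Configuration();
conf.setLong(GeneratorJob.GENERATOR_TOP_N, 5000L);    // at most 5000 urls per run
conf.setLong(GeneratorJob.GENERATOR_MAX_COUNT, 100L); // at most 100 urls per...
conf.set(GeneratorJob.GENERATOR_COUNT_MODE,
    GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN);       // ...domain (byDomain mode)
// with 4 reduce tasks, each reducer's limit becomes 5000 / 4 = 1250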
GoraReducer
/**
* Initializes the Reducer, and sets output parameters for the job.
* @param dataStore the datastore used as the output (the most important parameter)
*/
public static <K1, V1, K2, V2 extends Persistent>
void initReducerJob(
Job job,
DataStore<K2,V2> dataStore,
Class<? extends GoraReducer<K1, V1, K2, V2>> reducerClass,
boolean reuseObjects) {
GoraOutputFormat.setOutput(job, dataStore, reuseObjects);
job.setReducerClass(reducerClass);
}
To sum up: GeneratorJob selects the top N pages to crawl from the datastore and places them on the fetch queue (the fetchlist).