Nutch任务通常从InjectorJob开始,它的作用是从种子文件中取出所有种子存入库中,供之后的任务使用。
InjectorJob类中最重要的是UrlMapper,它是实际的处理类。
/**
 * Mapper that turns each line of a seed file into a {@link WebPage} row.
 * <p>
 * Input lines are urls, optionally followed by tab-separated
 * {@code name=value} metadata (custom score, custom fetch interval, or
 * arbitrary metadata). Each accepted url is normalized, filtered, and
 * emitted keyed by its reversed form so it can be persisted to the web table.
 */
public static class UrlMapper extends
    Mapper<LongWritable, Text, String, WebPage> {
  private URLNormalizers urlNormalizers;  // normalizes urls into canonical form
  private int interval;                   // default re-fetch interval, seconds (default 2592000 = 30 days)
  private float scoreInjected;            // default score for a newly injected page
  private URLFilters filters;             // url filter chain; rejected urls become null
  private ScoringFilters scfilters;       // scoring filter chain
  private long curTime;                   // injection timestamp used as the initial fetch time

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    urlNormalizers = new URLNormalizers(context.getConfiguration(),
        URLNormalizers.SCOPE_INJECT);
    interval = context.getConfiguration().getInt("db.fetch.interval.default",
        2592000);
    filters = new URLFilters(context.getConfiguration());
    scfilters = new ScoringFilters(context.getConfiguration());
    scoreInjected = context.getConfiguration().getFloat("db.score.injected",
        1.0f);
    curTime = context.getConfiguration().getLong("injector.current.time",
        System.currentTimeMillis());
  }

  /**
   * Processes one seed-file line: parses optional metadata, normalizes and
   * filters the url, and writes the resulting {@link WebPage} row.
   *
   * @param key byte offset of the line (unused)
   * @param value one line of the seed file
   */
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String url = value.toString(); // value is one line of text
    // Lines starting with '#' are comments and are ignored.
    if (url != null && url.trim().startsWith("#")) {
      return;
    }
    // A url may carry custom name=value parameters separated by tabs,
    // e.g.: http://example.com/ \t nutch.score=2 \t key=value
    float customScore = -1f;
    int customInterval = interval;
    Map<String, String> metadata = new TreeMap<String, String>();
    if (url.indexOf("\t") != -1) {
      String[] splits = url.split("\t");
      url = splits[0]; // first field is the url itself
      for (int s = 1; s < splits.length; s++) {
        // find separation between name and value
        int indexEquals = splits[s].indexOf("=");
        if (indexEquals == -1) {
          continue; // skip anything without a '='
        }
        String metaname = splits[s].substring(0, indexEquals);
        String metavalue = splits[s].substring(indexEquals + 1);
        if (metaname.equals(nutchScoreMDName)) {
          // custom score for this url
          try {
            customScore = Float.parseFloat(metavalue);
          } catch (NumberFormatException nfe) {
            // FIX: was silently swallowed; log so malformed seed entries are visible
            LOG.warn("Invalid custom score '" + metavalue + "' for url " + url
                + ", using default");
          }
        } else if (metaname.equals(nutchFetchIntervalMDName)) {
          // custom fetch interval for this url, in seconds
          try {
            customInterval = Integer.parseInt(metavalue);
          } catch (NumberFormatException nfe) {
            // FIX: was silently swallowed; log so malformed seed entries are visible
            LOG.warn("Invalid custom fetch interval '" + metavalue
                + "' for url " + url + ", using default");
          }
        } else {
          // any other name=value pair is stored as plain metadata
          metadata.put(metaname, metavalue);
        }
      }
    }
    try {
      // normalize into canonical form, then run the filter chain;
      // a url rejected by a filter comes back as null
      url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
      url = filters.filter(url); // filter the url
    } catch (Exception e) {
      LOG.warn("Skipping " + url + ":" + e);
      url = null;
    }
    if (url == null) {
      // count urls rejected by normalization/filtering
      context.getCounter("injector", "urls_filtered").increment(1);
      return;
    } else { // if it passes
      String reversedUrl = TableUtil.reverseUrl(url); // row key is the reversed url
      WebPage row = new WebPage();
      row.setFetchTime(curTime);
      row.setFetchInterval(customInterval);
      // copy custom metadata into the row.
      // FIX: iterate entrySet (one lookup per entry) instead of keySet + get,
      // and encode values as UTF-8 explicitly instead of relying on the
      // platform default charset.
      for (Map.Entry<String, String> entry : metadata.entrySet()) {
        row.putToMetadata(new Utf8(entry.getKey()),
            ByteBuffer.wrap(entry.getValue().getBytes("UTF-8")));
      }
      if (customScore != -1) {
        row.setScore(customScore);
      } else {
        row.setScore(scoreInjected);
      }
      try {
        scfilters.injectedScore(url, row);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Cannot filter injected score for url " + url
              + ", using default (" + e.getMessage() + ")");
        }
      }
      // count successfully injected urls
      context.getCounter("injector", "urls_injected").increment(1);
      // seed pages are at distance 0; mark the row as injected
      row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
      Mark.INJECT_MARK.putMark(row, YES_STRING);
      // persist the row (written via GoraOutputFormat)
      context.write(reversedUrl, row);
    }
  }
}
InjectorJob类中的run(Map&lt;String,Object&gt; args)方法,Crawler实际调用的就是这个方法。
/**
 * Runs the inject job: reads seed urls from the configured seed directory
 * and writes them into the web table via {@link UrlMapper}. This is the
 * entry point invoked by the Crawler.
 *
 * @param args job arguments; must contain {@code Nutch.ARG_SEEDDIR}
 *        (a {@code Path} or anything whose {@code toString()} is a path)
 * @return the job results map
 * @throws IllegalArgumentException if the seed directory argument is missing
 * @throws Exception if job setup or execution fails
 */
public Map<String,Object> run(Map<String,Object> args) throws Exception {
  // record a single injection timestamp so all mappers agree on fetch time
  getConf().setLong("injector.current.time", System.currentTimeMillis());
  // resolve the seed directory from the arguments
  Object path = args.get(Nutch.ARG_SEEDDIR);
  // FIX: fail with a clear message instead of an opaque NullPointerException
  // from path.toString() when the argument is absent
  if (path == null) {
    throw new IllegalArgumentException(
        "Missing required argument: " + Nutch.ARG_SEEDDIR);
  }
  Path input;
  if (path instanceof Path) {
    input = (Path) path;
  } else {
    input = new Path(path.toString());
  }
  numJobs = 1;
  currentJobNum = 0;
  currentJob = new NutchJob(getConf(), "inject " + input);
  FileInputFormat.addInputPath(currentJob, input);
  currentJob.setMapperClass(UrlMapper.class);      // map-side processing
  currentJob.setMapOutputKeyClass(String.class);
  currentJob.setMapOutputValueClass(WebPage.class);
  currentJob.setOutputFormatClass(GoraOutputFormat.class); // output handler
  DataStore<String, WebPage> store = StorageUtils.createWebStore(
      currentJob.getConfiguration(), String.class, WebPage.class);
  GoraOutputFormat.setOutput(currentJob, store, true);
  // NUTCH-1471 Make explicit which datastore class we use
  Class<? extends DataStore<Object, Persistent>> dataStoreClass =
      StorageUtils.getDataStoreClass(currentJob.getConfiguration());
  LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");
  // map-only job: identity reducer with zero reduce tasks
  currentJob.setReducerClass(Reducer.class);
  currentJob.setNumReduceTasks(0);
  currentJob.waitForCompletion(true);
  ToolUtil.recordJobStatus(null, currentJob, results);
  // NUTCH-1370 Make explicit #URLs injected @runtime
  long urlsInjected = currentJob.getCounters()
      .findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = currentJob.getCounters()
      .findCounter("injector", "urls_filtered").getValue();
  LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info("InjectorJob: total number of urls injected after normalization and filtering: "
      + urlsInjected);
  return results;
}