/*******************************************************************************
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.apache.nutch.crawl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.avro.util.Utf8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.mapreduce.GoraOutputFormat;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
/**
 * This class takes a flat file of URLs and adds them to the database of pages
 * to be crawled. Useful for bootstrapping the system.
 * The URL files contain one URL per line, optionally followed by custom metadata
 * separated by tabs, with each metadata key separated from its value by '='. <br>
 * Note that some metadata keys are reserved: <br>
 * - <i>nutch.score</i>: sets a custom score for a specific URL <br>
 * - <i>nutch.fetchInterval</i>: sets a custom fetch interval for a specific URL <br>
 * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
 **/
public class InjectorJob extends NutchTool implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(InjectorJob.class);
private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
private static final Utf8 YES_STRING = new Utf8("y");
static {
FIELDS.add(WebPage.Field.MARKERS);
FIELDS.add(WebPage.Field.STATUS);
}
/** metadata key reserved for setting a custom score for a specific URL */
public static String nutchScoreMDName = "nutch.score";
/**
* metadata key reserved for setting a custom fetchInterval for a specific URL
*/
public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
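/**
 * Phase 1 mapper: reads seed URLs from the flat text files, normalizes and
 * filters them, parses the optional per-URL metadata, and emits a WebPage
 * row keyed by the reversed URL.
 */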
public static class UrlMapper extends
Mapper<LongWritable, Text, String, WebPage> {
private URLNormalizers urlNormalizers;
private int interval;
private float scoreInjected;
private URLFilters filters;
private ScoringFilters scfilters;
private long curTime;
@Override
// read the injection parameters from the job configuration
protected void setup(Context context) throws IOException,
InterruptedException {
urlNormalizers = new URLNormalizers(context.getConfiguration(),
URLNormalizers.SCOPE_INJECT);
interval = context.getConfiguration().getInt("db.fetch.interval.default",
2592000);
filters = new URLFilters(context.getConfiguration());
scfilters = new ScoringFilters(context.getConfiguration());
scoreInjected = context.getConfiguration().getFloat("db.score.injected",
    1.0f); // default injected score (db.score.injected)
curTime = context.getConfiguration().getLong("injector.current.time",
System.currentTimeMillis());
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String url = value.toString();
// if tabs: metadata that could be stored
// must be name=value and separated by \t
float customScore = -1f;
int customInterval = interval;
Map<String, String> metadata = new TreeMap<String, String>();
if (url.indexOf("\t") != -1) {
String[] splits = url.split("\t");
url = splits[0];
for (int s = 1; s < splits.length; s++) {
// find separation between name and value
int indexEquals = splits[s].indexOf("=");
if (indexEquals == -1) {
// skip anything without a =
continue;
}
String metaname = splits[s].substring(0, indexEquals);
String metavalue = splits[s].substring(indexEquals + 1);
if (metaname.equals(nutchScoreMDName)) {
try {
customScore = Float.parseFloat(metavalue);
} catch (NumberFormatException nfe) {
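          // ignore a malformed nutch.score value; the default injected score will be used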
}
} else if (metaname.equals(nutchFetchIntervalMDName)) {
try {
customInterval = Integer.parseInt(metavalue);
} catch (NumberFormatException nfe) {
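          // ignore a malformed nutch.fetchInterval value; the default interval is kept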
}
        } else {
          // note: only metadata keys other than nutch.score and nutch.fetchInterval
          // end up in the metadata map here
          metadata.put(metaname, metavalue);
        }
}
}
try {
        // normalize the url
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        // filter the url
        url = filters.filter(url);
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
url = null;
}
if (url == null)
return;
      // reverse the url to use it as the row key
      String reversedUrl = TableUtil.reverseUrl(url);
      // create a new WebPage row; everything below operates on this row.
      // This mapper sets five fields in total, marked <1> to <5> below.
      WebPage row = new WebPage();
      row.setFetchTime(curTime); // <1> set the fetch time to the current time
      row.setFetchInterval(customInterval); // <2> set the fetch interval, by default 2592000 seconds (30 days) from db.fetch.interval.default
// now add the metadata
Iterator<String> keysIter = metadata.keySet().iterator();
while (keysIter.hasNext()) {
        // add each remaining metadata entry (everything except nutch.score and nutch.fetchInterval) to the row
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
row.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
}
      // <3> set the score
      if (customScore != -1)
        row.setScore(customScore); // custom score given in the seed file, e.g. nutch.score=3.5
      else
        row.setScore(scoreInjected); // default injected score
try {
        // <4> let the scoring filters adjust the injected score (writes the "mtdt:_csh_" column)
        scfilters.injectedScore(url, row);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Cannot filter injected score for url " + url
+ ", using default (" + e.getMessage() + ")");
}
}
      // <5> set the inject marker
      Mark.INJECT_MARK.putMark(row, YES_STRING);
      // note: all the operations on the row so far happen in memory and are not yet written to the table;
      // the actual write happens below
      context.write(reversedUrl, row); // write the row to the webpage table with reversedUrl as the row key
}
}
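/**
 * Phase 2 mapper: scans the web table, removes the inject marker, and
 * initializes the status and fetch schedule of rows that do not yet have a
 * status, i.e. URLs that were newly injected rather than already known.
 */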
public static class InjectorMapper
extends GoraMapper<String, WebPage, String, WebPage> {
private FetchSchedule schedule;
@Override
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
schedule = FetchScheduleFactory.getFetchSchedule(conf);
      // schedule.getFields() returns [fetchInterval, fetchTime, retriesSinceFetch]
// scoreInjected = conf.getFloat("db.score.injected", 1.0f);
}
@Override
protected void map(String key, WebPage row, Context context)
throws IOException, InterruptedException {
if (Mark.INJECT_MARK.checkMark(row) == null) {
return;
}
Mark.INJECT_MARK.removeMark(row);
if (!row.isReadable(WebPage.Field.STATUS.getIndex())) {
        // <6> the STATUS field is unset, so this is a newly injected URL: set its status and initialize its fetch schedule
row.setStatus(CrawlStatus.STATUS_UNFETCHED);
schedule.initializeSchedule(key, row);
// row.setScore(scoreInjected);
}
context.write(key, row);
}
}
public InjectorJob() {
}
public InjectorJob(Configuration conf) {
setConf(conf);
}
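/**
 * Runs the two injection jobs: "inject-p1" converts the seed text files into
 * WebPage rows in the web table, and "inject-p2" scans the table and
 * initializes the rows that were newly injected.
 */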
public Map<String,Object> run(Map<String,Object> args) throws Exception {
getConf().setLong("injector.current.time", System.currentTimeMillis());
Path input;
Object path = args.get(Nutch.ARG_SEEDDIR);
if (path instanceof Path) {
input = (Path)path;
} else {
input = new Path(path.toString());
}
numJobs = 2;
currentJobNum = 0;
status.put(Nutch.STAT_PHASE, "convert input");
currentJob = new NutchJob(getConf(), "inject-p1 " + input);
    // add the seed directory as input
    FileInputFormat.addInputPath(currentJob, input);
    // set the mapper class
    currentJob.setMapperClass(UrlMapper.class);
    // set the map output key class
    currentJob.setMapOutputKeyClass(String.class);
    // set the map output value class
    currentJob.setMapOutputValueClass(WebPage.class);
    // set the output format class
    currentJob.setOutputFormatClass(GoraOutputFormat.class);
    // create the data store; this creates the "webpage" table in HBase (or whichever Gora backend is configured)
DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(),
String.class, WebPage.class);
GoraOutputFormat.setOutput(currentJob, store, true);
    // set the reducer class (the identity Reducer)
    currentJob.setReducerClass(Reducer.class);
    // run as a map-only job
    currentJob.setNumReduceTasks(0);
    // wait for the job to complete
    currentJob.waitForCompletion(true);
    // record the job status
    ToolUtil.recordJobStatus(null, currentJob, results);
    currentJob = null;
    // second job: merge the converted input with the existing db
status.put(Nutch.STAT_PHASE, "merge input with db");
status.put(Nutch.STAT_PROGRESS, 0.5f);
currentJobNum = 1;
currentJob = new NutchJob(getConf(), "inject-p2 " + input);
StorageUtils.initMapperJob(currentJob, FIELDS, String.class,
WebPage.class, InjectorMapper.class);
currentJob.setNumReduceTasks(0);
    currentJob.waitForCompletion(true);
ToolUtil.recordJobStatus(null, currentJob, results);
status.put(Nutch.STAT_PROGRESS, 1.0f);
return results;
}
public void inject(Path urlDir) throws Exception {
LOG.info("InjectorJob: starting");
LOG.info("InjectorJob: urlDir: " + urlDir);
run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));
}
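/**
 * Command-line entry point: the first argument is the seed directory,
 * optionally followed by -crawlId <id>, which sets the crawl id in the
 * configuration.
 */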
@Override
public int run(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Usage: InjectorJob <url_dir> [-crawlId <id>]");
return -1;
}
for (int i = 1; i < args.length; i++) {
if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[i+1]);
i++;
} else {
System.err.println("Unrecognized arg " + args[i]);
return -1;
}
}
try {
inject(new Path(args[0]));
LOG.info("InjectorJob: finished");
      return 0;
} catch (Exception e) {
LOG.error("InjectorJob: " + StringUtils.stringifyException(e));
return -1;
}
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new InjectorJob(), args);
System.exit(res);
}
}