2021SC@SDUSC
Nutch source code analysis: solrindex
This chapter covers the last step of the Nutch workflow: building the index on a Solr server with the command "bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ -dir crawl/segments/ -filter -normalize".
First, look at the relevant fragment of the nutch launcher script:
elif [ "$COMMAND" = "solrindex" ] ; then
  CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
  shift
The solrindex command therefore ends up invoking the main function of IndexingJob, with the first argument, http://localhost:8983/solr, passed along as the solr.server.url property via the -D option.
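To see how that parameter travels, here is a minimal sketch (a hypothetical demo class, not part of Nutch): ToolRunner runs Hadoop's generic option parser before calling run, so a "-D solr.server.url=..." argument ends up as an ordinary Configuration property that downstream code, such as the Solr index writer, can read back.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical demo, not Nutch code: prints the property set via "-D solr.server.url=..."
public class ShowSolrUrl extends Configured implements Tool {
  @Override
  public int run(String[] args) {
    // ToolRunner has already copied the -D options into the Configuration at this point
    System.out.println("solr.server.url = " + getConf().get("solr.server.url"));
    return 0;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new ShowSolrUrl(), args));
  }
}

Invoked with "-D solr.server.url=http://localhost:8983/solr" on the command line, it prints that URL, which mirrors how IndexingJob receives it.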
IndexingJob::main
public static void main(String[] args) throws Exception {
  final int res = ToolRunner.run(NutchConfiguration.create(),
      new IndexingJob(), args);
  System.exit(res);
}

public int run(String[] args) throws Exception {
  // ... command-line parsing that fills crawlDb, linkDb, segments, etc. is omitted ...
  index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
      normalize, addBinaryContent, base64);
  return 0;
}
public void index(Path crawlDb, Path linkDb, List<Path> segments,
    boolean noCommit, boolean deleteGone, String params,
    boolean filter, boolean normalize, boolean addBinaryContent,
    boolean base64) throws IOException {
  final JobConf job = new NutchJob(getConf());
  job.setJobName("Indexer");
  IndexWriters writers = new IndexWriters(getConf());
  IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job, addBinaryContent);
  ...
  final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
      + new Random().nextInt());
  FileOutputFormat.setOutputPath(job, tmp);
  RunningJob indexJob = JobClient.runJob(job);
  writers.open(job, "commit");
  writers.commit();
}
The solrindex command ultimately runs IndexingJob's run function, which calls index. This function first creates the configured index writers (here a SolrIndexWriter) through IndexWriters, then calls initMRJob to set up the Job, points the Job's output at a temporary directory, and finally submits the Job.
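Note that the indexed documents do not actually land in that temporary directory: they leave the job through IndexerOutputFormat and the index writers, so the HDFS output path is just a placeholder that is removed once the job finishes. Below is a sketch of this throwaway-output pattern (a hypothetical helper, not the exact Nutch code; the finally-block cleanup is my assumption about what the elided part of index() does).

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical helper, not Nutch code: run an already-configured job against a
// throwaway output directory and remove the directory afterwards.
public class TmpOutputRunner {
  public static void runWithTempOutput(JobConf job) throws IOException {
    Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);
    try {
      JobClient.runJob(job);            // blocks until the MapReduce job completes
    } finally {
      FileSystem.get(job).delete(tmp, true);  // assumed cleanup, mirroring IndexingJob.index()
    }
  }
}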
IndexerMapReduce::initMRJob
public static void initMRJob(Path crawlDb, Path linkDb,
    Collection<Path> segments, JobConf job, boolean addBinaryContent) {
  for (final Path segment : segments) {
    FileInputFormat.addInputPath(job, new Path(segment,
        CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment,
        CrawlDatum.PARSE_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    if (addBinaryContent) {
      FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    }
  }

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

  if (linkDb != null) {
    Path currentLinkDb = new Path(linkDb, LinkDb.CURRENT_NAME);
    FileInputFormat.addInputPath(job, currentLinkDb);
  }

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(IndexerMapReduce.class);
  job.setReducerClass(IndexerMapReduce.class);

  job.setOutputFormat(IndexerOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputValueClass(NutchWritable.class);
}
This sets the Job's inputs to the crawl_fetch, crawl_parse, parse_data, parse_text and (if binary content is requested) content directories under each segment in crawl/segments/, plus the current directory under crawl/crawldb and, when a linkdb is given, the current directory under crawl/linkdb. Both the Mapper and the Reducer are IndexerMapReduce, and the output format is IndexerOutputFormat. Let's look at each of these in turn.
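All of these inputs are Hadoop SequenceFile/MapFile data keyed by the URL, which is why a single SequenceFileInputFormat covers them. They can be inspected with a small reader such as the sketch below (a hypothetical tool, not Nutch code; the part-00000/data path is an assumption about how the crawl data is laid out on disk).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical inspection tool, not Nutch code: dump the crawldb entries (URL -> CrawlDatum).
public class DumpCrawlDb {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path data = new Path("crawl/crawldb/current/part-00000/data"); // assumed layout
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, data, conf);
    Text key = new Text();               // the URL
    CrawlDatum value = new CrawlDatum(); // its crawl status and metadata
    try {
      while (reader.next(key, value)) {
        System.out.println(key + "\n" + value);
      }
    } finally {
      reader.close();
    }
  }
}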
IndexerMapReduce::map
public void map(Text key, Writable value,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
  String urlString = filterUrl(normalizeUrl(key.toString()));
  if (urlString == null) {
    return;
  } else {
    key.set(urlString);
  }
  output.collect(key, new NutchWritable(value));
}
The map function is straightforward: it normalizes and filters the URL and, if the URL survives, passes the record on to the Reducer unchanged.
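For reference, the sketch below shows what "normalize, then filter" means, using Nutch's URLNormalizers and URLFilters plugin front-ends (a hypothetical standalone demo, not the map code itself; the indexer scope constant is my assumption, and in the real job these steps only run when -normalize and -filter were passed on the command line).

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;

// Hypothetical demo, not Nutch code: normalize a URL, then run it through the filter plugins.
public class NormalizeAndFilterDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_INDEXER);
    URLFilters filters = new URLFilters(conf);

    String raw = "http://Example.COM:80/a/../index.html";
    String normalized = normalizers.normalize(raw, URLNormalizers.SCOPE_INDEXER);
    String accepted = filters.filter(normalized); // null means some filter rejected the URL
    System.out.println(normalized + " -> " + (accepted != null ? "kept" : "dropped"));
  }
}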
IndexerMapReduce::reduce
public void reduce(Text key, Iterator<NutchWritable> values,
    OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
  Inlinks inlinks = null;
  CrawlDatum dbDatum = null;
  CrawlDatum fetchDatum = null;
  Content content = null;
  ParseData parseData = null;
  ParseText parseText = null;

  while (values.hasNext()) {
    final Writable value = values.next().get(); // unwrap
    if (value instanceof Inlinks) {
      inlinks = (Inlinks) value;
    } else if (value instanceof CrawlDatum) {
      final CrawlDatum datum = (CrawlDatum) value;
      if (CrawlDatum.hasDbStatus(datum)) {
        dbDatum = datum;
      } else if (CrawlDatum.hasFetchStatus(datum)) {
        if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
          fetchDatum = datum;
        }
      } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()
          || CrawlDatum.STATUS_SIGNATURE == datum.getStatus()
          || CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
        continue;
      }
    } else if (value instanceof ParseData) {
      parseData = (ParseData) value;
      if (deleteRobotsNoIndex) {
        String robotsMeta = parseData.getMeta("robots");
        if (robotsMeta != null
            && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
          output.collect(key, DELETE_ACTION);
          return;
        }
      }
    } else if (value instanceof ParseText) {
      parseText = (ParseText) value;
    } else if (value instanceof Content) {
      content = (Content) value;
    }
  }
...
  NutchDocument doc = new NutchDocument();
  doc.add("id", key.toString());

  final Metadata metadata = parseData.getContentMeta();
  doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
  doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

  final Parse parse = new ParseImpl(parseText, parseData);

  float boost = 1.0f;
  boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
      inlinks, boost);
  doc.setWeight(boost);
  doc.add("boost", Float.toString(boost));

  fetchDatum.setSignature(dbDatum.getSignature());
  final Text url = (Text) dbDatum.getMetaData().get(
      Nutch.WRITABLE_REPR_URL_KEY);
  String urlString = filterUrl(normalizeUrl(url.toString()));
  url.set(urlString);
  fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);

  doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

  if (content != null) {
    String binary;
    if (base64) {
      binary = Base64.encodeBase64String(content.getContent());
    } else {
      binary = new String(content.getContent());
    }
    doc.add("binaryContent", binary);
  }

  NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
  output.collect(key, action);
}