接着看下最后一个Job
................... ................... ................... //判断是否更新 if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir // 产生临时目录 Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); job = new NutchJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); //把生成的segments作为输入 for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); FileInputFormat.addInputPath(job, subGenDir); } //当前的crawldb也作为输入 FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDbUpdater.class); job.setReducerClass(CrawlDbUpdater.class); job.setOutputFormat(MapFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); //输出到临时目录 FileOutputFormat.setOutputPath(job, tempDir2); try { JobClient.runJob(job); //用临时目录替换掉之前的crawldb CrawlDb.install(job, dbDir); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); fs.delete(tempDir2, true); throw e; } fs.delete(tempDir2, true); } LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray);
/** * Update the CrawlDB so that the next generate won't include the same URLs. */ public static class CrawlDbUpdater extends MapReduceBase implements Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> { long generateTime; public void configure(JobConf job) { generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); } public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { output.collect(key, value); } private CrawlDatum orig = new CrawlDatum(); private LongWritable genTime = new LongWritable(0L); public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { genTime.set(0L); //遍历相同url的crawlDatum while (values.hasNext()) { CrawlDatum val = values.next(); //判断是否生成过 if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) { LongWritable gt = (LongWritable) val.getMetaData().get( Nutch.WRITABLE_GENERATE_TIME_KEY); genTime.set(gt.get()); if (genTime.get() != generateTime) { orig.set(val); genTime.set(0L); continue; } } else { orig.set(val); } } if (genTime.get() != 0L) { //设置新的生成时间 orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime); } output.collect(key, orig); } }