因为我最后一遍爬取的时候没有将链接数据库CrawlDb进行更新,链接数据库中的url链接信息仍停留在上一次的状态,所以和预期的效果不一样。
解决方案:在最后一遍爬取时,既要更新已存在的url的CrawlDatum,又不能添加本次解析网页获得的链接(因为我们只爬取三层,第四层的链接不需要)。所以需要改动CrawlDb中的update函数,添加一个boolean型的参数,表示是否需要将新链接更新到数据库中:如果为false,就不将Fetcher得到的链接解析结果crawl_parse中的信息添加到update job的输入路径中。代码如下:
/**
 * Merges crawl_fetch data from the given segments into the CrawlDb, and
 * optionally the newly discovered outlinks from crawl_parse as well.
 *
 * @param crawlDb          path of the CrawlDb to update
 * @param segments         segment directories whose fetch (and optionally parse)
 *                         output is merged into the db
 * @param normalize        whether to normalize URLs during the update
 * @param filter           whether to filter URLs during the update
 * @param additionsAllowed whether newly discovered URLs may be added to the db
 * @param force            whether to override an existing lock file on the db
 * @param add              whether to feed each segment's crawl_parse output
 *                         (links discovered by this round's parsing) into the
 *                         update job; pass {@code false} on the final crawl
 *                         round so that links beyond the desired depth are not
 *                         introduced into the db
 * @throws IOException if the update job fails; the lock file and any partial
 *                     job output are removed before the exception is rethrown
 */
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force, boolean add) throws IOException {
  FileSystem fs = FileSystem.get(getConf());
  // Take the db lock up front so concurrent updates cannot corrupt the CrawlDb.
  Path lock = new Path(crawlDb, LOCK_NAME);
  LockUtil.createLockFile(fs, lock, force);
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: starting at " + sdf.format(start));
    LOG.info("CrawlDb update: db: " + crawlDb);
    LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
    LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    LOG.info("CrawlDb update: URL normalizing: " + normalize);
    LOG.info("CrawlDb update: URL filtering: " + filter);
  }
  JobConf job = CrawlDb.createJob(getConf(), crawlDb);
  job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
  job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
  job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
  for (int i = 0; i < segments.length; i++) {
    // crawl_fetch carries the updated CrawlDatum status of fetched URLs;
    // it is always merged so existing entries get their new state.
    Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetch)) {
      FileInputFormat.addInputPath(job, fetch);
    } else {
      LOG.info(" - skipping invalid segment " + segments[i]);
    }
    // crawl_parse carries the outlinks discovered by parsing. It is only
    // considered when 'add' is true; when 'add' is false we skip it
    // deliberately, and must NOT log the segment as invalid (the previous
    // code emitted a spurious "skipping invalid segment" message here for
    // every segment whenever add was false).
    if (add) {
      Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
      if (fs.exists(parse)) {
        FileInputFormat.addInputPath(job, parse);
      } else {
        LOG.info(" - skipping invalid segment " + segments[i]);
      }
    }
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb update: Merging segment data into db.");
  }
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    // Clean up the lock and any partial output so a retry can run, then
    // propagate the failure to the caller.
    LockUtil.removeLockFile(fs, lock);
    Path outPath = FileOutputFormat.getOutputPath(job);
    if (fs.exists(outPath)) fs.delete(outPath, true);
    throw e;
  }
  // Atomically promote the job output to be the new current CrawlDb
  // (this also releases the lock).
  CrawlDb.install(job, crawlDb);
  long end = System.currentTimeMillis();
  LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}