/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.util.*;
import java.text.*;
import org.apache.commons.lang.StringUtils;
// SLF4J logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
import org.apache.nutch.indexer.solr.SolrIndexer;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.fetcher.Fetcher;
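/**
 * One-stop crawl tool: injects the seed URLs into a fresh crawlDb, runs up to
 * <code>depth</code> rounds of generate/fetch/parse/updatedb, inverts the links into a
 * linkDb and, when a Solr URL is supplied, indexes the segments and removes duplicates.
 *
 * A typical invocation might look like the following (the seed directory and Solr URL
 * are illustrative placeholders, not values required by this class):
 *
 * <pre>
 * bin/nutch org.apache.nutch.crawl.Crawl urls -solr http://localhost:8983/solr -depth 3 -topN 50
 * </pre>
 */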
public class Crawl extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(Crawl.class);
private static String getDate() {
return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System.currentTimeMillis()));
}
/* Performs a complete crawl and, when the -solr parameter is given, indexes the results
to Solr, starting from a directory of root URLs. Usage parameters are listed in run() below. */
public static void main(String[] args) throws Exception {
Configuration conf = NutchConfiguration.create();
int res = ToolRunner.run(conf, new Crawl(), args);
System.exit(res);
}
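/**
 * Parses the command-line options, then drives the crawl end to end: inject the seeds,
 * loop over generate/fetch/parse/updatedb up to <code>depth</code> times, invert the
 * links and, if a Solr URL was given, index the result.
 */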
public int run(String[] args) throws Exception {
if (args.length < 1) {
System.out.println("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
return -1;
}
Path rootUrlDir = null;
Path dir = new Path("crawl-" + getDate());
int threads = getConf().getInt("fetcher.threads.fetch", 10);
int depth = 5;
long topN = Long.MAX_VALUE;
String solrUrl = null;
for (int i = 0; i < args.length; i++) {
if ("-dir".equals(args[i])) {
dir = new Path(args[i+1]);
i++;
} else if ("-threads".equals(args[i])) {
threads = Integer.parseInt(args[i+1]);
i++;
} else if ("-depth".equals(args[i])) {
depth = Integer.parseInt(args[i+1]);
i++;
} else if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[i+1]); // topN is a long; parse it as such
i++;
} else if ("-solr".equals(args[i])) {
solrUrl = args[i + 1];
i++;
} else if (args[i] != null) {
rootUrlDir = new Path(args[i]);
}
}
JobConf job = new NutchJob(getConf());
if (solrUrl == null) {
LOG.warn("solrUrl is not set, indexing will be skipped...");
}
FileSystem fs = FileSystem.get(job);
if (LOG.isInfoEnabled()) {
LOG.info("crawl started in: " + dir);
LOG.info("rootUrlDir = " + rootUrlDir);
LOG.info("threads = " + threads);
LOG.info("depth = " + depth);
LOG.info("solrUrl=" + solrUrl);
if (topN != Long.MAX_VALUE)
LOG.info("topN = " + topN);
}
Path crawlDb = new Path(dir + "/crawldb"); // crawl database directory
Path linkDb = new Path(dir + "/linkdb"); // link database directory
Path segments = new Path(dir + "/segments"); // segments directory
Path indexes = new Path(dir + "/indexes"); // indexes directory
Path index = new Path(dir + "/index");
// initialize the crawl components
Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());
Injector injector = new Injector(getConf()); // injects the root URLs into the crawlDb; entry point of the crawl
Generator generator = new Generator(getConf()); // generates the list of URLs to fetch next
Fetcher fetcher = new Fetcher(getConf()); // fetches page content over HTTP; the fetch list and configuration drive the download until the list is exhausted
ParseSegment parseSegment = new ParseSegment(getConf()); // parses a segment; a segment holds the fetch list of one crawl round, the fetched page content, and the data parsed from it
CrawlDb crawlDbTool = new CrawlDb(getConf()); // crawl database tool
LinkDb linkDbTool = new LinkDb(getConf()); // link database tool
// initialize crawlDb
injector.inject(crawlDb, rootUrlDir);
int i;
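// Main crawl loop: each round generates a fetch list from the crawlDb into a new
// segment, fetches it, parses the content (unless the fetcher parses while fetching)
// and folds the fetched/parsed results back into the crawlDb.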
for (i = 0; i < depth; i++) { // generate new segment
Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
if (segs == null) {
LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
break;
}
fetcher.fetch(segs[0], threads); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segs[0]); // parse it, if needed
}
crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
}
if (i > 0) {
linkDbTool.invert(linkDb, segments, true, true, false); // invert links
// index the finished segments with Solr, then remove duplicates
if (solrUrl != null) {
FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
SolrIndexer indexer = new SolrIndexer(getConf());
indexer.indexSolr(solrUrl, crawlDb, linkDb,
Arrays.asList(HadoopFSUtil.getPaths(fstats)));
SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
dedup.setConf(getConf());
dedup.dedup(solrUrl);
}
} else {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
return 0;
}
}