Nutch任务通常从InjectorJob开始,它的作用是从种子文件中取出所有种子存入库中,供之后的任务使用。
InjectorJob类中最重要的是UrlMapper,它是实际的处理类。
/**
 * Mapper that turns each line of a seed file into a {@link WebPage} row.
 * <p>
 * Input lines are urls, optionally followed by tab-separated
 * {@code name=value} metadata (custom score, custom fetch interval, or
 * arbitrary metadata). Each accepted url is normalized, filtered, and
 * emitted keyed by its reversed form so it can be persisted to the web table.
 */
public static class UrlMapper extends
    Mapper<LongWritable, Text, String, WebPage> {
  private URLNormalizers urlNormalizers;  // normalizes urls into canonical form
  private int interval;                   // default re-fetch interval, seconds (default 2592000 = 30 days)
  private float scoreInjected;            // default score for a newly injected page
  private URLFilters filters;             // url filter chain; rejected urls become null
  private ScoringFilters scfilters;       // scoring filter chain
  private long curTime;                   // injection timestamp used as the initial fetch time

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    urlNormalizers = new URLNormalizers(context.getConfiguration(),
        URLNormalizers.SCOPE_INJECT);
    interval = context.getConfiguration().getInt("db.fetch.interval.default",
        2592000);
    filters = new URLFilters(context.getConfiguration());
    scfilters = new ScoringFilters(context.getConfiguration());
    scoreInjected = context.getConfiguration().getFloat("db.score.injected",
        1.0f);
    curTime = context.getConfiguration().getLong("injector.current.time",
        System.currentTimeMillis());
  }

  /**
   * Processes one seed-file line: parses optional metadata, normalizes and
   * filters the url, and writes the resulting {@link WebPage} row.
   *
   * @param key byte offset of the line (unused)
   * @param value one line of the seed file
   */
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String url = value.toString(); // value is one line of text
    // Lines starting with '#' are comments and are ignored.
    if (url != null && url.trim().startsWith("#")) {
      return;
    }
    // A url may carry custom name=value parameters separated by tabs,
    // e.g.: http://example.com/ \t nutch.score=2 \t key=value
    float customScore = -1f;
    int customInterval = interval;
    Map<String, String> metadata = new TreeMap<String, String>();
    if (url.indexOf("\t") != -1) {
      String[] splits = url.split("\t");
      url = splits[0]; // first field is the url itself
      for (int s = 1; s < splits.length; s++) {
        // find separation between name and value
        int indexEquals = splits[s].indexOf("=");
        if (indexEquals == -1) {
          continue; // skip anything without a '='
        }
        String metaname = splits[s].substring(0, indexEquals);
        String metavalue = splits[s].substring(indexEquals + 1);
        if (metaname.equals(nutchScoreMDName)) {
          // custom score for this url
          try {
            customScore = Float.parseFloat(metavalue);
          } catch (NumberFormatException nfe) {
            // FIX: was silently swallowed; log so malformed seed entries are visible
            LOG.warn("Invalid custom score '" + metavalue + "' for url " + url
                + ", using default");
          }
        } else if (metaname.equals(nutchFetchIntervalMDName)) {
          // custom fetch interval for this url, in seconds
          try {
            customInterval = Integer.parseInt(metavalue);
          } catch (NumberFormatException nfe) {
            // FIX: was silently swallowed; log so malformed seed entries are visible
            LOG.warn("Invalid custom fetch interval '" + metavalue
                + "' for url " + url + ", using default");
          }
        } else {
          // any other name=value pair is stored as plain metadata
          metadata.put(metaname, metavalue);
        }
      }
    }
    try {
      // normalize into canonical form, then run the filter chain;
      // a url rejected by a filter comes back as null
      url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
      url = filters.filter(url); // filter the url
    } catch (Exception e) {
      LOG.warn("Skipping " + url + ":" + e);
      url = null;
    }
    if (url == null) {
      // count urls rejected by normalization/filtering
      context.getCounter("injector", "urls_filtered").increment(1);
      return;
    } else { // if it passes
      String reversedUrl = TableUtil.reverseUrl(url); // row key is the reversed url
      WebPage row = new WebPage();
      row.setFetchTime(curTime);
      row.setFetchInterval(customInterval);
      // copy custom metadata into the row.
      // FIX: iterate entrySet (one lookup per entry) instead of keySet + get,
      // and encode values as UTF-8 explicitly instead of relying on the
      // platform default charset.
      for (Map.Entry<String, String> entry : metadata.entrySet()) {
        row.putToMetadata(new Utf8(entry.getKey()),
            ByteBuffer.wrap(entry.getValue().getBytes("UTF-8")));
      }
      if (customScore != -1) {
        row.setScore(customScore);
      } else {
        row.setScore(scoreInjected);
      }
      try {
        scfilters.injectedScore(url, row);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Cannot filter injected score for url " + url
              + ", using default (" + e.getMessage() + ")");
        }
      }
      // count successfully injected urls
      context.getCounter("injector", "urls_injected").increment(1);
      // seed pages are at distance 0; mark the row as injected
      row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
      Mark.INJECT_MARK.putMark(row, YES_STRING);
      // persist the row (written via GoraOutputFormat)
      context.write(reversedUrl, row);
    }
  }
}
InjectorJob类中的run(Map&lt;String,Object&gt; args)方法,Crawler实际调用的就是这个方法。
/**
 * Runs the inject job: reads seed urls from the configured seed directory
 * and writes them into the web table via {@link UrlMapper}. This is the
 * entry point invoked by the Crawler.
 *
 * @param args job arguments; must contain {@code Nutch.ARG_SEEDDIR}
 *        (a {@code Path} or anything whose {@code toString()} is a path)
 * @return the job results map
 * @throws IllegalArgumentException if the seed directory argument is missing
 * @throws Exception if job setup or execution fails
 */
public Map<String,Object> run(Map<String,Object> args) throws Exception {
  // record a single injection timestamp so all mappers agree on fetch time
  getConf().setLong("injector.current.time", System.currentTimeMillis());
  // resolve the seed directory from the arguments
  Object path = args.get(Nutch.ARG_SEEDDIR);
  // FIX: fail with a clear message instead of an opaque NullPointerException
  // from path.toString() when the argument is absent
  if (path == null) {
    throw new IllegalArgumentException(
        "Missing required argument: " + Nutch.ARG_SEEDDIR);
  }
  Path input;
  if (path instanceof Path) {
    input = (Path) path;
  } else {
    input = new Path(path.toString());
  }
  numJobs = 1;
  currentJobNum = 0;
  currentJob = new NutchJob(getConf(), "inject " + input);
  FileInputFormat.addInputPath(currentJob, input);
  currentJob.setMapperClass(UrlMapper.class);      // map-side processing
  currentJob.setMapOutputKeyClass(String.class);
  currentJob.setMapOutputValueClass(WebPage.class);
  currentJob.setOutputFormatClass(GoraOutputFormat.class); // output handler
  DataStore<String, WebPage> store = StorageUtils.createWebStore(
      currentJob.getConfiguration(), String.class, WebPage.class);
  GoraOutputFormat.setOutput(currentJob, store, true);
  // NUTCH-1471 Make explicit which datastore class we use
  Class<? extends DataStore<Object, Persistent>> dataStoreClass =
      StorageUtils.getDataStoreClass(currentJob.getConfiguration());
  LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");
  // map-only job: identity reducer with zero reduce tasks
  currentJob.setReducerClass(Reducer.class);
  currentJob.setNumReduceTasks(0);
  currentJob.waitForCompletion(true);
  ToolUtil.recordJobStatus(null, currentJob, results);
  // NUTCH-1370 Make explicit #URLs injected @runtime
  long urlsInjected = currentJob.getCounters()
      .findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = currentJob.getCounters()
      .findCounter("injector", "urls_filtered").getValue();
  LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info("InjectorJob: total number of urls injected after normalization and filtering: "
      + urlsInjected);
  return results;
}