nutch-2.0源码之InjectorJob-CSDN博客

本文链接：https://blog.csdn.net/lengyue365/article/details/8025257
/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package org.apache.nutch.crawl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.avro.util.Utf8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.mapreduce.GoraOutputFormat;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;

/** This class takes a flat file of URLs and adds them to the set of pages to be
 * crawled.  Useful for bootstrapping the system.
 * The URL files contain one URL per line, optionally followed by custom metadata
 * separated by tabs with the metadata key separated from the corresponding value by '='. <br>
 * Note that some metadata keys are reserved : <br>
 * - <i>nutch.score</i> : allows to set a custom score for a specific URL <br>
 * - <i>nutch.fetchInterval</i> : allows to set a custom fetch interval for a specific URL <br>
 * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
 **/
public class InjectorJob extends NutchTool implements Tool {

  /** Logger shared by the tool and its nested mapper classes. */
  public static final Logger LOG = LoggerFactory.getLogger(InjectorJob.class);

  /** Value stored under the inject mark for every row emitted by {@link UrlMapper}. */
  private static final Utf8 YES_STRING = new Utf8("y");

  /** Metadata key reserved for setting a custom score for a specific URL. */
  public static String nutchScoreMDName = "nutch.score";

  /** Metadata key reserved for setting a custom fetch interval for a specific URL. */
  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";

  /** WebPage fields passed to StorageUtils.initMapperJob for the second job. */
  private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  static {
    FIELDS.add(WebPage.Field.MARKERS);
    FIELDS.add(WebPage.Field.STATUS);
  }

  /**
   * First-phase mapper: converts each line of the seed file into a
   * {@link WebPage} row keyed by the reversed URL.
   *
   * <p>A line is {@code url[\tname=value]...}. The reserved names
   * {@link InjectorJob#nutchScoreMDName} and
   * {@link InjectorJob#nutchFetchIntervalMDName} override the score and the
   * fetch interval; every other pair is stored as page metadata. URLs that are
   * rejected by the normalizers/filters are skipped.
   */
  public static class UrlMapper extends
      Mapper<LongWritable, Text, String, WebPage> {
    private URLNormalizers urlNormalizers;
    private int interval;
    private float scoreInjected;
    private URLFilters filters;
    private ScoringFilters scfilters;
    private long curTime;

    /**
     * Reads the injector settings from the job configuration.
     *
     * @param context task context supplying the configuration
     */
    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      Configuration conf = context.getConfiguration();
      urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_INJECT);
      // Fetch interval used when a seed line does not override it.
      interval = conf.getInt("db.fetch.interval.default", 2592000);
      filters = new URLFilters(conf);
      scfilters = new ScoringFilters(conf);
      // Score used when a seed line does not override it.
      scoreInjected = conf.getFloat("db.score.injected", 1.0f);
      // run(Map) stores this value so that all tasks share one timestamp.
      curTime = conf.getLong("injector.current.time",
          System.currentTimeMillis());
    }

    /**
     * Parses one seed line and emits the corresponding row.
     *
     * @param key position key supplied by the input format (unused)
     * @param value one line of the seed file
     * @param context task context the row is written to
     * @throws IOException if writing the row fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String url = value.toString();

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f; // -1 doubles as the "not specified" sentinel
      int customInterval = interval;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        // A limit of -1 keeps trailing empty fields, so a line made only of
        // tabs still yields splits[0] (an empty URL) instead of an empty array.
        String[] splits = url.split("\t", -1);
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
              // Best effort: keep the default score but leave a trace.
              LOG.warn("Ignoring invalid " + nutchScoreMDName + " value '"
                  + metavalue + "' for url " + url);
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
              // Best effort: keep the default interval but leave a trace.
              LOG.warn("Ignoring invalid " + nutchFetchIntervalMDName
                  + " value '" + metavalue + "' for url " + url);
            }
          } else {
            // Only keys other than the two reserved ones end up in metadata.
            metadata.put(metaname, metavalue);
          }
        }
      }
      try {
        // normalize the url
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        // filter the url
        url = filters.filter(url);
      } catch (Exception e) {
        LOG.warn("Skipping " + url + ":" + e);
        url = null;
      }
      if (url == null) {
        return;
      }

      // The reversed URL is the key the row is written under.
      String reversedUrl = TableUtil.reverseUrl(url);
      // Build the row; the inject phase sets five things on it, numbered below.
      WebPage row = new WebPage();
      // <1> fetch time: the shared injector timestamp
      row.setFetchTime(curTime);
      // <2> fetch interval: per-URL override, else db.fetch.interval.default
      //     (2592000 when that property is unset)
      row.setFetchInterval(customInterval);

      // now add the metadata (everything except the two reserved keys)
      for (Map.Entry<String, String> entry : metadata.entrySet()) {
        // Explicit UTF-8 so stored bytes do not depend on the platform charset.
        row.putToMetadata(new Utf8(entry.getKey()),
            ByteBuffer.wrap(entry.getValue().getBytes("UTF-8")));
      }

      // <3> score: per-URL override (e.g. nutch.score=3.5), else the default
      if (customScore != -1) {
        row.setScore(customScore);
      } else {
        row.setScore(scoreInjected);
      }

      try {
        // <4> let the scoring filters process the injected row
        // NOTE(review): the original annotation says this populates the
        // "mtdt:_csh_" column; that depends on the configured scoring plugin.
        scfilters.injectedScore(url, row);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Cannot filter injected score for url " + url
              + ", using default (" + e.getMessage() + ")");
        }
      }

      // <5> flag the row as freshly injected
      Mark.INJECT_MARK.putMark(row, YES_STRING);
      // Everything above only touched the in-memory row; this call hands it to
      // the job output, keyed by the reversed URL.
      context.write(reversedUrl, row);
    }
  }
  
  /**
   * Second-phase mapper: visits stored rows, and for every row carrying the
   * inject mark removes the mark, initializes rows that have no readable
   * status yet, and writes the row back. Rows without the mark are dropped
   * from the output.
   */
  public static class InjectorMapper 
      extends GoraMapper<String, WebPage, String, WebPage> {
    private FetchSchedule schedule;

    /**
     * Obtains the fetch schedule implementation from the job configuration.
     *
     * @param context task context supplying the configuration
     */
    @Override
    public void setup(Context context) throws IOException {
      // NOTE(review): the original annotation lists schedule.getFields() as
      // [fetchInterval, fetchTime, retriesSinceFetch]; not verifiable here.
      schedule = FetchScheduleFactory.getFetchSchedule(context.getConfiguration());
    }

    /**
     * Processes one stored row.
     *
     * @param key row key
     * @param row stored page
     * @param context task context the row is written to
     */
    @Override
    protected void map(String key, WebPage row, Context context)
        throws IOException, InterruptedException {
      boolean freshlyInjected = Mark.INJECT_MARK.checkMark(row) != null;
      if (freshlyInjected) {
        Mark.INJECT_MARK.removeMark(row);
        boolean statusReadable = row.isReadable(WebPage.Field.STATUS.getIndex());
        if (!statusReadable) {
          // <6> no readable status: mark as unfetched and set up its schedule
          row.setStatus(CrawlStatus.STATUS_UNFETCHED);
          schedule.initializeSchedule(key, row);
        }
        context.write(key, row);
      }
    }

  }

  /**
   * Creates an injector without setting a configuration; one has to be
   * supplied before the job is run (e.g. {@code main} goes through ToolRunner).
   */
  public InjectorJob() {

  }

  /**
   * Creates an injector and hands {@code conf} to {@code setConf}.
   *
   * @param conf configuration to use for the inject jobs
   */
  public InjectorJob(Configuration conf) {
    setConf(conf);
  }

  /**
   * Runs the two inject jobs: phase 1 converts the seed files into rows of the
   * web store ({@link UrlMapper}), phase 2 merges the injected rows with the
   * existing ones ({@link InjectorMapper}).
   *
   * <p>NOTE(review): {@code numJobs}, {@code currentJobNum}, {@code status},
   * {@code currentJob} and {@code results} are not declared in this class;
   * presumably they are inherited from NutchTool.
   *
   * @param args argument map; must contain {@code Nutch.ARG_SEEDDIR} as a
   *          {@link Path} or as an object whose {@code toString()} is the path
   * @return the {@code results} map after both jobs have been recorded
   * @throws IllegalArgumentException if the seed directory argument is missing
   * @throws Exception if setting up or running either job fails
   */
  public Map<String,Object> run(Map<String,Object> args) throws Exception {
    Object path = args.get(Nutch.ARG_SEEDDIR);
    if (path == null) {
      // Fail with a clear message instead of a bare NullPointerException.
      throw new IllegalArgumentException(
          "Missing required argument: " + Nutch.ARG_SEEDDIR);
    }
    // Shared timestamp read back by UrlMapper.setup in every task.
    getConf().setLong("injector.current.time", System.currentTimeMillis());
    Path input;
    if (path instanceof Path) {
      input = (Path)path;
    } else {
      input = new Path(path.toString());
    }
    numJobs = 2;
    currentJobNum = 0;
    status.put(Nutch.STAT_PHASE, "convert input");
    currentJob = new NutchJob(getConf(), "inject-p1 " + input);
    // input: the seed directory
    FileInputFormat.addInputPath(currentJob, input);
    currentJob.setMapperClass(UrlMapper.class);
    // map output key / value types
    currentJob.setMapOutputKeyClass(String.class);
    currentJob.setMapOutputValueClass(WebPage.class);
    currentJob.setOutputFormatClass(GoraOutputFormat.class);
    // Create the web store the rows are written to.
    // NOTE(review): the original annotation says that with HBase this creates
    // a table named "webpage"; that depends on the configured backend.
    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(),
        String.class, WebPage.class);
    GoraOutputFormat.setOutput(currentJob, store, true);

    // Map-only job: a reducer class is set but zero reduce tasks are requested.
    currentJob.setReducerClass(Reducer.class);
    currentJob.setNumReduceTasks(0);
    // block until phase 1 has finished, then record its status
    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);
    currentJob = null;

    // phase 2
    status.put(Nutch.STAT_PHASE, "merge input with db");
    status.put(Nutch.STAT_PROGRESS, 0.5f);
    currentJobNum = 1;
    currentJob = new NutchJob(getConf(), "inject-p2 " + input);
    StorageUtils.initMapperJob(currentJob, FIELDS, String.class,
        WebPage.class, InjectorMapper.class);
    currentJob.setNumReduceTasks(0);
    // The job must actually be submitted: without this call the inject marks
    // are never cleared, no status is assigned, and the status recorded below
    // would belong to a job that never ran.
    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);
    status.put(Nutch.STAT_PROGRESS, 1.0f);
    return results;
  }

  /**
   * Injects the seed URLs found under {@code urlDir} by delegating to
   * {@link #run(Map)} with the directory as the seed-dir argument.
   *
   * @param urlDir directory containing the seed URL files
   * @throws Exception if either inject job fails
   */
  public void inject(Path urlDir) throws Exception {
    LOG.info("InjectorJob: starting");
    LOG.info("InjectorJob: urlDir: " + urlDir);

    Map<String,Object> jobArgs = ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir);
    run(jobArgs);
  }

  /**
   * Command-line entry point used by ToolRunner.
   *
   * @param args {@code <url_dir> [-crawlId <id>]}
   * @return 0 on success, -1 on a usage error or when injection fails
   */
  @Override
  public int run(String[] args) throws Exception {
    final String usage = "Usage: InjectorJob <url_dir> [-crawlId <id>]";
    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }
    for (int i = 1; i < args.length; i++) {
      if ("-crawlId".equals(args[i])) {
        if (i + 1 >= args.length) {
          // The option needs a value; report it rather than fail with an
          // ArrayIndexOutOfBoundsException.
          System.err.println("Missing value for -crawlId");
          System.err.println(usage);
          return -1;
        }
        getConf().set(Nutch.CRAWL_ID_KEY, args[i + 1]);
        i++; // skip the value that was just consumed
      } else {
        System.err.println("Unrecognized arg " + args[i]);
        return -1;
      }
    }

    try {
      inject(new Path(args[0]));
      LOG.info("InjectorJob: finished");
      return 0;
    } catch (Exception e) {
      LOG.error("InjectorJob: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

  /**
   * Launches the injector through ToolRunner with a fresh Nutch configuration
   * and exits the JVM with the tool's return code.
   *
   * @param args command-line arguments, see {@link #run(String[])}
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    InjectorJob injector = new InjectorJob();
    System.exit(ToolRunner.run(conf, injector, args));
  }
}