Nutch源代码浅析（四）探究fetch的工作原理

最新推荐文章于 2023-04-14 08:02:24 发布

luoyt99

最新推荐文章于 2023-04-14 08:02:24 发布

阅读量1.6k

点赞数

文章标签：工作 parsing hadoop output parallel input

本文链接：https://blog.csdn.net/luoyt99/article/details/6240559

版权

我们来看看fetch是如何工作的，如何启动多线程。

看1084行，创建了一个NutchJob。在nutch中，用到了hadoop库来管理分布式的文件系统和作业处理，NutchJob从JobConf继承而来，作为一个Job的运行参数，其中提供了Job所需的所有信息。

1086行，fetcher的运行线程数是通过JobConf来传递的，而后设置输入路径，输入内容的格式，job.setMapRunnerClass则是设置Map操作的运行类，这是所有fetcher工作的核心部分，在hadoop的runJob的过程中会调用该处注册的RunnerClass。

1098行，利用JobClient的静态函数runJob运行前面所配置的JobConf。

//nutch/src/java/org/apache/nutch/fetcher/Fetcher.java public void fetch(Path segment, int threads, boolean parsing) throws IOException { checkConfiguration(); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: starting at " + sdf.format(start)); LOG.info("Fetcher: segment: " + segment); } // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task // otherwise it keeps trying again if a task fails long timelimit = getConf().getLong("fetcher.timelimit.mins", -1); if (timelimit != -1) { timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); LOG.info("Fetcher Timelimit set for : " + timelimit); getConf().setLong("fetcher.timelimit.mins", timelimit); } JobConf job = new NutchJob(getConf()); job.setJobName("fetch " + segment); job.setInt("fetcher.threads.fetch", threads); job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); job.setBoolean("fetcher.parse", parsing); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); job.setMapRunnerClass(Fetcher.class); FileOutputFormat.setOutputPath(job, segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }

而后进入了hadoop的内部实现，这儿不去深究。我通过阅读hadoop的内部实现，发现这个JobClient.runJob是一个同步的操作，它会等待Job执行完毕后才返回。runJob会执行我们注册的Fetcher.class中的run函数，有两个run函数，注意区分。

//nutch/src/java/org/apache/nutch/fetcher/Fetcher.java public void run(RecordReader<Text, CrawlDatum> input, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { this.output = output; this.reporter = reporter; this.fetchQueues = new FetchItemQueues(getConf()); int threadCount = getConf().getInt("fetcher.threads.fetch", 10); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: threads: " + threadCount); } feeder = new QueueFeeder(input, fetchQueues, threadCount * 50); //feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2); // the value of the time limit is either -1 or the time where it should finish long timelimit = getConf().getLong("fetcher.timelimit.mins", -1); if (timelimit != -1) feeder.setTimeLimit(timelimit); feeder.start(); // set non-blocking & no-robots mode for HTTP protocol plugins. getConf().setBoolean(Protocol.CHECK_BLOCKING, false); getConf().setBoolean(Protocol.CHECK_ROBOTS, false); for (int i = 0; i < threadCount; i++) { // spawn threads new FetcherThread(getConf()).start(); } // select a timeout that avoids a task timeout long timeout = getConf().getInt("mapred.task.timeout", 10*60*1000)/2; do { // wait for threads to exit try { Thread.sleep(1000); } catch (InterruptedException e) {} reportStatus(); LOG.info("-activeThreads=" + activeThreads + ", spinWaiting=" + spinWaiting.get() + ", fetchQueues.totalSize=" + fetchQueues.getTotalSize()); if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) { fetchQueues.dump(); } // check timelimit if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) reporter.incrCounter("FetcherStatus", "hitByTimeLimit", hitByTimeLimit); } // some requests seem to hang, despite all intentions if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { if (LOG.isWarnEnabled()) { LOG.warn("Aborting with "+activeThreads+" hung threads."); } return; } } while (activeThreads.get() > 0); LOG.info("-activeThreads=" + activeThreads); }