publicclass Processor extends ModuleType { //默认的下一个处理器 private Processor defaultNextProcessor =null; /** * Perform processing on the given CrawlURI. * 处理一个链接 * @param curi * @throws InterruptedException */ publicfinalvoid process(CrawlURI curi) throws InterruptedException { // by default, arrange for curi to proceed to next processor //设置当前处理器的下一个处理器 curi.setNextProcessor(getDefaultNextProcessor(curi));
// Check if this processor is enabled before processing try { if (!((Boolean) getAttribute(ATTR_ENABLED, curi)).booleanValue()) { return; } } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); }
/**
 * Set the processor chain that the URI should be working through after
 * finishing this one.
 *
 * @param nextProcessorChain the chain that should be processed after this
 *        one.
 */
public void setNextChain(ProcessorChain nextProcessorChain) {
    this.nextChain = nextProcessorChain;
}
/**
 * Report which processor chain follows this one.
 *
 * @return the chain stored as this chain's successor.
 */
public ProcessorChain getNextProcessorChain() {
    return this.nextChain;
}
/**
 * Report the processor at the head of this chain.
 *
 * @return the first processor in the chain.
 */
public Processor getFirstProcessor() {
    return this.firstProcessor;
}
/** Add a new chain of processors to the chain list. * 将所有的处理器链添加到Map中 * This method takes a map of processors and wraps it in a ProcessorChain * object and adds it to the list of chains. * * @param processorMap the processor map to be added. */ publicvoid addProcessorMap(String name, MapType processorMap) { //由MapType生成一个处理器链 ProcessorChain processorChain =new ProcessorChain(processorMap); ProcessorChain previousChain = getLastChain(); if (previousChain !=null) { //设置下一个处理器链 previousChain.setNextChain(processorChain); } chainList.add(processorChain); chainMap.put(name, processorChain); }
/**
 * Fetch the chain stored at index 0 of the chain list.
 *
 * @return the first processor chain.
 */
public ProcessorChain getFirstChain() {
    final Object head = chainList.get(0);
    return (ProcessorChain) head;
}
(4)ToeThread
为了高效抓取网页，Heritrix 采用了线程池的设计。每一个线程（ToeThread）将依次调用各处理器链中的处理器来处理链接。
Code privatevoid processCrawlUri() throws InterruptedException { currentCuri.setThreadNumber(this.serialNumber); //获取第一个处理器链 currentCuri.setNextProcessorChain(controller.getFirstProcessorChain()); lastStartTime = System.currentTimeMillis(); // System.out.println(currentCuri); try { while (currentCuri.nextProcessorChain() !=null) { setStep(STEP_ABOUT_TO_BEGIN_CHAIN); // Starting on a new processor chain. //设置下一个处理器 currentCuri.setNextProcessor(currentCuri.nextProcessorChain().getFirstProcessor()); currentCuri.setNextProcessorChain(currentCuri.nextProcessorChain().getNextProcessorChain());
while (currentCuri.nextProcessor() !=null) { setStep(STEP_ABOUT_TO_BEGIN_PROCESSOR); Processor currentProcessor = getProcessor(currentCuri.nextProcessor()); currentProcessorName = currentProcessor.getName(); continueCheck(); // long memBefore = (Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory())/1024; //调用处理器处理链接 currentProcessor.process(currentCuri); // long memAfter = (Runtime.getRuntime().totalMemory()-Runtime.getRuntime().freeMemory())/1024; // System.out.println((memAfter-memBefore)+"K in "+currentProcessorName); } } setStep(STEP_DONE_WITH_PROCESSORS); currentProcessorName =""; } catch (RuntimeExceptionWrapper e) { // Workaround to get cause from BDB if(e.getCause() ==null) { e.initCause(e.getCause()); } recoverableProblem(e); } catch (AssertionError ae) { // This risks leaving crawl in fatally inconsistent state, // but is often reasonable for per-Processor assertion problems recoverableProblem(ae); } catch (RuntimeException e) { recoverableProblem(e); } catch (StackOverflowError err) { recoverableProblem(err); } catch (Error err) { // OutOfMemory and any others seriousError(err); }