Heritrix3.3.0源码阅读 crawler-beans.cxml中处理器链的配置

<!-- 
   PROCESSING CHAINS
    Much of the crawler's work is specified by the sequential 
    application of swappable Processor modules. These Processors
    are collected into three 'chains'. The CandidateChain is applied 
    to URIs being considered for inclusion, before a URI is enqueued
    for collection. The FetchChain is applied to URIs when their 
    turn for collection comes up. The DispositionChain is applied 
    after a URI is fetched and analyzed/link-extracted.
  -->
  
 <!-- CANDIDATE CHAIN --> 
 <!-- first, processors are declared as top-level named beans -->
 <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
 </bean>
 <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
  <!-- <property name="preferenceDepthHops" value="-1" /> -->
  <!-- <property name="preferenceEmbedHops" value="1" /> -->
  <!-- <property name="canonicalizationPolicy"> 
        <ref bean="canonicalizationPolicy" />
       </property> -->
  <!-- <property name="queueAssignmentPolicy"> 
        <ref bean="queueAssignmentPolicy" />
       </property> -->
  <!-- <property name="uriPrecedencePolicy"> 
        <ref bean="uriPrecedencePolicy" />
       </property> -->
  <!-- <property name="costAssignmentPolicy"> 
        <ref bean="costAssignmentPolicy" />
       </property> -->
 </bean>
 <!-- now, processors are assembled into ordered CandidateChain bean -->
 <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
  <property name="processors">
   <list>
    <!-- apply scoping rules to each individual candidate URI... -->
    <ref bean="candidateScoper"/>
    <!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
    <ref bean="preparer"/>
   </list>
  </property>
 </bean>
  
 <!-- FETCH CHAIN --> 
 <!-- first, processors are declared as top-level named beans -->
 <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
  <!-- <property name="recheckScope" value="false" /> -->
  <!-- <property name="blockAll" value="false" /> -->
  <!-- <property name="blockByRegex" value="" /> -->
  <!-- <property name="allowByRegex" value="" /> -->
 </bean>
 <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
  <!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
  <!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
  <!-- <property name="calculateRobotsOnly" value="false" /> -->
 </bean>
 <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
  <!-- <property name="acceptNonDnsResolves" value="false" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
 </bean>
 <!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
       <property name="specialQueryTemplates">
        <map>
         <entry key="whois.verisign-grs.com" value="domain %s" />
         <entry key="whois.arin.net" value="z + %s" />
         <entry key="whois.denic.de" value="-T dn %s" />
        </map>
       </property> 
      </bean> -->
 <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
  <!-- <property name="useHTTP11" value="false" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="timeoutSeconds" value="1200" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
  <!-- <property name="shouldFetchBodyRule"> 
        <bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
       </property> -->
  <!-- <property name="soTimeoutMs" value="20000" /> -->
  <!-- <property name="sendIfModifiedSince" value="true" /> -->
  <!-- <property name="sendIfNoneMatch" value="true" /> -->
  <!-- <property name="sendConnectionClose" value="true" /> -->
  <!-- <property name="sendReferer" value="true" /> -->
  <!-- <property name="sendRange" value="false" /> -->
  <!-- <property name="ignoreCookies" value="false" /> -->
  <!-- <property name="sslTrustLevel" value="OPEN" /> -->
  <!-- <property name="acceptHeaders"> 
        <list>
         <value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
        </list>
       </property>
  -->
  <!-- <property name="httpBindAddress" value="" /> -->
  <!-- <property name="httpProxyHost" value="" /> -->
  <!-- <property name="httpProxyPort" value="0" /> -->
  <!-- <property name="httpProxyUser" value="" /> -->
  <!-- <property name="httpProxyPassword" value="" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
 </bean>
 <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
 </bean>
 <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
  <!-- <property name="extractJavascript" value="true" /> -->
  <!-- <property name="extractValueAttributes" value="true" /> -->
  <!-- <property name="ignoreFormActionUrls" value="false" /> -->
  <!-- <property name="extractOnlyFormGets" value="true" /> -->
  <!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
  <!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
  <!-- <property name="maxElementLength" value="1024" /> -->
  <!-- <property name="maxAttributeNameLength" value="1024" /> -->
  <!-- <property name="maxAttributeValueLength" value="16384" /> -->
 </bean>
 <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
 </bean> 
 <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
 </bean>
 <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
 </bean>    
 <!-- now, processors are assembled into ordered FetchChain bean -->
 <bean id="fetchProcessors" class="org.archive.modules.FetchChain">
  <property name="processors">
   <list>
    <!-- re-check scope, if so enabled... -->
    <ref bean="preselector"/>
    <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
    <ref bean="preconditions"/>
    <!-- ...fetch if DNS URI... -->
    <ref bean="fetchDns"/>
    <!-- <ref bean="fetchWhois"/> -->
    <!-- ...fetch if HTTP URI... -->
    <ref bean="fetchHttp"/>
    <!-- ...extract outlinks from HTTP headers... -->
    <ref bean="extractorHttp"/>
    <!-- ...extract outlinks from HTML content... -->
    <ref bean="extractorHtml"/>
    <!-- ...extract outlinks from CSS content... -->
    <ref bean="extractorCss"/>
    <!-- ...extract outlinks from Javascript content... -->
    <ref bean="extractorJs"/>
    <!-- ...extract outlinks from Flash content... -->
    <ref bean="extractorSwf"/>
   </list>
  </property>
 </bean>
  
 <!-- DISPOSITION CHAIN -->
 <!-- first, processors are declared as top-level named beans  -->
 <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
  <!-- <property name="compress" value="true" /> -->
  <!-- <property name="prefix" value="IAH" /> -->
  <!-- <property name="suffix" value="${HOSTNAME}" /> -->
  <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
  <!-- <property name="poolMaxActive" value="1" /> -->
  <!-- <property name="MaxWaitForIdleMs" value="500" /> -->
  <!-- <property name="skipIdenticalDigests" value="false" /> -->
  <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
  <!-- <property name="directory" value="${launchId}" /> -->
  <!-- <property name="storePaths">
        <list>
         <value>warcs</value>
        </list>
       </property> -->
  <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
  <!-- <property name="writeRequests" value="true" /> -->
  <!-- <property name="writeMetadata" value="true" /> -->
  <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
  <!-- <property name="writeRevisitForNotModified" value="true" /> -->
  <!-- <property name="startNewFilesOnCheckpoint" value="true" /> -->
 </bean>
 <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
  <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
  <!-- <property name="processErrorOutlinks" value="false" /> -->
 </bean>
 <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
  <!-- <property name="delayFactor" value="5.0" /> -->
  <!-- <property name="minDelayMs" value="3000" /> -->
  <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
  <!-- <property name="maxDelayMs" value="30000" /> -->
  <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
 </bean>
 <!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
       <property name="rescheduleDelaySeconds" value="-1" />
      </bean> -->
 <!-- now, processors are assembled into ordered DispositionChain bean -->
 <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
  <property name="processors">
   <list>
    <!-- write to aggregate archival files... -->
    <ref bean="warcWriter"/>
    <!-- ...send each outlink candidate URI to CandidateChain, 
         and enqueue those ACCEPTed to the frontier... -->
    <ref bean="candidates"/>
    <!-- ...then update stats, shared-structures, frontier decisions -->
    <ref bean="disposition"/>
    <!-- <ref bean="rescheduler" /> -->
   </list>
  </property>
 </bean>



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值