heritrix 3.1.1限制爬取范围

参考这篇文章

http://www.verydemo.com/demo_c427_i9456.html

虽然说的不知道是哪个版本的,但看很适合3.1.1版本

主要配置如下:

 <bean id="acceptSurts" class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
  <!-- <property name="decision" value="ACCEPT"/> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
  <!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> -->
  <!-- <property name="surtsSource">
        <bean class="org.archive.spring.ConfigString">
         <property name="value">
          <value>
           # example.com
           # http://www.example.edu/path1/
           # +http://(org,example,
          </value>
         </property> 
        </bean>
       </property> -->
 </bean>

需要将surtsSource的注释打开,写入需要抓取网站,网站不要加www,后面也不要加/

另外光这个还不行,还需要去除一个属性:

在下面这个配置里边

 <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
  <!-- <property name="logToFile" value="false" /> -->
  <property name="rules">
   <list>
    <!-- Begin by REJECTing all... -->
    <bean class="org.archive.modules.deciderules.RejectDecideRule" />
    <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
    <ref bean="acceptSurts" />
    <!-- ...but REJECT those more than a configured link-hop-count from start... -->
    <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
     <!-- <property name="maxHops" value="20" /> -->
    </bean>
    <!-- ...but ACCEPT those more than a configured link-hop-count from start... -->
    <bean class="org.archive.modules.deciderules.TransclusionDecideRule">
     <!-- <property name="maxTransHops" value="2" /> -->
     <!-- <property name="maxSpeculativeHops" value="1" /> -->
    </bean>
    <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
          <property name="decision" value="REJECT"/>
          <property name="seedsAsSurtPrefixes" value="false"/>
          <property name="surtsDumpFile" value="${launchId}/negative-surts.dump" /> 
     <!-- <property name="surtsSource">
           <bean class="org.archive.spring.ConfigFile">
            <property name="path" value="negative-surts.txt" />
           </bean>
          </property> -->
    </bean>
    <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
    <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
          <property name="decision" value="REJECT"/>
     <!-- <property name="listLogicalOr" value="true" /> -->
     <!-- <property name="regexList">
           <list>
           </list>
          </property> -->
    </bean>
    <!-- ...and REJECT those with suspicious repeating path-segments... -->
    <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
     <!-- <property name="maxRepetitions" value="2" /> -->
    </bean>
    <!-- ...and REJECT those with more than threshold number of path-segments... -->
    <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
     <!-- <property name="maxPathDepth" value="20" /> -->
    </bean>
    <!-- ...but always ACCEPT those marked as prerequisitee for another URI... -->
    <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
    </bean>
    <!-- ...but always REJECT those with unsupported URI schemes -->
    <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
    </bean>
   </list>
  </property>
 </bean>

需要去掉:

    <bean class="org.archive.modules.deciderules.TransclusionDecideRule">
     <!-- <property name="maxTransHops" value="2" /> -->
     <!-- <property name="maxSpeculativeHops" value="1" /> -->
    </bean>



  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值