nutch-site.xml 参数配置

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <!-- Crawler identity and robots.txt agent matching -->
  <property>
    <name>http.agent.name</name>
    <value>My Nutch Spider</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>My Nutch Spider,*</value>
    <description>The agent strings we'll look for in robots.txt files,
      comma-separated, in decreasing order of precedence. You should
      put the value of http.agent.name as the first agent name, and keep the
      default * at the end of the list. E.g.: BlurflDev,Blurfl,*
    </description>
  </property>

  <!-- HTTP protocol settings -->
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>
  <property>
    <name>http.timeout</name>
    <value>5000</value>
    <description>The default network timeout, in milliseconds.</description>
  </property>

  <!-- Plugin selection and plugin-specific resources -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
      include.  Any plugin not matching this expression is excluded.
      In any case you need at least include the nutch-extensionpoints plugin. By
      default Nutch includes crawling just HTML and plain text via HTTP,
      and basic indexing and search plugins. In order to use HTTPS please enable
      protocol-httpclient, but be aware of possible intermittent problems with the
      underlying commons-httpclient library.
    </description>
  </property>
  <!-- NOTE(review): the plugin.includes expression in this file names
       urlfilter-regex but not urlfilter-blackwhite, so the setting below
       presumably has no effect unless that plugin is enabled as well; confirm. -->
  <property>
    <name>urlfilter.blackwhite.file</name>
    <value>blackwhite-urlfilter.txt</value>
    <description>Name of file on CLASSPATH containing url suffixes
      used by urlfilter-blackwhite (BlackWhiteURLFilter) plugin.</description>
  </property>

  <!-- Fetcher threading, queueing and politeness delays -->
  <property>
    <name>fetcher.threads.fetch</name>
    <value>200</value>
  </property>
  <property>
    <name>fetcher.threads.per.host.by.ip</name>
    <value>false</value>
  </property>
  <property>
    <name>fetcher.threads.per.queue</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.queue.depth.multiplier</name>
    <value>200</value>
  </property>
  <property>
    <name>fetcher.server.delay</name>
    <value>2.0</value>
  </property>
  <property>
    <name>fetcher.server.min.delay</name>
    <value>1.0</value>
  </property>
  <property>
    <name>fetcher.max.crawl.delay</name>
    <value>10</value>
  </property>

  <!-- Crawl database and link handling -->
  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
    <description>If true, when adding new links to a page, links from
      the same host are ignored.  This is an effective way to limit the
      size of the link database, keeping only the highest quality
      links.
    </description>
  </property>
  <property>
    <name>db.fetch.interval.default</name>
    <value>2592000</value>
    <description>The default number of seconds between re-fetches of a page (30 days).
    </description>
  </property>
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>10000</value>
  </property>

  <!-- Parsing -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>gb2312</value>
  </property>

  <!-- Indexing -->
  <property>
    <name>indexer.add.domain</name>
    <value>true</value>
    <description>Whether to add the domain field to a NutchDocument.</description>
  </property>
  <property>
    <name>indexer.skip.notmodified</name>
    <value>true</value>
    <description>Whether the indexer will skip records with a db_notmodified status.
    </description>
  </property>

</configuration>

转载于:https://my.oschina.net/junfrank/blog/286548

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值