nutch-site参数配置

最新推荐文章于 2024-11-02 16:28:26 发布

weixin_34314962

最新推荐文章于 2024-11-02 16:28:26 发布

阅读量61

点赞数

文章标签： python 数据库

原文链接：https://my.oschina.net/junfrank/blog/286548

版权

2019独角兽企业重金招聘Python工程师标准>>>

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!--
  Nutch site configuration (nutch-site.xml style): a flat list of
  name/value properties, each optionally carrying a human-readable
  description. Presumably these override the defaults shipped in
  nutch-default.xml; confirm against the deployed Nutch version.

  Notes for maintainers:
    * Text inside <value> is kept exactly as written; do not pretty-print
      or pad it, since surrounding whitespace may become part of the value.
    * <description> text is documentation only.
-->
<configuration>

  <!-- Crawler identity: the agent name and the names matched in robots.txt. -->
  <property>
    <name>http.agent.name</name>
    <value>My Nutch Spider</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>My Nutch Spider,*</value>
    <description>The agent strings we'll look for in robots.txt files,
      comma-separated, in decreasing order of precedence. You should
      put the value of http.agent.name as the first agent name, and keep the
      default * at the end of the list. E.g.: BlurflDev,Blurfl,*
    </description>
  </property>

  <!-- NOTE(review): a negative limit presumably means "do not truncate
       fetched content" (that is how nutch-default.xml documents it);
       confirm for the Nutch version in use. -->
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>

  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
      include. Any plugin not matching this expression is excluded.
      In any case you need at least include the nutch-extensionpoints plugin. By
      default Nutch includes crawling just HTML and plain text via HTTP,
      and basic indexing and search plugins. In order to use HTTPS please enable
      protocol-httpclient, but be aware of possible intermittent problems with the
      underlying commons-httpclient library.
    </description>
  </property>

  <!-- NOTE(review): plugin.includes above does not list
       urlfilter-blackwhite, so this setting presumably has no effect
       until that plugin is enabled. Confirm whether the plugin was meant
       to be active. -->
  <property>
    <name>urlfilter.blackwhite.file</name>
    <value>blackwhite-urlfilter.txt</value>
    <description>Name of file on CLASSPATH containing url suffixes
      used by urlfilter-blackwhite (BlackWhiteURLFilter) plugin.</description>
  </property>

  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
    <description>If true, when adding new links to a page, links from
      the same host are ignored. This is an effective way to limit the
      size of the link database, keeping only the highest quality
      links.
    </description>
  </property>
  <property>
    <name>indexer.add.domain</name>
    <value>true</value>
    <description>Whether to add the domain field to a NutchDocument.</description>
  </property>
  <property>
    <name>db.fetch.interval.default</name>
    <value>2592000</value>
    <description>The default number of seconds between re-fetches of a page (30 days).
    </description>
  </property>

  <property>
    <name>http.timeout</name>
    <value>5000</value>
    <description>The default network timeout, in milliseconds.</description>
  </property>

  <!-- Fetcher concurrency. The two properties below carry no description
       in this file; judging by the names they set the total number of
       fetcher threads and whether per-host limits are keyed by IP.
       TODO confirm against nutch-default.xml. -->
  <property>
    <name>fetcher.threads.fetch</name>
    <value>200</value>
  </property>

  <!-- NOTE(review): this looks like a legacy property name; newer Nutch
       1.x releases appear to use fetcher.queue.mode (byHost, byDomain,
       byIP) instead. Verify that the deployed version still reads it. -->
  <property>
    <name>fetcher.threads.per.host.by.ip</name>
    <value>false</value>
  </property>

  <property>
    <name>indexer.skip.notmodified</name>
    <value>true</value>
    <description>Whether the indexer will skip records with a db_notmodified status.
    </description>
  </property>

  <!-- Per-queue politeness and throughput tuning. Units are not stated in
       this file; the delay values are presumably seconds. TODO confirm
       against nutch-default.xml. -->
  <property>
    <name>fetcher.threads.per.queue</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.queue.depth.multiplier</name>
    <value>200</value>
  </property>
  <property>
    <name>fetcher.server.delay</name>
    <value>2.0</value>
  </property>
  <property>
    <name>fetcher.server.min.delay</name>
    <value>1.0</value>
  </property>
  <property>
    <name>fetcher.max.crawl.delay</name>
    <value>10</value>
  </property>

  <!-- Fallback character encoding for parsed content, judging by the
       property name. -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>gb2312</value>
  </property>

  <!-- Upper bound on outlinks kept per page, judging by the property
       name. -->
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>10000</value>
  </property>

</configuration>