<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
  <name>http.agent.name</name>
  <value>My Nutch Spider</value>
  <description>HTTP 'User-Agent' request header the crawler identifies
  itself with. Must not be empty. The same string should be the first
  entry of http.robots.agents.
  </description>
</property>
<property>
  <name>http.robots.agents</name>
  <value>My Nutch Spider,*</value>
  <description>Comma-separated agent strings to look for in robots.txt
  files, listed from highest to lowest precedence. The first entry should
  be the value of http.agent.name, and the default * should stay at the
  end of the list, e.g. BlurflDev,Blurfl,*
  </description>
</property>
<property>
  <name>http.content.limit</name>
  <value>-1</value>
  <description>Length limit, in bytes, for content downloaded using the
  http:// protocol. A non-negative value truncates longer content at that
  length; a negative value such as -1 disables truncation entirely.
  </description>
</property>
<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description>Regular expression matched against plugin directory names.
  Plugins whose name matches are included; every other plugin is excluded.
  The nutch-extensionpoints plugin has to be included in any case. The
  Nutch default covers crawling just HTML and plain text via HTTP, plus
  basic indexing and search plugins. To use HTTPS, enable
  protocol-httpclient, but be aware of possible intermittent problems with
  the underlying commons-httpclient library.
  </description>
</property>
<!-- NOTE(review): the plugin.includes expression in this file does not match
     urlfilter-blackwhite, so this setting presumably has no effect until
     that plugin is enabled. Confirm whether it is still needed. -->
<property>
  <name>urlfilter.blackwhite.file</name>
  <value>blackwhite-urlfilter.txt</value>
  <description>Name of the CLASSPATH file holding the url suffixes that the
  urlfilter-blackwhite (BlackWhiteURLFilter) plugin uses.</description>
</property>
<property>
  <name>db.ignore.internal.links</name>
  <value>false</value>
  <description>When true, links coming from the same host are ignored
  while new links are added to a page. This is an effective way to limit
  the size of the link database, since only the highest quality links
  are kept.
  </description>
</property>
<property>
  <name>indexer.add.domain</name>
  <value>true</value>
  <description>Controls whether the domain field is added to a
  NutchDocument.</description>
</property>
<property>
  <name>db.fetch.interval.default</name>
  <value>2592000</value>
  <description>Default interval between re-fetches of a page, in seconds.
  2592000 seconds equals 30 days.
  </description>
</property>
<property>
  <name>http.timeout</name>
  <value>5000</value>
  <description>Default timeout for network operations, expressed in
  milliseconds.</description>
</property>
<property>
  <name>fetcher.threads.fetch</name>
  <value>200</value>
  <description>Number of FetcherThreads the fetcher uses. Because each
  thread handles one connection, this is also the maximum number of
  requests made at once.
  </description>
</property>
<!-- NOTE(review): this is a property name from older Nutch releases; newer
     1.x releases appear to choose the queue key through fetcher.queue.mode
     instead. Confirm the deployed version still reads this setting. -->
<property>
  <name>fetcher.threads.per.host.by.ip</name>
  <value>false</value>
  <description>If true, the fetcher counts threads by the IP address that
  the URL's host name resolves to. If false, only the host name is used.
  </description>
</property>
<property>
  <name>indexer.skip.notmodified</name>
  <value>true</value>
  <description>Controls whether records whose status is db_notmodified
  are skipped by the indexer.
  </description>
</property>
<property>
  <name>fetcher.threads.per.queue</name>
  <value>10</value>
  <description>Maximum number of threads allowed to access one fetch queue
  at the same time. With a value greater than 1 the Crawl-Delay from
  robots.txt is ignored, and fetcher.server.min.delay (not
  fetcher.server.delay) is used as the delay between successive requests
  to the same server.
  </description>
</property>
<property>
  <name>fetcher.queue.depth.multiplier</name>
  <value>200</value>
  <description>(EXPERT) The fetcher buffers incoming URLs into queues keyed
  by host, domain or IP. The number of items it buffers is this value
  multiplied by the number of fetcher threads (fetcher.threads.fetch).
  </description>
</property>
<!-- NOTE(review): fetcher.threads.per.queue is set to 10 in this file, so
     this delay is presumably unused and fetcher.server.min.delay applies
     instead. Confirm against the deployed Nutch version. -->
<property>
  <name>fetcher.server.delay</name>
  <value>2.0</value>
  <description>Number of seconds the fetcher waits between successive
  requests to the same server. A Crawl-Delay from robots.txt may override
  it, and it is used only when fetcher.threads.per.queue is 1.
  </description>
</property>
<property>
  <name>fetcher.server.min.delay</name>
  <value>1.0</value>
  <description>Minimum number of seconds the fetcher waits between
  successive requests to the same server. Applies only when
  fetcher.threads.per.queue is greater than 1.
  </description>
</property>
<property>
  <name>fetcher.max.crawl.delay</name>
  <value>10</value>
  <description>If the Crawl-Delay in robots.txt is greater than this value
  (in seconds), the fetcher skips the page and reports an error. With -1
  the fetcher never skips such pages and waits for as long as the
  Crawl-Delay demands.
  </description>
</property>
<property>
  <name>parser.character.encoding.default</name>
  <value>gb2312</value>
  <description>Character encoding to fall back to when no other encoding
  information is available for a page. gb2312 is a Simplified Chinese
  encoding.
  </description>
</property>
<property>
  <name>db.max.outlinks.per.page</name>
  <value>10000</value>
  <description>Maximum number of outlinks processed for a page. With a
  non-negative value at most that many outlinks are processed; with a
  negative value all outlinks are processed.
  </description>
</property>
</configuration>
<!-- Reposted from: https://my.oschina.net/junfrank/blog/286548 -->