1. ivysettings.xml
Comment out:
<property name="repo.maven.org"
value="http://repo1.maven.org/maven2/"
override="false"/>
Add (the OSChina mirror):
<property name="repo.maven.org"
value="http://maven.oschina.net/content/groups/public/"
override="false"/>
An alternative mirror: http://mirrors.ibiblio.org/maven2/
2. ivy.xml
Add the following dependencies inside the <dependencies> element:
<dependency org="mysql" name="mysql-connector-java" rev="5.1.34"/>
<dependency org="org.springframework" name="spring-jdbc" rev="4.0.8.RELEASE"/>
<dependency org="commons-dbcp" name="commons-dbcp" rev="1.3"/>
<dependency org="com.googlecode.juniversalchardet" name="juniversalchardet" rev="1.0.3"/>
3. Create the database: db_news, charset utf8
Create the table:
CREATE TABLE `tb_content` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`html` longtext,
`url` varchar(200) DEFAULT NULL,
`status` int(11) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `index_id` (`id`),
KEY `index_status` (`status`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
(Note: the index_id key duplicates the primary key and is redundant.)
4. src/java/org/...../fetcher
Place DBHelper.java in the same directory (alongside Fetcher.java).
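The notes never show DBHelper.java itself. Below is a minimal sketch of what it plausibly looks like, wired from the dependencies added in step 2 (commons-dbcp for connection pooling, spring-jdbc for the insert). The connection URL, username, and password are placeholders, and CharsetSniffer is the hypothetical helper sketched above:

import org.apache.commons.dbcp.BasicDataSource;
import org.springframework.jdbc.core.JdbcTemplate;

public class DBHelper {
    private static final JdbcTemplate template;

    static {
        // Pooled MySQL connection to the db_news database created in step 3.
        BasicDataSource ds = new BasicDataSource();
        ds.setDriverClassName("com.mysql.jdbc.Driver");
        ds.setUrl("jdbc:mysql://localhost:3306/db_news?useUnicode=true&characterEncoding=utf8");
        ds.setUsername("root");     // placeholder
        ds.setPassword("password"); // placeholder
        template = new JdbcTemplate(ds);
    }

    /** Store one fetched page; returns the number of rows inserted. */
    public static int addArticle(String url, byte[] raw) {
        // Decode with the detected charset (see the CharsetSniffer sketch above).
        String html;
        try {
            html = new String(raw, CharsetSniffer.detect(raw));
        } catch (java.io.UnsupportedEncodingException e) {
            html = new String(raw); // fall back to the platform default
        }
        return template.update(
            "INSERT INTO tb_content (url, html, status) VALUES (?, ?, 0)",
            url, html);
    }
}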
In Fetcher.java, insert the following under case ProtocolStatus.SUCCESS:
try {
    String contentType = content.getContentType();
    // Only store textual content (HTML, plain text, ...).
    if (contentType != null && contentType.contains("text")) {
        int result = DBHelper.addArticle(content.getUrl(), content.getContent());
        LOG.info("Upload " + content.getUrl() + " result=" + result);
    }
} catch (Exception ex) {
    LOG.info("Upload Failed: " + ex.toString());
}
Comment out the robots.txt check below, so that pages disallowed by robots.txt are fetched anyway:
/* if (!rules.isAllowed(fit.u.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Denied by robots.txt: " + fit.url);
}
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
reporter.incrCounter("FetcherStatus", "robots_denied", 1);
continue;
}
*/
5. src/bin/crawl
Comment out and modify the corresponding lines as follows:
SEEDDIR="$1"
CRAWL_PATH="$2"
#SOLRURL="$3"
LIMIT="$3"
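# LIMIT (now the third argument) is the number of crawl rounds, taking the slot SOLRURL used to occupy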
#if [ "$SOLRURL" = "" ]; then
# echo "Missing SOLRURL : crawl <seedDir> <crawlDir> <solrURL> <numberOfRounds>"
# exit -1;
#fi
# set the number of slave nodes
numSlaves=1
# num threads for fetching
numThreads=100
# parsing the segment
# echo "Parsing : $SEGMENT"
# enable the skipping of records for the parsing so that a dodgy document
# does not fail the full task
# skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
# "$bin/nutch" parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
# if [ $? -ne 0 ]
# then exit $?
# fi
# echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
# "$bin/nutch" index -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
# if [ $? -ne 0 ]
# then exit $?
# fi
# echo "Cleanup on SOLR index -> $SOLRURL"
# "$bin/nutch" clean -D solr.server.url=$SOLRURL "$CRAWL_PATH"/crawldb
# if [ $? -ne 0 ]
# then exit $?
# fi
6. conf/nutch-site.xml
Add the following properties:
<property>
<name>fetcher.parse</name>
<value>true</value>
<description>If true, fetcher will parse content. Default is false, which means
that a separate parsing step is required after fetching is finished.</description>
</property>
<property>
<name>fetcher.store.content</name>
<value>false</value>
<description>If true, fetcher will store content.</description>
</property>
<property>
<name>http.content.limit</name>
<value>-1</value>
<description>The length limit for downloaded content using the http://
protocol, in bytes. If this value is nonnegative (>=0), content longer
than it will be truncated; otherwise, no truncation at all. Do not
confuse this setting with the file.content.limit setting.
</description>
</property>
<property>
<name>fetcher.threads.per.queue</name>
<value>15</value>
<description>This number is the maximum number of threads that
should be allowed to access a queue at one time.
</description>
</property>
<property>
<name>http.agent.name</name>
<value>nutch</value>
<description>HTTP 'User-Agent' request header. MUST NOT be empty -
please set this to a single word uniquely related to your organization.
NOTE: You should also check other related properties:
http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version
and set their values appropriately.
</description>
</property>
<property>
<name>parser.skip.truncated</name>
<value>false</value>
<description>If true (the default), parsing is skipped for documents truncated by content limits; set to false so such documents are still parsed.</description>
</property>
7. Multi-machine setup
Run in distributed mode instead of local mode (mode=distributed, local=false).
8. conf/regex-urlfilter.txt
Append flv|FLV to the suffix-exclusion rule so .flv video files are skipped, e.g. change the default rule ending in |js|JS)$ to end in |js|JS|flv|FLV)$.
9. Run the crawl: bin/crawl urls crawl1 5
(urls = seed directory, crawl1 = crawl directory, 5 = number of rounds.)