nutch-0.8在eclipse中运行
运行环境:
- jdk 1.5
- eclipse 3.2
- nutch-0.8已解压至D:/,下文以D:/nutch-0.8.1为例 (注:此目录是随意的)
nutch 0.8缺少的两个jar包:
- 下载parse-mp3插件所需的jid3lib-0.5.1.jar
- 下载parse-rtf插件所需的rtf-parser.jar
- 将jid3lib-0.5.1.jar放入D:/nutch-0.8.1/src/plugin/parse-mp3/lib
- 将rtf-parser.jar放入D:/nutch-0.8.1/src/plugin/parse-rtf/lib
配置文件的更改:
主要更改的配置文件为如下三个:
- nutch-default.xml
更改plugin.folders的value为D:/nutch-0.8.1/src/plugin
更改http.agent.name的value为godric(注:此处随意)
- 在D:/nutch-0.8.1下添加文件夹urls,在urls目录下新建文本文档nutch.txt(注:文件名随意),内容为你要爬取的网站url,例如:http://www.iscas.ac.cn
- crawl-urlfilter.txt
修改为如下所示:
# accept hosts in MY.DOMAIN.NAME
+^http://www.iscas.ac.cn/
项目文件:
- .classpath 其内容为:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Eclipse .classpath for the Nutch project.
  Paths are relative to the project root (the directory imported into Eclipse).
  Note: no whitespace is allowed between "<" (or "</") and a tag name, and the
  XML declaration must be written exactly as "<?xml"; otherwise the file is not
  well-formed and cannot be parsed.
-->
<classpath>
  <!-- Source folders: configuration, core sources/tests, and plugin sources/tests -->
  <classpathentry kind="src" path="conf"/>
  <classpathentry kind="src" path="src/java"/>
  <classpathentry kind="src" path="src/test"/>
  <classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
  <classpathentry kind="src" path="src/plugin/summary-lucene/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-rtf/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-url/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-rss/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-msword/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/test"/>
  <classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-pdf/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-zip/src/java"/>
  <classpathentry kind="src" path="src/plugin/summary-basic/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-rtf/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-swf/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-oo/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/test"/>
  <classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-msword/src/test"/>
  <classpathentry kind="src" path="src/plugin/subcollection/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-oo/src/test"/>
  <classpathentry kind="src" path="src/plugin/protocol-http/src/test"/>
  <classpathentry kind="src" path="src/plugin/query-url/src/test"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-site/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-rss/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-swf/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-zip/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-text/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-pdf/src/java"/>
  <classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-more/src/java"/>
  <classpathentry kind="src" path="src/plugin/scoring-opic/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/test"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/java"/>
  <classpathentry kind="src" path="src/plugin/query-basic/src/java"/>
  <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
  <classpathentry kind="src" path="src/plugin/subcollection/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-ext/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-mp3/src/test"/>
  <classpathentry kind="src" path="src/plugin/ontology/src/test"/>
  <classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
  <classpathentry kind="src" path="src/plugin/lib-parsems/src/java"/>
  <classpathentry kind="src" path="src/plugin/ontology/src/java"/>
  <classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-js/src/java"/>
  <classpathentry kind="src" path="src/plugin/index-more/src/java"/>
  <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/test"/>
  <classpathentry kind="src" path="src/plugin/analysis-de/src/java"/>
  <classpathentry kind="src" path="src/plugin/microformats-reltag/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/java"/>
  <classpathentry kind="src" path="src/plugin/lib-http/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-msexcel/src/test"/>
  <classpathentry kind="src" path="src/plugin/languageidentifier/src/java"/>
  <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
  <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-ext/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/test"/>
  <classpathentry kind="src" path="src/plugin/parse-mp3/src/java"/>
  <classpathentry kind="src" path="src/plugin/parse-msexcel/src/java"/>
  <classpathentry kind="src" path="src/plugin/lib-http/src/test"/>
  <classpathentry kind="src" path="src/plugin/analysis-fr/src/java"/>
  <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/java"/>
  <classpathentry kind="src" path="src/plugin/languageidentifier/src/test"/>

  <!-- Core libraries under lib/ -->
  <classpathentry kind="lib" path="lib/commons-cli-2.0-SNAPSHOT.jar"/>
  <classpathentry kind="lib" path="lib/commons-lang-2.1.jar"/>
  <classpathentry kind="lib" path="lib/commons-logging-1.0.4.jar"/>
  <classpathentry kind="lib" path="lib/commons-logging-api-1.0.4.jar"/>
  <classpathentry kind="lib" path="lib/concurrent-1.3.4.jar"/>
  <classpathentry kind="lib" path="lib/hadoop-0.4.0-patched.jar"/>
  <classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
  <classpathentry kind="lib" path="lib/jetty-5.1.4.jar"/>
  <classpathentry kind="lib" path="lib/junit-3.8.1.jar"/>
  <classpathentry kind="lib" path="lib/log4j-1.2.13.jar"/>
  <classpathentry kind="lib" path="lib/lucene-core-1.9.1.jar"/>
  <classpathentry kind="lib" path="lib/lucene-misc-1.9.1.jar"/>
  <classpathentry kind="lib" path="lib/servlet-api.jar"/>
  <classpathentry kind="lib" path="lib/taglibs-i18n.jar"/>
  <classpathentry kind="lib" path="lib/xerces-2_6_2.jar"/>
  <classpathentry kind="lib" path="lib/xerces-2_6_2-apis.jar"/>

  <!-- Libraries bundled with individual plugins -->
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/>
  <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-core.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-jdom.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jdom.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-xml/lib/saxpath.jar"/>
  <classpathentry kind="lib" path="src/plugin/lib-xml/lib/xercesImpl.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/commons-logging-1.0.3.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/icu4j_2_6_1.jar"/>
  <classpathentry kind="lib" path="src/plugin/ontology/lib/jena-2.1.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-swf/lib/javaswf.jar"/>
  <classpathentry kind="lib" path="src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/>
  <classpathentry kind="lib" path="src/plugin/protocol-httpclient/lib/commons-codec.jar"/>
  <classpathentry kind="lib" path="src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar"/>
  <classpathentry kind="lib" path="src/plugin/urlfilter-automaton/lib/automaton.jar"/>

  <!-- The two jars missing from the Nutch 0.8 package; download them and place them at these paths -->
  <classpathentry kind="lib" path="src/plugin/parse-mp3/lib/jid3lib-0.5.1.jar"/>
  <classpathentry kind="lib" path="src/plugin/parse-rtf/lib/rtf-parser.jar"/>

  <!-- JRE container and build output folder -->
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
  <classpathentry kind="output" path="tmp_build"/>
</classpath>
- .project 其内容为:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Eclipse .project descriptor for the Nutch project.
  Element text is written without surrounding spaces so that the project name
  and the builder/nature identifiers are taken literally.
-->
<projectDescription>
  <name>nutch-0.8</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <!-- Java builder: compiles the source folders listed in .classpath -->
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <!-- Marks this project as a Java project -->
    <nature>org.eclipse.jdt.core.javanature</nature>
  </natures>
</projectDescription>
导入eclipse:
- File->Import选择目录D:/nutch-0.8.1
- Run->Run...
配置如下:
Main class: org.apache.nutch.crawl.Crawl
Program arguments: urls -dir crawl.iscas -depth 2 -topN 50
VM arguments: -Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log
不知您是否运行成功,如果还没有,请email我:godric.wu@gmail.com