nutch-0.8在eclipse中运行

nutch-0.8在eclipse中运行
运行环境:
  1. jdk 1.5
  2. eclipse 3.2
  3. nutch-0.8已解压至D:/ (注:此目录是随意的)

nutch 0.8缺少的两个jar包:
  1. 下载---关于parse-mp3的jid3lib-0.5.1.jar
  2. 下载---关于parse-rtf的rtf-parser.jar
分别拷贝至
  1. D:/nutch-0.8.1/src/plugin/parse-mp3/lib
  2. D:/nutch-0.8.1/src/plugin/parse-rtf/lib

配置文件的更改:
主要更改的配置文件为如下三个:
  1. nutch-default.xml
    更改plugin.folders的value为D:/nutch-0.8.1/src/plugin
    更改http.agent.name的value为godric(注:此处随意)
  2. 在D:/nutch-0.8.1下添加文件夹urls,在urls目录下新建文本文档nutch.txt(注:文件名随意),内容为你要爬取的网站url,例如:http://www.iscas.ac.cn
  3. crawl-urlfilter.txt
    修改为如下所示:
    # accept hosts in MY.DOMAIN.NAME
    +^http://www.iscas.ac.cn/

项目文件:
  1. .classpath 其内容为:
    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Eclipse build path for the Nutch project; all paths are relative to the project root. -->
    <classpath>
        <!-- Source folders: configuration, core sources/tests, and each plugin's sources/tests. -->
        <classpathentry kind="src" path="conf"/>
        <classpathentry kind="src" path="src/java"/>
        <classpathentry kind="src" path="src/test"/>
        <classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
        <classpathentry kind="src" path="src/plugin/summary-lucene/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rtf/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-url/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rss/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msword/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-pdf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-zip/src/java"/>
        <classpathentry kind="src" path="src/plugin/summary-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rtf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-swf/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-oo/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msword/src/test"/>
        <classpathentry kind="src" path="src/plugin/subcollection/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-oo/src/test"/>
        <classpathentry kind="src" path="src/plugin/protocol-http/src/test"/>
        <classpathentry kind="src" path="src/plugin/query-url/src/test"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
        <classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-site/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-rss/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-swf/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-zip/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-text/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-pdf/src/java"/>
        <classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-more/src/java"/>
        <classpathentry kind="src" path="src/plugin/scoring-opic/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/test"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/java"/>
        <classpathentry kind="src" path="src/plugin/query-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
        <classpathentry kind="src" path="src/plugin/subcollection/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-ext/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-mp3/src/test"/>
        <classpathentry kind="src" path="src/plugin/ontology/src/test"/>
        <classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
        <classpathentry kind="src" path="src/plugin/lib-parsems/src/java"/>
        <classpathentry kind="src" path="src/plugin/ontology/src/java"/>
        <classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-js/src/java"/>
        <classpathentry kind="src" path="src/plugin/index-more/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/test"/>
        <classpathentry kind="src" path="src/plugin/analysis-de/src/java"/>
        <classpathentry kind="src" path="src/plugin/microformats-reltag/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-http/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msexcel/src/test"/>
        <classpathentry kind="src" path="src/plugin/languageidentifier/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-regex-filter/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
        <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-ext/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-mspowerpoint/src/test"/>
        <classpathentry kind="src" path="src/plugin/parse-mp3/src/java"/>
        <classpathentry kind="src" path="src/plugin/parse-msexcel/src/java"/>
        <classpathentry kind="src" path="src/plugin/lib-http/src/test"/>
        <classpathentry kind="src" path="src/plugin/analysis-fr/src/java"/>
        <classpathentry kind="src" path="src/plugin/clustering-carrot2/src/java"/>
        <classpathentry kind="src" path="src/plugin/languageidentifier/src/test"/>

        <!-- Jars from the top-level lib directory. -->
        <classpathentry kind="lib" path="lib/commons-cli-2.0-SNAPSHOT.jar"/>
        <classpathentry kind="lib" path="lib/commons-lang-2.1.jar"/>
        <classpathentry kind="lib" path="lib/commons-logging-1.0.4.jar"/>
        <classpathentry kind="lib" path="lib/commons-logging-api-1.0.4.jar"/>
        <classpathentry kind="lib" path="lib/concurrent-1.3.4.jar"/>
        <classpathentry kind="lib" path="lib/hadoop-0.4.0-patched.jar"/>
        <classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
        <classpathentry kind="lib" path="lib/jetty-5.1.4.jar"/>
        <classpathentry kind="lib" path="lib/junit-3.8.1.jar"/>
        <classpathentry kind="lib" path="lib/log4j-1.2.13.jar"/>
        <classpathentry kind="lib" path="lib/lucene-core-1.9.1.jar"/>
        <classpathentry kind="lib" path="lib/lucene-misc-1.9.1.jar"/>
        <classpathentry kind="lib" path="lib/servlet-api.jar"/>
        <classpathentry kind="lib" path="lib/taglibs-i18n.jar"/>
        <classpathentry kind="lib" path="lib/xerces-2_6_2.jar"/>
        <classpathentry kind="lib" path="lib/xerces-2_6_2-apis.jar"/>

        <!-- Jars bundled inside individual plugin lib directories. -->
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-local-core.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-common.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar"/>
        <classpathentry kind="lib" path="src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-commons-httpclient/lib/commons-httpclient-3.0.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-log4j/lib/log4j-1.2.11.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-nekohtml/lib/nekohtml-0.9.4.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-core.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jaxen-jdom.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/jdom.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/saxpath.jar"/>
        <classpathentry kind="lib" path="src/plugin/lib-xml/lib/xercesImpl.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/commons-logging-1.0.3.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/icu4j_2_6_1.jar"/>
        <classpathentry kind="lib" path="src/plugin/ontology/lib/jena-2.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-html/lib/tagsoup-1.0rc3.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rss/lib/xmlrpc-1.2.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-swf/lib/javaswf.jar"/>
        <classpathentry kind="lib" path="src/plugin/protocol-ftp/lib/commons-net-1.2.0-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/protocol-httpclient/lib/commons-codec.jar"/>
        <classpathentry kind="lib" path="src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar"/>
        <classpathentry kind="lib" path="src/plugin/urlfilter-automaton/lib/automaton.jar"/>

        <!-- The two jars that are not shipped with Nutch 0.8 and must be downloaded and copied in manually. -->
        <classpathentry kind="lib" path="src/plugin/parse-mp3/lib/jid3lib-0.5.1.jar"/>
        <classpathentry kind="lib" path="src/plugin/parse-rtf/lib/rtf-parser.jar"/>

        <!-- JRE system library container and the compiler output folder. -->
        <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
        <classpathentry kind="output" path="tmp_build"/>
    </classpath>

  2. .project 其内容为:
    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Eclipse project description: declares a plain Java project named nutch-0.8. -->
    <projectDescription>
        <name>nutch-0.8</name>
        <comment></comment>
        <projects>
        </projects>
        <buildSpec>
            <!-- Java builder; takes no arguments. -->
            <buildCommand>
                <name>org.eclipse.jdt.core.javabuilder</name>
                <arguments>
                </arguments>
            </buildCommand>
        </buildSpec>
        <natures>
            <nature>org.eclipse.jdt.core.javanature</nature>
        </natures>
    </projectDescription>


导入eclipse:
  1. File->Import选择目录D:/nutch-0.8.1
  2. Run->Run...
    配置如下:


    Main class:  org.apache.nutch.crawl.Crawl
    Program arguments: urls -dir crawl.iscas -depth 2 -topN 50
    VM arguments: -Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log



不知您是否运行成功,如果还没有,请email我:godric.wu@gmail.com






  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值