使用nutch进行搜索代码

使用nutch进行搜索代码
<script type="text/javascript"> </script> <script src="http://pagead2.googlesyndication.com/pagead/show_ads.js" type="text/javascript"> </script> name="google_ads_frame" marginwidth="0" marginheight="0" src="http://pagead2.googlesyndication.com/pagead/ads?client=ca-pub-3593204875158947&dt=1179969527437&lmt=1179969527&prev_fmts=728x90_as&format=250x250_as&output=html&correlator=1179969527328&channel=3252141990&url=http%3A%2F%2Fwww.ideagrace.com%2Fclub%2Fread.php%3Ftid%3D320&color_bg=FFFFFF&color_text=000000&color_link=0000CC&color_url=000000&color_border=FFFFFF&ad_type=text_image&cc=179&flash=9&u_h=768&u_w=1024&u_ah=734&u_aw=1024&u_cd=32&u_tz=480&u_his=3&u_java=true" frameborder="0" width="250" scrolling="no" height="250" allowtransparency="allowtransparency">

package com.ideagrace.nutch;

import java.io.File;

import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.searcher.*;
import org.apache.nutch.html.Entities;

public class NutchTest {
  public static void main(String args[]) {
   
    System.out.println(NutchConf.get().get("searcher.dir"));
    try {
        NutchBean bean = new NutchBean(new File("C://cygwin//nutch-0.7.2//bin//crawled2"));
       
//         NutchBean bean = new NutchBean();
        String keyword = "侯白出谜";
        String queryString = Entities.encode("侯白出谜");
        queryString = keyword;
        Query query = Query.parse(queryString);
        Hits hits;
        hits = bean.search(query, 20);
        System.out.println("result size is : " + hits.getLength());

        int length = (int)Math.min(hits.getTotal(), 10);
        Hit[] show = hits.getHits(0, length);
       
//         Hit[] show = hits.getHits(0, 20);
        HitDetails[] details = bean.getDetails(show);
        String[] summaries = bean.getSummary(details, query);
        if (hits != null) {
          for (int i = 0; i < show.length; i++) {
            Hit hit = show[i];
            HitDetails detail = details[i];
            String summary = summaries[i];
            System.out.println("---------------------------");
            System.out.println("hit id is :" + hit.getIndexNo());
            System.out.println("hit doc id is :" + hit.getIndexDocNo());
            System.out.println("title is :" + detail.getValue("title"));
            System.out.println("url is :" + detail.getValue("url"));
            System.out.println("summary is :" + summary);
          }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
  }
}


以上是对爬行结果进行查询的代码,参考了nutch自带的搜索程序。把它放在application中运行,可以了解一下nutch的查询过程。呵呵,这只是一个很简单的程序,希望以此为入口,深入地学习nutch。
运行这个查询时,使用到了nutch的plugin。nutch的大部分参数都在nutch-default.xml中定义,以下这段配置定义了插件存放的文件夹,以及引入了哪些插件:
<!-- plugin properties -->
<!-- plugin.folders: where plugin directories are looked up; here the single
     relative entry "plugins". -->
<property>
<name>plugin.folders</name>
<value>plugins</value>
<description>Directories where nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
</property>
<!-- plugin.includes: the alternation below matches these directory names:
     nutch-extensionpoints, protocol-http, urlfilter-regex, parse-text,
     parse-html, index-basic, query-basic, query-site, query-url. -->
<property>
<name>plugin.includes</name>
<value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
and basic indexing and search plugins.
</description>
</property>
<!-- plugin.excludes: the value is empty, so no exclusion pattern is set here. -->
<property>
<name>plugin.excludes</name>
<value></value>
<description>Regular expression naming plugin directory names to exclude.
</description>
</property>



 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值