使用nutch进行搜索代码
package com.ideagrace.nutch;
import java.io.File;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.searcher.*;
import org.apache.nutch.html.Entities;
/**
 * Small stand-alone program that runs one hard-coded query against a Nutch
 * crawl directory and prints the resulting hits to standard output.
 */
public class NutchTest {

    /** Crawl directory handed to the {@code NutchBean} constructor. */
    private static final String CRAWL_DIR = "C://cygwin//nutch-0.7.2//bin//crawled2";

    /** Keyword that is parsed into the query. */
    private static final String KEYWORD = "侯白出谜";

    /** Hit-count argument passed to {@code NutchBean.search}. */
    private static final int REQUESTED_HITS = 20;

    /** Maximum number of hits that are printed. */
    private static final int DISPLAYED_HITS = 10;

    /**
     * Prints the configured {@code searcher.dir} value, searches the crawl
     * directory for {@link #KEYWORD}, and prints index number, document
     * number, title, url and summary for up to {@link #DISPLAYED_HITS} hits.
     *
     * <p>Any exception raised while searching is caught here and its stack
     * trace is printed; nothing is rethrown.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        System.out.println(NutchConf.get().get("searcher.dir"));
        try {
            NutchBean bean = new NutchBean(new File(CRAWL_DIR));

            // The raw keyword is parsed directly. The original code also
            // computed Entities.encode(keyword) but discarded the result.
            Query query = Query.parse(KEYWORD);
            Hits hits = bean.search(query, REQUESTED_HITS);
            System.out.println("result size is : " + hits.getLength());

            // Bound the window by getLength(), the number of hits actually
            // held by this Hits object. getTotal() is the total match count
            // and may be larger, which would push getHits() past the end of
            // the available hits.
            int length = Math.min(hits.getLength(), DISPLAYED_HITS);
            Hit[] show = hits.getHits(0, length);
            HitDetails[] details = bean.getDetails(show);
            String[] summaries = bean.getSummary(details, query);

            // details[i] and summaries[i] are read alongside show[i].
            for (int i = 0; i < show.length; i++) {
                Hit hit = show[i];
                HitDetails detail = details[i];
                String summary = summaries[i];
                System.out.println("---------------------------");
                System.out.println("hit id is :" + hit.getIndexNo());
                System.out.println("hit doc id is :" + hit.getIndexDocNo());
                System.out.println("title is :" + detail.getValue("title"));
                System.out.println("url is :" + detail.getValue("url"));
                System.out.println("summary is :" + summary);
            }
        } catch (Exception e) {
            // Demo program: report the failure on stderr and fall through to exit.
            e.printStackTrace();
        }
    }
}
以上是对爬行结果进行查询的代码，参考了nutch自带的搜索程序。把它作为application运行，可以了解一下nutch的查询过程。呵呵，只是一个很简单的程序，以此为入口，希望能深入地学习nutch。
运行这个查询时，会用到nutch的plugin。nutch的大部分参数都在nutch-default.xml中定义，以下这段配置定义了插件存放的文件夹，以及引入了哪些插件：
<!-- plugin properties:
     plugin.folders  - directories that are searched for plugins
     plugin.includes - regular expression selecting the plugin directories to load
     plugin.excludes - regular expression selecting plugin directories to skip
                       (left empty in this excerpt) -->
<property>
<name>plugin.folders</name>
<value>plugins</value>
<description>Directories where nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
</property>
<property>
<name>plugin.includes</name>
<value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
and basic indexing and search plugins.
</description>
</property>
<property>
<name>plugin.excludes</name>
<value></value>
<description>Regular expression naming plugin directory names to exclude.
</description>
</property>
<script type="text/javascript"> </script> <script src="http://pagead2.googlesyndication.com/pagead/show_ads.js" type="text/javascript"> </script> name="google_ads_frame" marginwidth="0" marginheight="0" src="http://pagead2.googlesyndication.com/pagead/ads?client=ca-pub-3593204875158947&dt=1179969527437&lmt=1179969527&prev_fmts=728x90_as&format=250x250_as&output=html&correlator=1179969527328&channel=3252141990&url=http%3A%2F%2Fwww.ideagrace.com%2Fclub%2Fread.php%3Ftid%3D320&color_bg=FFFFFF&color_text=000000&color_link=0000CC&color_url=000000&color_border=FFFFFF&ad_type=text_image&cc=179&flash=9&u_h=768&u_w=1024&u_ah=734&u_aw=1024&u_cd=32&u_tz=480&u_his=3&u_java=true" frameborder="0" width="250" scrolling="no" height="250" allowtransparency="allowtransparency"> |
package com.ideagrace.nutch;
import java.io.File;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.searcher.*;
import org.apache.nutch.html.Entities;
/**
 * Small stand-alone program that runs one hard-coded query against a Nutch
 * crawl directory and prints the resulting hits to standard output.
 */
public class NutchTest {

    /** Crawl directory handed to the {@code NutchBean} constructor. */
    private static final String CRAWL_DIR = "C://cygwin//nutch-0.7.2//bin//crawled2";

    /** Keyword that is parsed into the query. */
    private static final String KEYWORD = "侯白出谜";

    /** Hit-count argument passed to {@code NutchBean.search}. */
    private static final int REQUESTED_HITS = 20;

    /** Maximum number of hits that are printed. */
    private static final int DISPLAYED_HITS = 10;

    /**
     * Prints the configured {@code searcher.dir} value, searches the crawl
     * directory for {@link #KEYWORD}, and prints index number, document
     * number, title, url and summary for up to {@link #DISPLAYED_HITS} hits.
     *
     * <p>Any exception raised while searching is caught here and its stack
     * trace is printed; nothing is rethrown.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        System.out.println(NutchConf.get().get("searcher.dir"));
        try {
            NutchBean bean = new NutchBean(new File(CRAWL_DIR));

            // The raw keyword is parsed directly. The original code also
            // computed Entities.encode(keyword) but discarded the result.
            Query query = Query.parse(KEYWORD);
            Hits hits = bean.search(query, REQUESTED_HITS);
            System.out.println("result size is : " + hits.getLength());

            // Bound the window by getLength(), the number of hits actually
            // held by this Hits object. getTotal() is the total match count
            // and may be larger, which would push getHits() past the end of
            // the available hits.
            int length = Math.min(hits.getLength(), DISPLAYED_HITS);
            Hit[] show = hits.getHits(0, length);
            HitDetails[] details = bean.getDetails(show);
            String[] summaries = bean.getSummary(details, query);

            // details[i] and summaries[i] are read alongside show[i].
            for (int i = 0; i < show.length; i++) {
                Hit hit = show[i];
                HitDetails detail = details[i];
                String summary = summaries[i];
                System.out.println("---------------------------");
                System.out.println("hit id is :" + hit.getIndexNo());
                System.out.println("hit doc id is :" + hit.getIndexDocNo());
                System.out.println("title is :" + detail.getValue("title"));
                System.out.println("url is :" + detail.getValue("url"));
                System.out.println("summary is :" + summary);
            }
        } catch (Exception e) {
            // Demo program: report the failure on stderr and fall through to exit.
            e.printStackTrace();
        }
    }
}
以上是对爬行结果进行查询的代码，参考了nutch自带的搜索程序。把它作为application运行，可以了解一下nutch的查询过程。呵呵，只是一个很简单的程序，以此为入口，希望能深入地学习nutch。
运行这个查询时，会用到nutch的plugin。nutch的大部分参数都在nutch-default.xml中定义，以下这段配置定义了插件存放的文件夹，以及引入了哪些插件：
<!-- plugin properties:
     plugin.folders  - directories that are searched for plugins
     plugin.includes - regular expression selecting the plugin directories to load
     plugin.excludes - regular expression selecting plugin directories to skip
                       (left empty in this excerpt) -->
<property>
<name>plugin.folders</name>
<value>plugins</value>
<description>Directories where nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
</property>
<property>
<name>plugin.includes</name>
<value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
and basic indexing and search plugins.
</description>
</property>
<property>
<name>plugin.excludes</name>
<value></value>
<description>Regular expression naming plugin directory names to exclude.
</description>
</property>