Nutch1.2标题关键字高亮的正确方法

最新推荐文章于 2024-09-12 19:51:41 发布

j2mvc

最新推荐文章于 2024-09-12 19:51:41 发布

阅读量119

点赞数

文章标签： Nutch 搜索引擎关键字高亮标题 lucene

本文链接：https://blog.csdn.net/gyjavaer/article/details/84221223

版权

最近在弄Nutch1.2，要实现关键字高亮，却发现国内网站上给出的标题关键字高亮方法都是错的。最终在nutch.apache.org网站上找到了相近的代码，修改之后终于成功实现。
关键字的高亮需要自己再创建一个分词器,关键的类是TokenStream,lucene3.0以上需要用到TermAttribute。

一、内容关键字高亮很简单，修改include/style.html即可：

.highlight {
color:#FF0000;
}

二、标题关键字高亮的方法：

我们从内容关键字高亮的方法可以得到启发：

首先来看这一句：

String summary = summaries[i].toHtml(true);

这里调用的是org.apache.nutch.searcher.Summary类的toHtml方法：
public String toHtml(boolean encode) {...}

这是标题的获取方法

String title = detail.getValue("title");

我们可不可以也像summary一样调用呢？答案是肯定的。但是，Nutch本身并未提供标题关键字高亮的方法，这里需要我们自己编写类和方法。

新建Titler.java

package org.apache.nutch.searcher;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Summary.Fragment;

/**
 * Builds a {@link Summary} for a hit title in which every token that matches a
 * query term is emitted as a {@link Highlight} fragment and every other token
 * as a plain {@link Fragment}.
 *
 * <p>The title is tokenized with a {@link NutchDocumentAnalyzer} for the
 * "title" field, and at most {@code searcher.title.maxlength} tokens
 * (default 40) are copied into the summary.
 *
 * <p>NOTE(review): the fragments are the analyzer's terms, not slices of the
 * original title, so whatever the analyzer normalizes or drops (case,
 * whitespace, punctuation) is presumably lost in the rendered title. Confirm
 * this is acceptable for the languages being indexed.
 */
public class Titler implements Configurable {

    /** Maximum number of title tokens copied into the summary. */
    private int maxLength = 40;
    private Analyzer analyzer = null;
    private Configuration conf = null;

    /**
     * Creates a titler configured from {@code conf}.
     *
     * @param conf configuration used to build the analyzer and to read
     *             {@code searcher.title.maxlength}
     */
    public Titler(Configuration conf) {
        setConf(conf);
    }

    /** Returns the configuration last passed to {@link #setConf}. */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration, rebuilds the analyzer from it and re-reads the
     * token limit ({@code searcher.title.maxlength}, default 40).
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.analyzer = new NutchDocumentAnalyzer(conf);
        this.maxLength = conf.getInt("searcher.title.maxlength", 40);
    }

    /**
     * Tokenizes {@code text} and marks the tokens that equal a query term.
     *
     * @param text  the title text; may be {@code null} when the hit has no
     *              title, in which case an empty summary is returned
     * @param query the query whose terms should be highlighted
     * @return a summary holding one fragment per token, limited to the first
     *         {@code maxLength} tokens; empty when there is nothing to show
     */
    public Summary getSummary(String text, Query query) {
        Summary summary = new Summary();

        // A hit without a title field would otherwise fail inside StringReader.
        if (text == null) {
            return summary;
        }

        String[] tokens = getTokens(text);
        if (tokens.length == 0) {
            return summary;
        }

        // Collect the query terms for constant-time membership checks.
        HashSet<String> highlight = new HashSet<String>();
        String[] terms = query.getTerms();
        for (int i = 0; i < terms.length; i++) {
            highlight.add(terms[i]);
        }

        for (int i = 0; i < tokens.length && i < maxLength; i++) {
            String term = tokens[i];
            if (highlight.contains(term)) {
                summary.add(new Highlight(term));
            } else {
                summary.add(new Fragment(term));
            }
        }
        return summary;
    }

    /** A highlighted fragment of text within a summary. */
    public static class Highlight extends Fragment {
        /** Constructs a highlighted fragment for the given text. */
        public Highlight(String text) {
            super(text);
        }

        /** Returns true. */
        public boolean isHighlight() {
            return true;
        }
    }

    /**
     * Runs the analyzer over {@code text} and returns the term text of every
     * token it produces, in order.
     *
     * <p>Tokenization is best-effort: if the stream fails part-way, the stack
     * trace is printed and the tokens read so far are returned.
     */
    private String[] getTokens(String text) {
        ArrayList<String> result = new ArrayList<String>();
        TokenStream ts = analyzer.tokenStream("title", new StringReader(text));
        try {
            // addAttribute returns the existing attribute or registers one, so
            // it cannot fail the way getAttribute does when the attribute is absent.
            TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                result.add(termAtt.term());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                ts.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing an in-memory reader fails.
            }
        }
        return result.toArray(new String[result.size()]);
    }
}

然后在NutchBean.java中添加下面的字段和getTitle方法，并在构造函数中初始化titler：

private Titler titler;

/**
 * Builds the title summary for a hit by passing the hit's "title" field and
 * the query to the titler.
 *
 * @param hit   details of the hit whose title is wanted
 * @param query the query handed to the titler
 * @return the summary produced by the titler
 */
public Summary getTitle(HitDetails hit, Query query) throws IOException {
    final String rawTitle = hit.getValue("title");
    return titler.getSummary(rawTitle, query);
}

public NutchBean(Configuration conf, Path dir) throws IOException {
...
this.titler = new Titler(conf);
}

我这里测试的JSP页面是新建s.jsp.如果要在原来的search.jsp页面内调用，需要修改相应的代码。

<%@ page
session="false"
contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"

import="java.io.*"
import="java.util.*"
import="java.net.*"
import="javax.servlet.http.*"
import="javax.servlet.*"

import="org.apache.nutch.html.Entities"
import="org.apache.nutch.metadata.Nutch"
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.plugin.*"
import="org.apache.nutch.clustering.*"
import="org.apache.hadoop.conf.*"
import="org.apache.nutch.util.NutchConfiguration"
%>
<jsp:include page="/show/include/style.html"/>
<%
// Demo page: runs one hard-coded query against a local crawl directory and
// prints up to 10 hits with highlighted title, highlighted summary and URL.

final String keyword = "贵阳pep艺术中心― 中心简介";
final String crawl = "/home/961a/workspace/test/Nutch1.2Web/ROOT/crawl";
final String summarylength = "120";

final Configuration conf = NutchConfiguration.create();
conf.set("searcher.dir", crawl);
conf.set("searcher.summary.length", summarylength);

// NOTE(review): a new NutchBean is created on every request and never released;
// consider sharing one instance across requests — confirm against the NutchBean API.
final NutchBean bean = new NutchBean(conf);
try {
final Query query = Query.parse(keyword, conf);
query.getParams().setMaxHitsPerDup(0);
final Hits hits = bean.search(query);

// keyword is a constant here; HTML-encode it if it ever comes from the request.
out.println("Total hits: " + hits.getTotal() + " keyword:" + keyword + "; ");

final int length = (int) Math.min(hits.getLength(), 10);
final Hit[] show = hits.getHits(0, length);
final HitDetails[] details = bean.getDetails(show);
final Summary[] summaries = bean.getSummary(details, query);

// Only `length` hits were fetched, so iterate over those rather than
// hits.getLength(), which may exceed the size of details/summaries.
for (int i = 0; i < length; i++) {
String url = Entities.encode(details[i].getValue("url"));
String title = bean.getTitle(details[i], query).toHtml(true);
String summary = summaries[i].toHtml(true);
%>

<a href="<%=url%>"><%=title%></a>
 
<%=summary%>
 
<%=url%>

<%
}
} catch (Throwable t) {
// Keep the page best-effort, but record the failure instead of hiding it.
application.log("s.jsp: search failed for keyword '" + keyword + "'", t);
out.println("Search failed; see the server log for details.");
}%>