需要引入以下 Maven 依赖:
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>3.6.0</version>
</dependency>
示例代码:
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class DocSearch {

    /** Shared searcher, created by {@link #search(String)} and closed by {@code main}. */
    private static IndexSearcher isearcher = null;

    /**
     * Searches the "context" field of the index under E:\output\lucence\index
     * for {@code key} and prints the parsed query plus one highlighted best
     * fragment per hit (terms wrapped in &lt;B&gt;&lt;/B&gt; by the default formatter).
     *
     * @param key the raw query string, parsed with a StandardAnalyzer
     * @throws IOException on index access failure
     * @throws ParseException if {@code key} cannot be parsed
     * @throws InvalidTokenOffsetsException if highlighting hits bad token offsets
     */
    public static void search(String key) throws IOException, ParseException, InvalidTokenOffsetsException {
        Directory directory = FSDirectory.open(new File("E:\\output\\lucence\\index"));
        // Open read-only; the reader is handed to the shared searcher.
        IndexReader ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);
        // Pin the analyzer/parser to the index's own version (3.6.0).
        // LUCENE_CURRENT is deprecated and silently changes meaning on upgrade.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        QueryParser parser = new QueryParser(Version.LUCENE_36, "context", analyzer);
        Query query = parser.parse(key);
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        Highlighter hl = new Highlighter(new QueryScorer(query));
        System.out.println(query.toString());
        // Iterate through the results, printing the best fragment of each hit.
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            // get() returns the first stored value or null — avoids the
            // ArrayIndexOutOfBounds risk of getValues("context")[0].
            String context = hitDoc.get("context");
            if (context == null) {
                continue; // document stored no "context" field
            }
            TokenStream ts = analyzer.tokenStream("context", new StringReader(context));
            // NOTE: getBestFragment returns null when the query matched on
            // another field and nothing in "context" can be highlighted.
            String fragment = hl.getBestFragment(ts, context);
            System.out.println(fragment);
        }
    }

    public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
        try {
            search("旧水泥袋");
        } finally {
            // Release the index resources even if search() threw.
            if (isearcher != null) {
                isearcher.close();
            }
        }
    }
}
索引建立和数据参考http://zhwj184.iteye.com/admin/blogs/1522709
输出结果:
context:旧 context:水 context:泥 context:袋
采购<B>旧</B>编织<B>袋</B>、<B>旧</B><B>水</B><B>泥</B><B>袋</B>
<B>水</B><B>泥</B>
采购<B>水</B><B>泥</B>电阻
求购<B>水</B><B>泥</B>输送链条和提升机
1万5 潜<B>水</B>料啤酒手提包 手提<B>袋</B>
大量采购包装用的编织<B>袋</B>(新的<B>旧</B>的,有无商标皆可)
铁<B>泥</B> 铁灰
废<B>旧</B>砂轮
软陶<B>泥</B>,超轻粘土
<B>水</B>泵
手<B>袋</B>
<B>水</B>锈石 上<B>水</B>石 吸<B>水</B>石
足浴<B>袋</B> 泡脚<B>袋</B> 异形<B>袋</B>
手提<B>袋</B>制<B>袋</B>机
回收库存废<B>旧</B>油墨油漆
回收库存<B>旧</B>油漆13463048572
求购废<B>旧</B>油漆油墨13463048572
求购库存<B>旧</B>化工树脂
highlighter类的分析
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
* {@link Encoder} and tokenizers.
*/
// Excerpt from org.apache.lucene.search.highlight.Highlighter (3.6.0);
// the remainder of the class body is omitted in this post.
public class Highlighter
{
// Upper bound on how much of a document's text is tokenized for scoring.
public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
// Pluggable markup strategy for matched terms (default: SimpleHTMLFormatter, <B>...</B>).
private Formatter formatter;
// Pluggable escaping of the surrounding text (default: DefaultEncoder, a pass-through).
private Encoder encoder;
// Splits the text into candidate fragments to score.
private Fragmenter textFragmenter=new SimpleFragmenter();
// Scores fragments against the query; set by every constructor.
private Scorer fragmentScorer=null;
// Convenience constructor: default <B> formatter and pass-through encoder.
public Highlighter(Scorer fragmentScorer)
{
this(new SimpleHTMLFormatter(),fragmentScorer);
}
// Custom formatter, default pass-through encoder.
public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
this(formatter,new DefaultEncoder(),fragmentScorer);
}
// Fully explicit constructor; the other two delegate here.
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
this.formatter = formatter;
this.encoder = encoder;
this.fragmentScorer = fragmentScorer;
}
这里有两个扩展点:formatter 和 encoder。formatter 其实就是对高亮部分的显示逻辑,比如默认是直接加 <B></B>;encoder 编码这里默认是不做任何处理,也可以对输入的文本进行编码处理,
可以查看 highlighter 的 encoder 的一个默认实现
package org.apache.lucene.search.highlight;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple {@link Encoder} implementation to escape text for HTML output
*
*/
public class SimpleHTMLEncoder implements Encoder
{
public SimpleHTMLEncoder()
{
}
public String encodeText(String originalText)
{
return htmlEncode(originalText);
}
/**
* Encode string into HTML
*/
public final static String htmlEncode(String plainText)
{
if (plainText == null || plainText.length() == 0)
{
return "";
}
StringBuilder result = new StringBuilder(plainText.length());
for (int index=0; index<plainText.length(); index++)
{
char ch = plainText.charAt(index);
switch (ch)
{
case '"':
result.append(""");
break;
case '&':
result.append("&");
break;
case '<':
result.append("<");
break;
case '>':
result.append(">");
break;
default:
if (ch < 128)
{
result.append(ch);
}
else
{
result.append("&#").append((int)ch).append(";");
}
}
}
return result.toString();
}
}
formatter的默认实现
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple {@link Formatter} implementation to highlight terms with a pre and
* post tag.
*/
public class SimpleHTMLFormatter implements Formatter {

  private static final String DEFAULT_PRE_TAG = "<B>";
  private static final String DEFAULT_POST_TAG = "</B>";

  /** Markup emitted immediately before each highlighted term. */
  private final String preTag;
  /** Markup emitted immediately after each highlighted term. */
  private final String postTag;

  /** Creates a formatter that wraps highlighted terms in the given tags. */
  public SimpleHTMLFormatter(String preTag, String postTag) {
    this.preTag = preTag;
    this.postTag = postTag;
  }

  /** Default constructor uses HTML: &lt;B&gt; tags to markup terms. */
  public SimpleHTMLFormatter() {
    this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
  }

  /**
   * Returns {@code originalText} wrapped in the pre/post tags when the token
   * group scored against the query; otherwise returns it untouched.
   *
   * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
   */
  public String highlightTerm(String originalText, TokenGroup tokenGroup) {
    if (tokenGroup.getTotalScore() <= 0) {
      return originalText; // not a match — leave the text unmarked
    }
    // Pre-size the buffer so the appends never reallocate the backing char[].
    int capacity = preTag.length() + originalText.length() + postTag.length();
    return new StringBuilder(capacity)
        .append(preTag)
        .append(originalText)
        .append(postTag)
        .toString();
  }
}