Lucene V3.0.2分词、高亮

最新推荐文章于 2020-01-13 10:20:39 发布

telnetor

最新推荐文章于 2020-01-13 10:20:39 发布

阅读量3.3k

点赞数

分类专栏： Lucene/Nutch 文章标签： lucene string query exception search build

本文链接：https://blog.csdn.net/telnetor/article/details/6073737

版权

Lucene/Nutch 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

Lucene版本升级到3.0以后，原来的分词方式(token=tokenStream.next())已经被抛弃，转而使用TermAttribute。本例使用SmartChineseAnalyzer演示如何分词，此外还演示了如何对命中文档的域进行高亮显示。

package index; import java.io.File; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; public class AnalzyerTest { /** * lucene3.0开始已经抛弃了原来的分词方式，转而使用新的分词方式<br> * 本方法以SmartChineseAnalyzer为例，演示如何分词以及取得分词之后的term * * @throws Exception */ public static void analysis() throws Exception { Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30); String string = "中国人民银行采取了一系列措施防止人民币升值，但是很遗憾，这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?"; StringReader reader = new StringReader(string); TokenStream ts = analyzer.tokenStream("", reader); TermAttribute termAttribute = ts.getAttribute(TermAttribute.class); while (ts.incrementToken()) { System.out.print(termAttribute.term() + " "); } System.out.println(); } /** * 
建索引<br> * 在构造IndexWriter时必须使用Directory作为参数了 * * @throws CorruptIndexException * @throws LockObtainFailedException * @throws IOException */ private static void build() throws CorruptIndexException, LockObtainFailedException, IOException { String path = "index"; IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new SmartChineseAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED); Document document = new Document(); document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值，但是很遗憾，这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗？", Store.YES, Index.ANALYZED)); writer.addDocument(document); writer.optimize(); writer.close(); } /** * 搜索也没有返回Hits类型结果的方法了 * * @param keyword * @throws CorruptIndexException * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException { Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30); QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer); IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index"))); Query query = parser.parse(keyword); System.out.println(query); TopDocs topDocs = searcher.search(query, 10); ScoreDoc[] scoreDocs = topDocs.scoreDocs; System.out.println("hits:" + topDocs.totalHits); for (ScoreDoc scoreDoc : scoreDocs) { Document doc = searcher.doc(scoreDoc.doc); String text = doc.get("text"); System.out.println(highlight(text, query, analyzer)); } } /** * 高亮关键词 * * @param content * 需要高亮的内容 * @param query * 搜索时使用的Query对象 * @param analyzer * 分词器 * @return 高亮之后的文本 * @throws IOException * @throws InvalidTokenOffsetsException */ private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException { SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>"); Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); 
highlighter.setTextFragmenter(new SimpleFragmenter(25)); String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content); return resultString + "..."; } public static void main(String[] args) throws Exception { analysis(); build(); search("中国人民币升值"); } }

本例需要使用四个jar包：

lucene-core-3.0.2.jar

lucene-highlighter-3.0.2.jar

lucene-smartcn-3.0.2.jar

lucene-memory-3.0.2.jar

在高亮时有一个特别需要注意的问题：

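当输入的查询词为“人民币升值”时，会抛出NoClassDefFoundError: org/apache/lucene/index/memory/MemoryIndex错误；如果将“人民币”与“升值”用空格分开，则不会出现此错误。原因是：不带空格的查询词经分词后会被QueryParser解析为PhraseQuery，而QueryScorer在处理短语查询时需要借助MemoryIndex来计算词的位置信息；用空格分开后得到的是由多个TermQuery组成的BooleanQuery，不需要MemoryIndex。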

解决方式是将lucene-memory-3.0.2.jar添加到classpath中。