最近在研究Lucene,其实很简单,可是整合中文分词器mmseg4j时,总是会报一些异常,这主要是版本兼容问题,在此做一个记录。
环境:
lucene:4.3.1
mmseg4j:1.9.1
主要jar包,如下图:
因为我只要mmseg4j的分词器,所以不要solr包
直接上代码:
package com.chenlb.mmseg4j.example;
import java.io.File;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;
/**
 * Minimal demo of Lucene 4.3 full-text indexing and searching with the mmseg4j
 * {@link ComplexAnalyzer} as the Chinese tokenizer.
 *
 * <p>{@link #main(String[])} first appends one sample document to the on-disk
 * index at {@link #INDEXPATH}, then runs a query against it and prints every
 * hit together with an HTML-highlighted fragment.
 */
public class Test {
    /** File-system location of the Lucene index directory. */
    private static final String INDEXPATH = "D:\\index";

    /** Name of the single indexed and stored field. */
    private static final String FIELD_TITLE = "title";

    /** Maximum number of hits collected per search. */
    private static final int MAX_HITS = 5 * 10;

    /** Analyzer shared by indexing, query parsing and highlighting. */
    private static final Analyzer analyzer = new ComplexAnalyzer();

    /**
     * Builds the index, then searches it. Any failure is reported on stderr.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            indexCreate();
            search();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens (or creates) the index at {@link #INDEXPATH} and appends one sample
     * document whose text is stored in the {@code title} field.
     *
     * <p>The writer is opened with {@code CREATE_OR_APPEND}, so every call adds
     * another copy of the sample document to an existing index.
     *
     * @throws Exception if the index cannot be opened or written
     */
    public static void indexCreate() throws Exception {
        Directory directory = FSDirectory.open(new File(INDEXPATH));
        try {
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_43,
                    analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            IndexWriter writer = new IndexWriter(directory, iwConfig);
            try {
                String content = "京华时报2008年1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";
                Document doc = new Document();
                doc.add(new TextField(FIELD_TITLE, content, Field.Store.YES));
                writer.addDocument(doc);
            } finally {
                // Always close the writer so the index write lock is released,
                // even when adding the document fails.
                writer.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Searches the index at {@link #INDEXPATH} for a fixed keyword and prints
     * the parsed query, the total hit count, and for each hit the stored text
     * followed by its highlighted best fragment.
     *
     * @throws Exception if the index cannot be read, the query cannot be
     *         parsed, or highlighting fails
     */
    public static void search() throws Exception {
        Directory dir = FSDirectory.open(new File(INDEXPATH));
        try {
            IndexReader reader = DirectoryReader.open(dir);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);

                // Parse the keyword against the title field; all terms are required.
                QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_43,
                        new String[] { FIELD_TITLE }, analyzer);
                parser.setDefaultOperator(QueryParser.AND_OPERATOR);
                String word = "中西伯利亚 ";
                Query query = parser.parse(word);
                // toString() prints every term with its field prefix; the argument
                // of toString(String) is a default field name, not the search text.
                System.out.println("搜索关键词: " + query.toString());

                // Collect the top hits ordered by relevance score.
                TopScoreDocCollector collector = TopScoreDocCollector.create(
                        MAX_HITS, true);
                searcher.search(query, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                int numTotalHits = collector.getTotalHits();
                System.out.println("一共匹配" + numTotalHits + "个网页");

                // Matched terms are wrapped in red, bold HTML markup.
                SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
                        "<font color='red'><strong>", "</strong></font>");
                Highlighter highlighter = new Highlighter(simpleHTMLFormatter,
                        new QueryScorer(query));
                highlighter.setTextFragmenter(new SimpleFragmenter(100));

                for (int i = 0; i < hits.length; i++) {
                    Document doc = searcher.doc(hits[i].doc);
                    String title = doc.get(FIELD_TITLE);
                    // The first argument is the field NAME; the text to tokenize
                    // is supplied through the reader.
                    TokenStream titleTokenStream = analyzer.tokenStream(FIELD_TITLE,
                            new StringReader(title));
                    String highLightTitle = highlighter.getBestFragment(
                            titleTokenStream, title);
                    if (highLightTitle == null) {
                        // No fragment scored; show the plain text instead of "null".
                        highLightTitle = title;
                    }
                    System.out.println((i + 1) + "." + title);
                    System.out.println(highLightTitle);
                }
            } finally {
                reader.close();
            }
        } finally {
            dir.close();
        }
    }
}
运行结果:
这就完成了.....