使用lucene3.6
下载地址 http://www.apache.org/dyn/closer.cgi/lucene/java/3.6.1
分词器使用 mmseg4j
下载地址 http://code.google.com/p/mmseg4j/downloads/list
项目中使用到的jar包有 :
lucene-3.6.1/lucene-core-3.6.1.jar(核心包)
lucene-3.6.1/contrib/lucene-analyzers-3.6.1.jar(分词包)
lucene-3.6.1/contrib/lucene-highlighter-3.6.1.jar(高亮包)
mmseg4j-1.8.3/mmseg4j-all-1.8.3.jar(第三方分词器,因为lucene自带的分词器没有词库,汉字都是一个一个拆开的)
使用时记得设置词库的路径
mmseg4j-1.8.3/data
package lucene;
import java.io.File;
import java.io.StringReader;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;
/**
 * Small Lucene 3.6 demo: builds an in-memory index with the mmseg4j Chinese
 * analyzer and shows full/incremental indexing plus several query styles
 * (term, highlighted, prefix, wildcard, analyzed, multi-field).
 *
 * <p>Not thread-safe for indexing: all methods share one static IndexWriter.
 */
public class LuceneSearch {
	// Path to the mmseg4j dictionary directory (mmseg4j-1.8.3/data).
	private static final String DISC_URL = "D:/My Documents/Downloads/mmseg4j-1.8.3/data";
	// Analyzer choices: StandardAnalyzer, MaxWordAnalyzer, SimpleAnalyzer, ComplexAnalyzer.
	private static Analyzer analyzer = new MaxWordAnalyzer(new File(DISC_URL));
	// Lucene version used for IndexWriterConfig and QueryParser.
	private static Version version = Version.LUCENE_36;
	// Index storage: RAMDirectory (in memory) or FSDirectory (on disk).
	private static Directory directory = new RAMDirectory();
	// Writer configuration built from the version and analyzer above.
	private static IndexWriterConfig conf = new IndexWriterConfig(version, analyzer);
	// Single shared writer for all index mutations.
	private static IndexWriter writer;
	static {
		try {
			writer = new IndexWriter(directory, conf);
		} catch (Exception e) {
			// Fail fast with the cause attached. The original code swallowed this
			// exception, leaving writer == null and causing an opaque NPE later.
			throw new ExceptionInInitializerError(e);
		}
	}
	/**
	 * Full reindex: removes every existing document, then adds the given ones.
	 *
	 * @param documentes documents to index
	 * @throws Exception on any Lucene indexing failure
	 * @Author TangJiaZhi
	 */
	public void fullIndex(Document[] documentes) throws Exception {
		writer.deleteAll();
		for (Document document : documentes) {
			writer.addDocument(document);
		}
		writer.commit();
	}
	/**
	 * Deletes the indexed document whose "id" field matches the given document's id.
	 *
	 * @param document document carrying the id to delete
	 * @throws Exception on any Lucene indexing failure
	 * @Author TangJiaZhi
	 */
	public void deleteIndex(Document document) throws Exception {
		Term term = new Term("id", document.get("id"));
		writer.deleteDocuments(term);
		writer.commit();
	}
	/**
	 * Incremental update by id: replaces each indexed document whose "id"
	 * matches, or adds it if absent.
	 *
	 * @param documentes documents to update
	 * @throws Exception on any Lucene indexing failure
	 * @Author TangJiaZhi
	 */
	public void updateIndex(Document[] documentes) throws Exception {
		for (Document document : documentes) {
			Term term = new Term("id", document.get("id"));
			writer.updateDocument(term, document);
		}
		writer.commit();
	}
	/**
	 * Exact term query with pagination; prints the matching documents' fields.
	 *
	 * @param filedStr field name to query
	 * @param queryStr exact term value (not analyzed)
	 * @param page     1-based page number
	 * @param pageSize results per page
	 * @throws Exception on any Lucene search failure
	 * @Author TangJiaZhi
	 */
	public void simpleSearch(String filedStr, String queryStr, int page, int pageSize) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			// Math.max(1, ...) guards the empty-index case: create(0, ...) throws.
			TopScoreDocCollector topCollector =
					TopScoreDocCollector.create(Math.max(1, searcher.maxDoc()), false);
			Query query = new TermQuery(new Term(filedStr, queryStr));
			searcher.search(query, topCollector);
			ScoreDoc[] docs = topCollector.topDocs((page - 1) * pageSize, pageSize).scoreDocs;
			printScoreDoc(docs, searcher);
		} finally {
			// Previously leaked: neither searcher nor reader was ever closed.
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Exact term query whose hits are printed with the matched terms wrapped
	 * in &lt;span&gt; tags by the Lucene highlighter.
	 *
	 * @param filedStr field name to query
	 * @param queryStr exact term value (not analyzed)
	 * @param page     1-based page number
	 * @param pageSize results per page
	 * @throws Exception on any Lucene search/highlight failure
	 * @Author TangJiaZhi
	 */
	public void highLightSearch(String filedStr, String queryStr, int page, int pageSize) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			TopScoreDocCollector topCollector =
					TopScoreDocCollector.create(Math.max(1, searcher.maxDoc()), false);
			Query query = new TermQuery(new Term(filedStr, queryStr));
			searcher.search(query, topCollector);
			ScoreDoc[] docs = topCollector.topDocs((page - 1) * pageSize, pageSize).scoreDocs;
			Formatter formatter = new SimpleHTMLFormatter("<span>", "</span>");
			Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
			for (int i = 0; i < docs.length; i++) {
				List<Fieldable> list = searcher.doc(docs[i].doc).getFields();
				for (Fieldable fieldable : list) {
					String fieldName = fieldable.name();
					String fieldValue = fieldable.stringValue();
					// Re-tokenize the stored value so the highlighter can locate hits.
					TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(fieldValue));
					// getBestFragment returns null when the field has no match.
					String fragment = highlighter.getBestFragment(ts, fieldValue);
					System.out.println(fieldName + " : " + fragment);
				}
			}
		} finally {
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Prefix query: matches terms starting with {@code queryStr}; prints up to 3 hits.
	 *
	 * @param filedStr field name to query
	 * @param queryStr term prefix
	 * @throws Exception on any Lucene search failure
	 * @Author TangJiaZhi
	 */
	public void prefixSearch(String filedStr, String queryStr) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			Query query = new PrefixQuery(new Term(filedStr, queryStr));
			ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
			printScoreDoc(docs, searcher);
		} finally {
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Wildcard query ({@code ?} = one char, {@code *} = any chars); prints up to 3 hits.
	 *
	 * @param filedStr field name to query
	 * @param queryStr wildcard pattern
	 * @throws Exception on any Lucene search failure
	 * @Author TangJiaZhi
	 */
	public void wildcardSearch(String filedStr, String queryStr) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			Query query = new WildcardQuery(new Term(filedStr, queryStr));
			ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
			printScoreDoc(docs, searcher);
		} finally {
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Analyzed query on one field: the query string is tokenized by the shared
	 * analyzer before searching; prints up to 3 hits.
	 *
	 * @param filedStr field name to query
	 * @param queryStr query text, analyzed before search
	 * @throws Exception on any Lucene parse/search failure
	 * @Author TangJiaZhi
	 */
	public void analyzerSearch(String filedStr, String queryStr) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			QueryParser queryParser = new QueryParser(version, filedStr, analyzer);
			Query query = queryParser.parse(queryStr);
			ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
			printScoreDoc(docs, searcher);
		} finally {
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Analyzed query across several fields at once; prints up to 3 hits.
	 *
	 * @param filedStr field names to query
	 * @param queryStr query text, analyzed before search
	 * @throws Exception on any Lucene parse/search failure
	 * @Author TangJiaZhi
	 */
	public void multiAnalyzerSearch(String[] filedStr, String queryStr) throws Exception {
		IndexReader reader = IndexReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			QueryParser queryParser = new MultiFieldQueryParser(version, filedStr, analyzer);
			Query query = queryParser.parse(queryStr);
			ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
			printScoreDoc(docs, searcher);
		} finally {
			searcher.close();
			reader.close();
		}
	}
	/**
	 * Prints every stored field (name : value) of each hit to stdout.
	 *
	 * @param docs     score docs to print
	 * @param searcher searcher used to resolve doc ids to documents
	 * @throws Exception on any Lucene retrieval failure
	 */
	public void printScoreDoc(ScoreDoc[] docs, IndexSearcher searcher) throws Exception {
		for (int i = 0; i < docs.length; i++) {
			List<Fieldable> list = searcher.doc(docs[i].doc).getFields();
			for (Fieldable fieldable : list) {
				String fieldName = fieldable.name();
				String fieldValue = fieldable.stringValue();
				System.out.println(fieldName + " : " + fieldValue);
			}
		}
	}
	/** Demo driver: full index, incremental update, delete, and multi-field search. */
	public static void main(String[] args) throws Exception {
		LuceneSearch t = new LuceneSearch();
		Document d1 = new Document();
		d1.add(new Field("id", "1", Store.YES, Index.ANALYZED));
		d1.add(new Field("name", "苦逼的程序员", Store.YES, Index.ANALYZED));
		Document d2 = new Document();
		d2.add(new Field("id", "2", Store.YES, Index.ANALYZED));
		d2.add(new Field("name", "2b的程序员", Store.YES, Index.ANALYZED));
		Document[] documentes = {d1, d2};
		System.out.println("--------------------------全量索引--------------------------");
		t.fullIndex(documentes);
		t.simpleSearch("name", "程序", 1, 10);
		t.highLightSearch("name", "程序", 1, 10);
		System.out.println("--------------------------增量索引--------------------------");
		// Replace d1's "name" field before the incremental update.
		d1.removeField("name");
		d1.add(new Field("name", "程序", Store.YES, Index.ANALYZED));
		t.updateIndex(documentes);
		t.simpleSearch("name", "程序", 1, 10);
		System.out.println("--------------------------删除索引--------------------------");
		t.deleteIndex(d1);
		t.simpleSearch("name", "2b", 1, 10);
		System.out.println("--------------------------分词查询--------------------------");
		t.multiAnalyzerSearch(new String[]{"id", "name"}, "苦逼的程序员");
	}
}
使用QueryParser接口查询时,会根据指定的分词器对查询条件先分词再搜索
这就是为什么
t.multiAnalyzerSearch(new String[]{"id","name"}, "苦逼的程序员");
能搜索出结果的原因
不过实际项目中所要求的站内搜索直接使用lucene的子项目solr就可以轻松的实现了。