Lucene搜索功能
实现搜索功能,主要的API
示例:
/**
 * Runs a query against the index at {@code indexDir} and prints the
 * "fullpath" field of each of the top 10 matching documents, plus a
 * hit-count/timing summary on stderr.
 *
 * @param indexDir path of a directory containing a Lucene index
 * @param q        query expression, parsed against the "contents" field
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code q} is not a valid query expression
 */
public static void search(String indexDir, String q) throws IOException, ParseException {
    Directory dir = FSDirectory.open(new File(indexDir));
    IndexSearcher is = new IndexSearcher(dir);
    try {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30));
        Query query = parser.parse(q);
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10);
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            System.out.println(doc.get("fullpath"));
        }
    } finally {
        // FIX: the searcher and directory were leaked; release them even
        // when parse() or search() throws.
        is.close();
        dir.close();
    }
}
近实时搜索
Lucene提供了IndexReader.reopen()方法来支持近实时搜索。这里的“实时”是相对的,并不能做到完全严格的实时。需要注意的是,在JVM内存占用较多时,IndexReader.reopen()会比较慢。
示例:
package com.lucene._3_3;
import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* Created by xun.zhang on 2017/10/30.
*/
/**
 * Demonstrates Lucene 3.x near-real-time search: IndexWriter.getReader()
 * exposes uncommitted changes, and IndexReader.reopen() refreshes the view
 * after further writes.
 */
public class NearRealTimeTest extends TestCase {
    public void testNearReadTime() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED);
        for (int i = 0; i < 10; i++) {
            Document doc = new Document();
            doc.add(new Field("id", "" + i, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field("text", "aaa", Field.Store.NO, Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        // NRT reader: sees the 10 documents above without a commit.
        IndexReader reader = writer.getReader();
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new TermQuery(new Term("text", "aaa"));
        TopDocs docs = searcher.search(query, 1);
        assertEquals(10, docs.totalHits);
        // Mutate the index: delete one document, add another.
        writer.deleteDocuments(new Term("id", "7"));
        Document doc = new Document();
        doc.add(new Field("id", "11", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field("text", "bbb", Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(doc);
        // reopen() returns a new reader instance when the index changed.
        IndexReader newReader = reader.reopen();
        assertFalse(reader == newReader);
        // FIX: close the searcher over the old reader before discarding it
        // (closing a searcher built from a reader does not close the reader).
        searcher.close();
        reader.close();
        searcher = new IndexSearcher(newReader);
        TopDocs hits = searcher.search(query, 10);
        assertEquals(9, hits.totalHits);
        query = new TermQuery(new Term("text", "bbb"));
        hits = searcher.search(query, 1);
        assertEquals(1, hits.totalHits);
        searcher.close(); // FIX: the second searcher was leaked as well
        newReader.close();
        writer.close();
    }
}
评分机制
Lucene的评分公式
有一位网友的博客写得非常好,这里引用一下:Lucene评分公式
explain()查看评分结果
package com.lucene._3_4;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
/**
* Created by xun.zhang on 2017/10/30.
*/
/**
 * Command-line utility: for every hit of a query it prints the document's
 * "filename" field followed by Lucene's score Explanation for that hit.
 */
public class Explainer {
    public static void main(String[] args) throws IOException, ParseException {
        if (args.length != 2) {
            System.err.print("Usage: Explainer <index dir> <query>");
            System.exit(1);
        }
        String indexDir = args[0];
        String queryExpression = args[1];

        Directory directory = FSDirectory.open(new File(indexDir));
        Query query = new QueryParser(Version.LUCENE_30, "contents", new SimpleAnalyzer())
                .parse(queryExpression);
        System.out.println("Query: " + queryExpression);

        IndexSearcher searcher = new IndexSearcher(directory);
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc hit : hits.scoreDocs) {
            // explain() breaks the hit's score down into its component factors.
            Explanation explanation = searcher.explain(query, hit.doc);
            System.out.println("-----------------");
            Document matchedDoc = searcher.doc(hit.doc);
            System.out.println(matchedDoc.get("filename"));
            System.out.println(explanation.toString());
        }
        searcher.close();
        directory.close();
    }
}
多个Query
TermQuery类(通过项进行搜索)
对索引中特定项进行搜索是最基本的搜索方式。Term是最小的索引片段,每个Term包含了一个域名和一个文本值。
Term和Field的对照关系:
// Searches the "isbn" keyword field for one exact (non-analyzed) term.
// NOTE(review): excerpt — in the book this searcher is opened over a
// pre-built book index; an empty RAMDirectory as written would hold no
// documents. Verify against the original source.
public void testKeyword() throws IOException {
Directory dir = new RAMDirectory();
IndexSearcher searcher = new IndexSearcher(dir);
// A Term pairs a field name with an exact value; no analysis is applied.
Term t = new Term("isbn","987577656789");
Query query = new TermQuery(t);
TopDocs docs = searcher.search(query,10);
// Exactly one book is expected to carry this ISBN.
assertEquals("JUnit in Action, Second Edition",1,docs.totalHits);
searcher.close();
dir.close();
}
TermRangeQuery(指定范围内搜索)
索引中各个Term会按照字符进行排序,比如从a-z。TermRangeQuery是通过排序后的字符范围进行搜索。
// TermRangeQuery: matches terms lexicographically between "d" and "j" in
// the "title2" field; the two 'true' flags make both endpoints inclusive.
// NOTE(review): excerpt — the book runs this against a pre-built index;
// an empty RAMDirectory would yield 0 hits, not 3. TODO confirm.
public void testTermRangeQuery() throws IOException {
Directory dir = new RAMDirectory();
IndexSearcher searcher = new IndexSearcher(dir);
TermRangeQuery query = new TermRangeQuery("title2","d","j",true,true);
TopDocs matches = searcher.search(query,100);
assertEquals(3,matches.totalHits);
searcher.close();
dir.close();
}
NumericRangeQuery(指定数值范围内搜索)
NumericRangeQuery可以用来搜索通过NumericField进行索引的域,比如按时间查找等。
// NumericRangeQuery over an int field: both endpoints inclusive
// (true, true), i.e. 200605 <= pubmonth <= 200609.
// NOTE(review): excerpt — the book opens a pre-built index here; an empty
// RAMDirectory would contain no "pubmonth" values. TODO confirm.
public void testInclusive() throws IOException {
    Directory dir = new RAMDirectory();
    IndexSearcher searcher = new IndexSearcher(dir);
    NumericRangeQuery query = NumericRangeQuery.newIntRange("pubmonth", 200605, 200609, true, true);
    TopDocs matches = searcher.search(query, 10);
    assertEquals(1, matches.totalHits);
    searcher.close(); // FIX: searcher was leaked; sibling tests close theirs
    dir.close();
}
PrefixQuery(字符串搜索)
用来搜索包含以指定字符串开头的项的文档。
package com.lucene._3_5;
import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
/**
* Created by xun.zhang on 2017/10/31.
*/
/**
 * Shows that a PrefixQuery matches a category and everything nested below
 * it, while a TermQuery on the same term matches only the exact category.
 */
public class PrefixQueryTest extends TestCase {
    public void testPrefix() throws IOException {
        Directory dir = new RAMDirectory();
        IndexSearcher searcher = new IndexSearcher(dir);
        Term term = new Term("category","/technology/computers/programming");

        // Prefix search: the category itself plus all sub-categories.
        TopDocs matches = searcher.search(new PrefixQuery(term), 10);
        int programmingAndBelow = matches.totalHits;

        // Exact-term search: only the category itself.
        matches = searcher.search(new TermQuery(term), 10);
        int justProgramming = matches.totalHits;

        assertTrue(programmingAndBelow > justProgramming);
        searcher.close();
        dir.close();
    }
}
BooleanQuery(组合查询)
使用BooleanQuery可以把各种Query通过逻辑关系AND、OR、NOT组合起来搜索。
逻辑关系常量:
- BooleanClause.Occur.MUST
- BooleanClause.Occur.SHOULD
- BooleanClause.Occur.MUST_NOT
// MUST + MUST combination: both clauses are required (logical AND) —
// books whose subject contains "search" AND whose pubmonth is in 2010.
public void testAnd() throws IOException {
TermQuery searchingBooks = new TermQuery(new Term("subject","search"));
Query books2010 = NumericRangeQuery.newIntRange("pubmonth",201001,201012,true,true);
BooleanQuery searchingBooks2010 = new BooleanQuery();
searchingBooks2010.add(searchingBooks, BooleanClause.Occur.MUST);
searchingBooks2010.add(books2010,BooleanClause.Occur.MUST);
Directory dir = new RAMDirectory();
IndexSearcher searcher = new IndexSearcher(dir);
// NOTE(review): no assertion on the result, and the directory is empty in
// this excerpt — the book version searches a pre-built index. TODO confirm.
TopDocs matches = searcher.search(searchingBooks2010,10);
searcher.close();
dir.close();
}
// SHOULD + SHOULD combination: a document matches if either category
// clause matches (logical OR).
public void testOr() throws IOException {
    BooleanQuery enlightenmentBooks = new BooleanQuery();
    enlightenmentBooks.add(new TermQuery(new Term("category","/tech/com/pro/method")), BooleanClause.Occur.SHOULD);
    enlightenmentBooks.add(new TermQuery(new Term("category","/phil/eastern")), BooleanClause.Occur.SHOULD);

    Directory dir = new RAMDirectory();
    IndexSearcher searcher = new IndexSearcher(dir);
    TopDocs docs = searcher.search(enlightenmentBooks, 10);
    // Print the composed query in query-parser syntax.
    System.out.println("or = " + enlightenmentBooks);
    searcher.close();
    dir.close();
}
PhraseQuery(短语搜索)
PhraseQuery类会根据索引中的项位置信息,定位彼此距离在一定范围内的项所对应的文档。例如:假设某个域中包含短语"the quick brown fox jumped over the lazy dog",即使我们不知道这个短语的完整写法,也一样可以查找该域中quick和fox相关并且距离很近的文档。这里有一个slop因子,表示两个项之间最多允许间隔多少个项的位置(注意是项的位置数,而不是字符数)也能搜索得到。
package com.lucene._3_8;
import com.sun.org.apache.xpath.internal.WhitespaceStrippingElementMatcher;
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
/**
* Created by xun.zhang on 2017/10/31.
*/
// Fixture for PhraseQuery/slop experiments over a single known sentence.
// NOTE(review): the test methods that call matched() appear to have been
// omitted from this excerpt — only the fixture and helper are shown.
public class PhraseQueryTest extends TestCase {
private Directory dir;
private IndexSearcher searcher;
// Indexes one document containing the example sentence, then opens a searcher.
protected void setUp() throws IOException {
dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),IndexWriter.MaxFieldLength.UNLIMITED);
Document doc = new Document();
doc.add(new Field("field","the quick brown fox jumped over the lazy dog",Field.Store.YES,Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
searcher = new IndexSearcher(dir);
}
// Releases the searcher and directory opened in setUp().
protected void tearDown() throws IOException {
searcher.close();
dir.close();
}
// Builds a PhraseQuery from the given words (in order) with the given
// slop and reports whether it matches the indexed sentence.
private boolean matched(String[] phrase, int slop) throws IOException {
PhraseQuery query = new PhraseQuery();
query.setSlop(slop);
for (String word:phrase
) {
query.add(new Term("field",word));
}
TopDocs matches = searcher.search(query,10);
return matches.totalHits > 0;
}
}
WildcardQuery(通配符搜索)
两个通配符:
- *,标识0个或多个字母
- ?,表示恰好1个字母
package com.lucene._3_9;
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.File;
import java.io.IOException;
/**
* Created by xun.zhang on 2017/10/31.
*/
/** Demonstrates WildcardQuery: '*' matches any run of characters, '?' exactly one. */
public class WildcardQueryTest extends TestCase {
    // Shared across tests in this excerpt (testFuzzy also indexes into it).
    private static Directory dir = new RAMDirectory();

    /** Indexes one single-field document per supplied field. */
    private void indexSingleFieldDocs(Field[] fields) throws IOException {
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
        for (Field f : fields) {
            Document doc = new Document();
            doc.add(f);
            writer.addDocument(doc);
        }
        writer.optimize();
        writer.close();
    }

    public void testWildcard() throws IOException {
        indexSingleFieldDocs(new Field[]{new Field("contents", "wild", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "child", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "mild", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "mildew", Field.Store.YES, Field.Index.ANALYZED)});
        IndexSearcher searcher = new IndexSearcher(dir);
        // '?' matches exactly one character, so "?ild" hits "wild" and "mild"
        // but not "child" (5 chars) or "mildew" (6 chars).
        Query query = new WildcardQuery(new Term("contents", "?ild"));
        TopDocs docs = searcher.search(query, 10);
        assertEquals("wild and mild", 2, docs.totalHits); // FIX: test had no assertion
        searcher.close();
    }
}
FuzzyQuery(搜索类似项)
FuzzyQuery用来解决相似字搜索的问题,使用Levenshtein距离算法。
// FuzzyQuery with default minimum similarity: "wuzza" is within edit
// distance of both "fuzzy" and "wuzzy", and "wuzzy" (distance 1) scores
// higher than "fuzzy" (distance 2).
// NOTE(review): relies on the sibling indexSingleFieldDocs() helper and the
// class's shared static dir; other tests may have added unrelated docs.
public void testFuzzy() throws IOException {
indexSingleFieldDocs(new Field[]{new Field("contents", "fuzzy", Field.Store.YES, Field.Index.ANALYZED),
new Field("contents", "wuzzy", Field.Store.YES, Field.Index.ANALYZED)});
IndexSearcher searcher = new IndexSearcher(dir);
Query query = new FuzzyQuery(new Term("contents", "wuzza"));
TopDocs matches = searcher.search(query, 10);
assertEquals("both close enough", 2, matches.totalHits);
// Only asserts the scores differ; the next assert pins the ordering.
assertTrue("wuzzy closer than fuzzy", matches.scoreDocs[0].score != matches.scoreDocs[1].score);
Document doc = searcher.doc(matches.scoreDocs[0].doc);
assertEquals("wuzza bear", "wuzzy", doc.get("contents"));
searcher.close();
}
MatchAllDocsQuery(匹配所有文档)
MatchAllDocsQuery,顾名思义,就是匹配索引中所有的文档。
QueryParser类,及其表达式
尽管使用API创建的Query很强大,不过看起来仍然感觉很笨拙。Lucene提供了QueryParser类来分析传入的表达式,自动创建多个Query的组合查询。
QueryParser处理的表达式范例:
- java,默认域包含java项的文档
- java junit(java OR junit),默认域包含java和junit中一个或两个的文档
- +java +junit(java AND junit),默认域中同时包含java和junit文档
- title:ant,title域中包含ant项的文档
- title:extreme -subject:sports(title:extreme AND NOT subject:sports),title域中包含extreme且subject域中不包含sports的文档
- (agile OR extreme) AND methodology,默认域中包含methodology且包含agile和extreme中的一个或两个的文档
- title:”junit in action”,title域为junit in action的文档
- title:”junit action”~5,title域中junit和action之间的距离小于5的文档
- java*,包含由java开头的项的文档
- java~,包含与单词java相近的项的文档,如lava
- lastmodified:[1/1/09 TO 12/31/09],lastmodified域值在2009年1月1号和2009年12月31号之间的文档
注意:如果要搜索的内容本身含有这些特殊字符 \ + - ! ( ) : ^ ] { } ~ * ? 时,需要使用转义符 \ 进行转义。
Query.toString方法
Query.toString方法可以把Query翻译成搜索表达式。
// Query.toString(field) renders a Query as a query-parser expression,
// omitting the field prefix for clauses on the given default field.
public void testToString() {
BooleanQuery query = new BooleanQuery();
query.add(new FuzzyQuery(new Term("field", "kountry")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term("title", "western")), BooleanClause.Occur.SHOULD);
// "+" marks the MUST clause; "~0.5" is FuzzyQuery's default minimum
// similarity; "title:" is kept because it differs from the default field.
assertEquals("both kinds", "+kountry~0.5 title:western", query.toString("field"));
}
TermQuery单项查询
// A single bare word parses to a TermQuery on the parser's default field.
public void testTermQuery() throws ParseException {
    QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", new StandardAnalyzer(Version.LUCENE_30));
    Query query = parser.parse("computers");
    System.out.println("term: " + query);
}
项范围查询
// The bracket syntax field:[X TO Y] parses into a TermRangeQuery.
public void testTermRangeQuery() throws ParseException, IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
    Query query = parser.parse("title2:[Q TO V]");
    assertTrue(query instanceof TermRangeQuery);
    // FIX: removed an unused RAMDirectory local that was never searched or closed.
}
前缀查询和通配符查询
// QueryParser lowercases expanded terms (prefix/wildcard/fuzzy) by default;
// setLowercaseExpandedTerms(false) preserves the user's original case.
public void testLowercasing() throws ParseException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    // FIX: the default field must be "field" — with "subject",
    // query.toString("field") renders "subject:prefixquery*" and both
    // assertions below fail.
    QueryParser parser = new QueryParser(Version.LUCENE_30, "field", analyzer);
    Query query = parser.parse("PrefixQuery*");
    assertEquals("lowercased","prefixquery*", query.toString("field"));
    parser.setLowercaseExpandedTerms(false);
    query = parser.parse("PrefixQuery*");
    assertEquals("not lowercased","PrefixQuery*", query.toString("field"));
}
设置默认布尔操作符
// Make QueryParser combine bare multi-term expressions with AND instead of
// its default OR.
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
parser.setDefaultOperator(QueryParser.AND_OPERATOR);
短语查询
// Quoted text parses to a PhraseQuery; StandardAnalyzer drops the stop
// words "This" and "is", which show up as '?' position holes.
public void testPhraseQuery() throws ParseException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "field", analyzer);
    Query query = parser.parse("\"This is Some Phrase\"");
    assertEquals("analyzed", "\"? ? some phrase\"", query.toString("field"));
    // FIX: a multi-term PhraseQuery is not a TermQuery, so the original
    // assertion could not pass — the single-term parse that QueryParser
    // reduces to a TermQuery was dropped in transcription; restore it.
    query = parser.parse("\"term\"");
    assertTrue("reduced to TermQuery", query instanceof TermQuery);
}
// setPhraseSlop() applies a default slop to phrases parsed afterwards,
// rendered as the trailing "~N" in query syntax.
public void testSlop() throws ParseException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "field", analyzer);
    Query query = parser.parse("\"exact phrase\"");
    assertEquals("zero slop", "\"exact phrase\"", query.toString("field"));
    parser.setPhraseSlop(5);
    query = parser.parse("\"sloppy phrase\"");
    // FIX: toString() with no argument prefixes the field name
    // ("field:\"sloppy phrase\"~5"); use toString("field") so the
    // expected string without the prefix matches.
    assertEquals("sloppy phrase", "\"sloppy phrase\"~5", query.toString("field"));
}
模糊查询
// The '~' suffix in query syntax produces a FuzzyQuery.
public void testFuzzyQuery() throws ParseException {
    QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", new StandardAnalyzer(Version.LUCENE_30));

    // Bare '~' uses the default minimum similarity.
    Query fuzzy = parser.parse("kountry~");
    System.out.println("fuzzy: " + fuzzy);

    // An explicit value after '~' sets the similarity threshold.
    fuzzy = parser.parse("kountry~0.7");
    System.out.println("fuzzy 2: " + fuzzy);
}
分组查询
// Parentheses group the OR clause before it is ANDed with "methodology".
// NOTE(review): excerpt — the book runs this against a pre-built index; an
// empty RAMDirectory yields no hits, and there is no assertion. TODO confirm.
public void testGrouping() throws ParseException, IOException {
    Directory dir = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
    Query query = parser.parse("(agile OR extreme) AND methodology");
    IndexSearcher searcher = new IndexSearcher(dir);
    TopDocs matches = searcher.search(query, 10);
    searcher.close(); // FIX: searcher was leaked
    dir.close();      // FIX: directory was leaked
}
为子查询加权
比如有个表达式如下:
Query query = parser.parse("junit^2.0 testing");
通过QueryParser分析后,会生成两个TermQuery的OR组合查询,其中junit的TermQuery的权重为2.0,testing的TermQuery的权重为1.0。
参考:
Lucene实战(第2版)著:Michael McCandles、Erik Hatcher、Otis Gospodnetic 译:牛长流、肖宇