对于检索来说,Lucene4.10默认提供了很多检索模式,包括模糊查询、正则匹配、通配符匹配等有用的匹配模式,但是在实际使用时需要考虑Lucene匹配的效率和系统的需求然后选择相应的匹配模式。
Lucene也提供了分页的查询方式,常用的有两种:第一种是先把结果取回,再在ScoreDoc数组中进行分页,适合数据量比较小的情况,数据量太大有可能导致内存溢出;第二种是使用searchAfter分页,每页都重新从索引中查询数据,查询速度较第一种慢,但是不会有内存溢出的情况出现,这也是推荐的用法。
注意:在Lucene中是不存在date类型的,对于日期类型来说,如果需要进行索引查询,需要将日期转换为long类型进行存储和比较。
代码最有说服力,先上代码:
package com.johnny.lucene02.search;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.ByteOrderMark;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
/**
 * Demonstrates indexing and searching a small built-in data set with Lucene 4.10: term queries,
 * term-range queries, pre-built (for example QueryParser) queries and two paging strategies.
 * All search methods print their results to standard output.
 *
 * For Chinese text the StandardAnalyzer used here is of little use for searching; substitute a
 * Chinese analyzer instead.
 *
 * @author Johnny
 */
public class SearchUtil {
    /** Index location used by the no-argument constructor. */
    private static final String DEFAULT_INDEX_PATH = "/Users/ChinaMWorld/Desktop/index/";
    /** Number of top hits searchPage loads into memory before slicing out one page. */
    private static final int IN_MEMORY_PAGING_LIMIT = 500;
    private static final Version LUCENE_VERSION = Version.LUCENE_4_10_2;

    private Directory directory;
    private DirectoryReader reader = null;

    // Parallel sample arrays: element i of each array describes document i.
    private final String[] ids = {"1","2","3","4","5","6"};
    private final String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
    private final String[] contents = {
        "welcome to visited the space,I like book java",
        "hello boy, I like pingpeng ball",
        "my name is cc I like game java",
        "I like football",
        "I like football and I like basketball too",
        "I like movie and swim java"
    };
    private Date[] dates = null;
    private final int[] attachs = {2,3,1,4,5,5};
    private final String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};

    /** Creates a utility backed by the default on-disk index location. */
    public SearchUtil() {
        this(new File(DEFAULT_INDEX_PATH));
    }

    /**
     * Creates a utility backed by an on-disk index in the given directory.
     * A failure to open the directory is printed, not thrown; later calls then report that the
     * index is unavailable instead of failing with a NullPointerException.
     *
     * @param indexDir directory that holds (or will hold) the index
     */
    public SearchUtil(File indexDir) {
        // The sample dates do not depend on the directory, so prepare them first: a directory
        // failure must not leave them unset.
        setDates();
        try {
            directory = FSDirectory.open(indexDir);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses the sample dates; Lucene has no date type, so they are later indexed as long values. */
    private void setDates() {
        String[] raw = {"2010-02-19", "2012-01-11", "2011-09-19", "2010-12-22", "2012-01-01", "2011-05-19"};
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        try {
            dates = new Date[ids.length];
            for (int i = 0; i < raw.length; i++) {
                dates[i] = sdf.parse(raw[i]);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the six sample documents to the index and prints the mail domain of each one.
     * Existing documents are not deleted, so calling this repeatedly on the same directory adds
     * the sample documents again.
     */
    public void index() {
        if (directory == null) {
            System.err.println("Index directory is not available; nothing was indexed.");
            return;
        }
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(LUCENE_VERSION, new StandardAnalyzer()));
            //writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new StringField("id", ids[i], Store.YES));
                doc.add(new StringField("email", emails[i], Store.YES));
                // Analyzed but not stored: searches can match it, doc.get("content") cannot read it back.
                doc.add(new TextField("content", contents[i], Store.NO));
                doc.add(new StringField("name", names[i], Store.YES));
                // Numeric field.
                doc.add(new IntField("attach", attachs[i], Store.YES));
                // Dates are stored as epoch milliseconds.
                doc.add(new LongField("date", dates[i].getTime(), Store.YES));
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                System.out.println(et);
                // In Lucene 4.x a boost can only be set on a field, not on a document; to boost
                // a whole document, boost each of its fields.
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }

    /**
     * Returns a searcher over the current state of the index. The underlying reader is opened on
     * first use and replaced by a fresh one whenever the index has changed since.
     *
     * @return a searcher, or null when the directory is unavailable or the index cannot be read
     */
    public IndexSearcher getSearcher() {
        if (directory == null) {
            return null;
        }
        try {
            if (reader == null) {
                reader = DirectoryReader.open(directory);
            } else {
                // openIfChanged returns null when the current reader is still up to date.
                DirectoryReader refreshed = DirectoryReader.openIfChanged(reader);
                if (refreshed != null) {
                    reader.close();
                    reader = refreshed;
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            // Also covers CorruptIndexException, which is an IOException.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Releases the index reader and the directory. The instance cannot search or index afterwards.
     */
    public void close() {
        try {
            if (reader != null) {
                reader.close();
                reader = null;
            }
            if (directory != null) {
                directory.close();
                directory = null;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches one field for an exact term and prints the hits.
     * A TermQuery cannot query numeric or date fields: dates have to be converted to numbers,
     * and numbers are queried with NumericRangeQuery.
     *
     * @param field field to search
     * @param name exact term to look for
     * @param num maximum number of hits to print
     */
    public void searchByTerm(String field,String name,int num) {
        printHits(new TermQuery(new Term(field, name)), num, false);
    }

    /**
     * Prints the documents whose term in the given field lies in the inclusive range
     * [start, end], compared as strings. A null bound leaves that side of the range open.
     *
     * @param field field to search
     * @param start lower bound (inclusive)
     * @param end upper bound (inclusive)
     * @param num maximum number of hits to print
     */
    public void searchByTermRange(String field,String start,String end,int num) {
        // newStringRange encodes the bounds as UTF-8; String.getBytes() would use the
        // platform default charset instead.
        printHits(TermRangeQuery.newStringRange(field, start, end, true, true), num, false);
    }

    /**
     * Runs an already built query (typically produced by a QueryParser) and prints the hits
     * together with their scores.
     *
     * @param query query to run
     * @param num maximum number of hits to print
     */
    public void searchByQueryParse(Query query,int num) {
        printHits(query, num, true);
    }

    /** Runs the query and prints the total hit count followed by one line per returned hit. */
    private void printHits(Query query, int num, boolean withScore) {
        if (query == null) {
            System.err.println("No query to run.");
            return;
        }
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            System.err.println("Index is not available; search skipped.");
            return;
        }
        try {
            TopDocs tds = searcher.search(query, num);
            System.out.println("一共查询了:"+tds.totalHits);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                String line = doc.get("id")+"---->"+
                        doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                        doc.get("attach")+","+doc.get("date");
                if (withScore) {
                    line = line+"=="+sd.score;
                }
                System.out.println(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses a query string against the content field; prints the error and returns null on failure. */
    private Query parseContentQuery(String query) {
        if (query == null) {
            return null;
        }
        try {
            return new QueryParser("content", new StandardAnalyzer()).parse(query);
        } catch (org.apache.lucene.queryparser.classic.ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the sample content for a document id. The content field is not stored in the
     * index, so it is looked up in the source data by id.
     *
     * @return the content, or null when the id is null or unknown
     */
    private String contentForId(String id) {
        for (int j = 0; j < ids.length; j++) {
            if (ids[j].equals(id)) {
                return contents[j];
            }
        }
        return null;
    }

    /** Prints a message and returns false when the paging arguments are not both at least 1. */
    private boolean validPage(int pageIndex, int pageSize) {
        if (pageIndex < 1 || pageSize < 1) {
            System.err.println("pageIndex and pageSize must both be >= 1");
            return false;
        }
        return true;
    }

    /**
     * Prints one page of hits by loading the top hits into memory and slicing out the page.
     * Only the first 500 hits are loaded, so later hits are unreachable with this method.
     * A page past the last hit prints nothing.
     *
     * @param query query string, parsed against the content field
     * @param pageIndex page number, starting at 1
     * @param pageSize number of hits per page
     */
    public void searchPage(String query,int pageIndex,int pageSize) {
        if (!validPage(pageIndex, pageSize)) {
            return;
        }
        Query q = parseContentQuery(query);
        if (q == null) {
            return;
        }
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            System.err.println("Index is not available; search skipped.");
            return;
        }
        try {
            ScoreDoc[] sds = searcher.search(q, IN_MEMORY_PAGING_LIMIT).scoreDocs;
            // long arithmetic so that huge page numbers cannot overflow into a negative index
            long start = (long) (pageIndex - 1) * pageSize;
            long end = Math.min(start + pageSize, (long) sds.length);
            for (long i = start; i < end; i++) {
                ScoreDoc sd = sds[(int) i];
                Document doc = searcher.doc(sd.doc);
                System.out.println(sd.doc+":"+doc.get("name")+"-->"+contentForId(doc.get("id")));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Finds the last hit of the page preceding pageIndex, to be used as the searchAfter anchor.
     *
     * @return the anchor; null for the first page, and also null when the query has fewer hits
     *         than the preceding pages require (the requested page does not exist)
     */
    private ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws IOException {
        if (pageIndex == 1) {
            return null; // the first page has no predecessor
        }
        long skipped = (long) pageSize * (pageIndex - 1); // hits on all preceding pages
        if (skipped > Integer.MAX_VALUE) {
            return null;
        }
        ScoreDoc[] previous = searcher.search(query, (int) skipped).scoreDocs;
        if (previous.length < skipped) {
            return null;
        }
        return previous[previous.length - 1];
    }

    /**
     * Prints one page of hits using searchAfter: only pageSize hits following the last hit of
     * the previous page are fetched for the page itself. Locating that last hit still collects
     * the hits of all preceding pages. A page past the last hit prints nothing.
     *
     * @param query query string, parsed against the content field
     * @param pageIndex page number, starting at 1
     * @param pageSize number of hits per page
     */
    public void searchPageByAfter(String query,int pageIndex,int pageSize) {
        if (!validPage(pageIndex, pageSize)) {
            return;
        }
        Query q = parseContentQuery(query);
        if (q == null) {
            return;
        }
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            System.err.println("Index is not available; search skipped.");
            return;
        }
        try {
            // Last hit of the previous page.
            ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, q, searcher);
            if (pageIndex > 1 && lastSd == null) {
                return; // requested page lies past the last hit
            }
            // The pageSize hits that follow it; a null anchor means "start from the top".
            TopDocs tds = searcher.searchAfter(lastSd, q, pageSize);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                System.out.println(doc.get("name")+"-->"+contentForId(doc.get("id")));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
测试代码如下:
package com.johnny.lucene02.search;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
/**
 * Manual JUnit 4 smoke tests for SearchUtil. The tests make no assertions; they print the
 * search results so they can be inspected by eye.
 * NOTE(review): the search tests presumably need an index created by testInex() beforehand,
 * and JUnit does not guarantee execution order - confirm before relying on a full run.
 */
public class TestSearch {
    private SearchUtil su = null;

    @Before
    public void init(){
        su = new SearchUtil();
    }

    /** Writes the sample documents to the index. */
    @Test
    public void testInex(){
        su.index();
    }

    @Test
    public void testSearchByTerm(){
        su.searchByTerm("content","i", 3);
    }

    @Test
    public void testSearchByTermRange(){
        // Names inside the inclusive string range [a, s] (a range, not prefix/suffix matching).
        su.searchByTermRange("name","a","s",10);
        // attach is a numeric field, so a term range query cannot search it.
        System.out.println("------------");
        su.searchByTermRange("attach","2","10", 5);
    }

    @Test
    public void testSearchByQueryParse() throws Exception{
        // 1. Create the QueryParser; the default search field is content.
        QueryParser parser = new QueryParser( "content", new StandardAnalyzer());
        // Change the default operator applied to whitespace; it can be switched to AND.
        //parser.setDefaultOperator(Operator.AND);
        // Allow a wildcard as the first character; off by default because it is slow.
        parser.setAllowLeadingWildcard(true);
        // The active query: name is not mike, and content must contain both like and
        // pingpeng. + and - go in front of the field specification.
        Query query = parser.parse("-name:mike +like +pingpeng");
        // Alternatives - assign one of these instead to try it out.
        // content contains like
        //query = parser.parse("like");
        // basketball or football; whitespace means OR by default
        //query = parser.parse("basketball football");
        // Name the search field explicitly: content contains like
        //query = parser.parse("content:like");
        // * and ? work as wildcards
        // query = parser.parse("name:j*");
        //query = parser.parse("email:*@itat.org");
        // Inclusive range; note that TO must be upper case
        //query = parser.parse("id:[1 TO 6]");
        // Exclusive range: only 2 matches
        //query = parser.parse("id:{1 TO 3}");
        // Exact phrase "I like football"
        //query = parser.parse("\"I like football\"");
        // I and football with at most one word between them
        //query = parser.parse("\"I football\"~1");
        // Fuzzy query
        //query = parser.parse("name:make~");
        // Numeric ranges cannot be matched this way (extend the parser yourself)
        //query = parser.parse("attach:[2 TO 10]");
        su.searchByQueryParse(query, 10);
    }

    @Test
    public void testSearchPage() {
        su.searchPage("java", 1,20);
    }

    @Test
    public void testSearchPageByAfter() {
        su.searchPageByAfter("java", 1,20);
    }
}