一、搜索知识架构
二、例子
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Demo helper that builds a small six-document Lucene index from hard-coded
 * sample data and runs several kinds of queries against it, printing every
 * hit to standard output.
 *
 * <p>Constructing an instance opens the on-disk index directory and rebuilds
 * the index from scratch (see {@link #index()}).
 */
public class SearcherUtil {
    /** Lucene compatibility version used for the analyzer and the writer configuration. */
    private static final Version LUCENE_VERSION = Version.LUCENE_35;

    /** Boost applied to documents whose e-mail domain has no entry in {@link #scores}. */
    private static final float DEFAULT_BOOST = 0.5f;

    private Directory directory;
    /** Reader shared by all searches; opened on first use and reopened when the index changed. */
    private IndexReader reader;

    // Sample data: element i of every array describes document i.
    private final String[] ids = {"1", "2", "3", "4", "5", "6"};
    private final String[] emails = {
        "aa@itat.org", "bb@itat.org", "cc@cc.org", "dd@sina.org", "ee@zttc.edu", "ff@itat.org"
    };
    private final String[] contents = {
        "welcome to visited the space,I like book",
        "hello boy, I like pingpeng ball",
        "my name is cc I like game",
        "I like football",
        "I like football and I like basketball too",
        "I like movie and swim"
    };
    private Date[] dates = null;
    private final int[] attachs = {2, 3, 1, 4, 5, 5};
    private final String[] names = {"zhangsan", "lisi", "john", "jetty", "mike", "jake"};
    /** Document boost per e-mail domain. */
    private final Map<String, Float> scores = new HashMap<String, Float>();

    /**
     * Opens the index directory on disk, prepares the sample dates and boosts,
     * and (re)builds the index. An {@link IOException} while opening the
     * directory is printed and the instance is left without a usable index.
     */
    public SearcherUtil() {
        try {
            // On-disk index; a RAMDirectory could be used instead for an in-memory index.
            directory = FSDirectory.open(new File("d:/lucenc/index03"));
            setDates();
            scores.put("itat.org", 2.0f);
            scores.put("zttc.edu", 1.5f);
            index();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fills {@link #dates} with one parsed date per sample document.
     * If a literal fails to parse, the stack trace is printed and the
     * remaining entries stay {@code null}.
     */
    private void setDates() {
        String[] literals = {
            "2010-02-19", "2012-01-11", "2011-09-19", "2010-12-22", "2012-01-01", "2011-05-19"
        };
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        try {
            dates = new Date[ids.length];
            for (int i = 0; i < literals.length; i++) {
                dates[i] = sdf.parse(literals[i]);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the index: deletes everything already in it, then adds one
     * document per entry of the sample arrays. I/O failures are printed;
     * the writer is always closed.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory,
                    new IndexWriterConfig(LUCENE_VERSION, new StandardAnalyzer(LUCENE_VERSION)));
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                // Numeric value.
                doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
                // Date, stored as its epoch-millisecond value.
                doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));

                // Boost the document according to the domain part of its e-mail address.
                String domain = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                Float boost = scores.get(domain);
                if (boost != null) {
                    doc.setBoost(boost.floatValue());
                } else {
                    doc.setBoost(DEFAULT_BOOST);
                }
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOException subclasses.
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns a searcher over this instance's index directory.
     *
     * @return a new {@link IndexSearcher} on the shared reader, or {@code null}
     *         if the reader could not be opened or refreshed
     */
    public IndexSearcher getSearcher() {
        return openSearcher(directory);
    }

    /**
     * Returns a searcher, opening the shared reader on the given directory if
     * no reader has been opened yet. Once a reader exists it is only
     * refreshed, so the argument is not consulted again.
     *
     * @param directory directory to open when no reader exists yet
     * @return a new {@link IndexSearcher} on the shared reader, or {@code null}
     *         if the reader could not be opened or refreshed
     */
    public IndexSearcher getSearcher(Directory directory) {
        return openSearcher(directory);
    }

    /** Opens or refreshes the shared reader and wraps it in a new searcher. */
    private IndexSearcher openSearcher(Directory dir) {
        try {
            if (reader == null) {
                reader = IndexReader.open(dir);
            } else {
                // openIfChanged returns null when the current reader is still up to date.
                IndexReader refreshed = IndexReader.openIfChanged(reader);
                if (refreshed != null) {
                    reader.close();
                    reader = refreshed;
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Exact term query.
     *
     * @param field field to search
     * @param name  exact term value to match
     * @param num   maximum number of hits to fetch
     */
    public void searchByTerm(String field, String name, int num) {
        runAndPrint(new TermQuery(new Term(field, name)), num, false);
    }

    /**
     * Term range query; both bounds are inclusive.
     *
     * @param field field to search
     * @param start lower bound term
     * @param end   upper bound term
     * @param num   maximum number of hits to fetch
     */
    public void searchByTermRange(String field, String start, String end, int num) {
        runAndPrint(new TermRangeQuery(field, start, end, true, true), num, false);
    }

    /**
     * Integer range query; both bounds are inclusive.
     *
     * @param field numeric field to search
     * @param start lower bound
     * @param end   upper bound
     * @param num   maximum number of hits to fetch
     */
    public void searchByNumricRange(String field, int start, int end, int num) {
        runAndPrint(NumericRangeQuery.newIntRange(field, start, end, true, true), num, false);
    }

    /**
     * Prefix query.
     *
     * @param field field to search
     * @param value prefix the term must start with
     * @param num   maximum number of hits to fetch
     */
    public void searchByPrefix(String field, String value, int num) {
        runAndPrint(new PrefixQuery(new Term(field, value)), num, false);
    }

    /**
     * Wildcard query. The value may contain {@code ?} (exactly one character)
     * and {@code *} (any number of characters).
     *
     * @param field field to search
     * @param value wildcard pattern
     * @param num   maximum number of hits to fetch
     */
    public void searchByWildcard(String field, String value, int num) {
        runAndPrint(new WildcardQuery(new Term(field, value)), num, false);
    }

    /**
     * Boolean query combining sub-queries: name must not be "zhangsan",
     * content should contain "game".
     *
     * @param num maximum number of hits to fetch
     */
    public void searchByBoolean(int num) {
        BooleanQuery query = new BooleanQuery();
        // A BooleanQuery joins several sub-queries:
        //   Occur.MUST     - the clause must match
        //   Occur.SHOULD   - the clause may match
        //   Occur.MUST_NOT - the clause must not match
        query.add(new TermQuery(new Term("name", "zhangsan")), Occur.MUST_NOT);
        query.add(new TermQuery(new Term("content", "game")), Occur.SHOULD);
        runAndPrint(query, num, false);
    }

    /**
     * Phrase query on the content field for the terms "pingpeng" and "i"
     * with a slop of 3.
     *
     * @param num maximum number of hits to fetch
     */
    public void searchByPhrase(int num) {
        PhraseQuery query = new PhraseQuery();
        query.setSlop(3);
        // First term of the phrase.
        query.add(new Term("content", "pingpeng"));
        // Second term, matched within the slop distance of the first.
        query.add(new Term("content", "i"));
        runAndPrint(query, num, false);
    }

    /**
     * Runs an arbitrary, already-built query (typically produced by a
     * QueryParser) and prints each hit together with its score.
     *
     * @param query query to execute
     * @param num   maximum number of hits to fetch
     */
    public void searchByQueryParse(Query query, int num) {
        runAndPrint(query, num, true);
    }

    /**
     * Executes the query and prints the total hit count followed by one line
     * per returned hit. The searcher is closed even when the search fails.
     *
     * @param query        query to execute
     * @param num          maximum number of hits to fetch
     * @param includeScore whether to append {@code ==<score>} to each hit line
     */
    private void runAndPrint(Query query, int num, boolean includeScore) {
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            // getSearcher() has already printed the underlying failure.
            System.err.println("Search skipped: the index could not be opened.");
            return;
        }
        try {
            TopDocs tds = searcher.search(query, num);
            System.out.println("一共查询了:" + tds.totalHits);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                String line = doc.get("id") + "---->"
                        + doc.get("name") + "[" + doc.get("email") + "]-->" + doc.get("id") + ","
                        + doc.get("attach") + "," + doc.get("date");
                if (includeScore) {
                    line = line + "==" + sd.score;
                }
                System.out.println(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
测试类:
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.itat.index.FileIndexUtils;
import org.itat.index.SearcherUtil;
import org.junit.Before;
import org.junit.Test;
/**
 * JUnit tests that exercise the query helpers of {@link SearcherUtil}.
 * Each test prints its hits to standard output; none of them asserts on
 * the results.
 */
public class TestSearch {
    private SearcherUtil su;

    /** Creates a fresh {@link SearcherUtil} before every test. */
    @Before
    public void init() {
        su = new SearcherUtil();
    }

    /**
     * Copies every regular file in the example directory to a sibling file
     * with the same base name and the extension {@code .she}.
     *
     * @throws IOException if a copy fails, so that the test is reported as failed
     */
    @Test
    public void testCopyFiles() throws IOException {
        File dir = new File("d:/lucene/example/");
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a readable directory.
            throw new IllegalStateException("Not a readable directory: " + dir);
        }
        for (File f : files) {
            // Skip sub-directories and files that already carry the target extension
            // (copying such a file would target itself).
            if (!f.isFile() || f.getName().endsWith(".she")) {
                continue;
            }
            String destFileName = FilenameUtils.getFullPath(f.getAbsolutePath())
                    + FilenameUtils.getBaseName(f.getName()) + ".she";
            FileUtils.copyFile(f, new File(destFileName));
        }
    }

    @Test
    public void searchByTerm() {
        su.searchByTerm("content", "i", 3);
    }

    @Test
    public void searchByTermRange() {
        // Names in the term range from "a" to "s":
        // su.searchByTermRange("name","a","s",10);
        // "attach" is indexed as a numeric field, so a TermRangeQuery cannot query it.
        su.searchByTermRange("attach", "2", "10", 5);
    }

    @Test
    public void searchByNumRange() {
        su.searchByNumricRange("attach", 2, 10, 5);
    }

    @Test
    public void searchByPrefix() {
        su.searchByPrefix("content", "s", 10);
    }

    @Test
    public void searchByWildcard() {
        // Every e-mail ending in @itat.org.
        su.searchByWildcard("email", "*@itat.org", 10);
        // Names that start with "j" followed by exactly three more characters.
        su.searchByWildcard("name", "j???", 10);
    }

    @Test
    public void searchByBoolean() {
        su.searchByBoolean(10);
    }

    @Test
    public void searchByPhrase() {
        su.searchByPhrase(10);
    }

    /**
     * Walks through QueryParser syntax examples. Only the last uncommented
     * {@code query} assignment is actually executed.
     */
    @Test
    public void searchByQueryParse() throws ParseException {
        // 1. Create the QueryParser; the default search field is "content".
        QueryParser parser = new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35));
        // Change the default operator applied for a space; it can be switched to AND:
        //parser.setDefaultOperator(Operator.AND);
        // Allow a wildcard as the first character; off by default because it is slow.
        parser.setAllowLeadingWildcard(true);
        // Content containing "like".
        Query query = parser.parse("like");
        // Content containing basketball or football; a space means OR by default.
        query = parser.parse("basketball football");
        // Name the search field explicitly (here: content containing "like"):
        //query = parser.parse("content:like");
        // * and ? can be used for wildcard matching as well:
        // query = parser.parse("name:j*");
        // A wildcard may not be the first character unless leading wildcards are enabled:
        // query = parser.parse("email:*@itat.org");
        // Name must not be "mike" and content must contain "like";
        // + and - are placed in front of the field specification.
        query = parser.parse("- name:mike + like");
        // Inclusive range; note that TO must be upper case:
        //query = parser.parse("id:[1 TO 6]");
        // Exclusive (open) range; only matches 2:
        //query = parser.parse("id:{1 TO 3}");
        // Exact phrase "I like football":
        //query = parser.parse("\"I like football\"");
        // "I" and "football" with one word of distance between them:
        //query = parser.parse("\"I football\"~1");
        // Fuzzy query:
        //query = parser.parse("name:make~");
        // Numeric ranges cannot be matched this way (requires a custom parser):
        //query = parser.parse("attach:[2 TO 10]");
        su.searchByQueryParse(query, 10);
    }

    @Test
    public void indexFile() {
        FileIndexUtils.index(true);
    }

    // NOTE(review): searchPage / searchPageByAfter are not defined in the SearcherUtil
    // listing that accompanies this test; presumably they exist in the imported
    // org.itat.index.SearcherUtil - verify before running the two tests below.
    @Test
    public void testSearchPage01() {
        su.searchPage("java", 2, 20);
        System.out.println("-------------------------------");
        // su.searchNoPage("java");
        su.searchPageByAfter("java", 2, 20);
    }

    @Test
    public void testSearchPage02() {
        su.searchPageByAfter("java", 3, 20);
    }
}