FileIndexUtil.java
package org.itat.index;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index over the files found in a fixed example directory.
 *
 * <p>The index is stored in {@code d:/lucene/files} and the files to be indexed are read
 * from {@code d:/lucene/example}; both paths are hard-coded. Failures are reported on
 * standard error and never propagate to the caller.
 */
public class FileIndexUtil {
    /** Shared index directory; remains {@code null} if it could not be opened. */
    private static Directory directory = null;

    static {
        try {
            directory = FSDirectory.open(new File("d:/lucene/files"));
        } catch (Exception e) {
            // directory stays null; index() reports that instead of failing obscurely.
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared index directory.
     *
     * @return the directory opened at class-load time, or {@code null} if opening it failed
     */
    public static Directory getDirectory() {
        return directory;
    }

    /**
     * Indexes every regular file directly inside {@code d:/lucene/example}.
     *
     * <p>Each document gets the fields {@code content} (tokenized file text, not stored),
     * {@code filename}, {@code path}, {@code date} (last-modified, milliseconds) and
     * {@code size} (length in whole KiB). Sub-directories are skipped.
     *
     * @param hasNew when {@code true} all existing documents are deleted first, so the
     *               index is rebuilt from scratch; when {@code false} documents are appended
     */
    public static void index(boolean hasNew) {
        if (directory == null) {
            System.err.println("FileIndexUtil: index directory is not available, nothing indexed");
            return;
        }
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            if (hasNew) {
                writer.deleteAll();
            }
            File source = new File("d:/lucene/example");
            // listFiles() returns null (not an empty array) when the path is missing
            // or is not a directory.
            File[] files = source.listFiles();
            if (files == null) {
                System.err.println("FileIndexUtil: cannot list " + source.getAbsolutePath());
                files = new File[0];
            }
            for (File f : files) {
                if (!f.isFile()) {
                    // FileReader cannot open a directory; skipping keeps one
                    // sub-directory from aborting the whole run.
                    continue;
                }
                addFile(writer, f);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Adds one document describing {@code f} to {@code writer} and releases the file handle. */
    private static void addFile(IndexWriter writer, File f) throws IOException {
        // NOTE: FileReader decodes with the platform default charset.
        FileReader content = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("content", content));
            doc.add(new Field("filename", f.getName(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
            doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
            doc.add(new NumericField("date", Field.Store.YES, true)
                    .setLongValue(f.lastModified()));
            // Divide as long first, then narrow: casting before the division
            // truncates the byte count of files of 2 GiB or more.
            doc.add(new NumericField("size", Field.Store.YES, true)
                    .setIntValue((int) (f.length() / 1024)));
            writer.addDocument(doc);
        } finally {
            // Closing again is harmless if the reader was already closed during analysis.
            content.close();
        }
    }
}
SearcherUtil.java
package org.itat.index;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Demonstrates the common Lucene 3.5 query types against a small in-memory index of six
 * sample e-mail records, plus simple (paged) searching over the file index exposed by
 * {@link FileIndexUtil#getDirectory()}.
 *
 * <p>All search methods print their hits to standard output and report failures with
 * {@code printStackTrace()} instead of throwing. No synchronization is performed.
 */
public class SearcherUtil {
    /** In-memory directory holding the sample e-mail index. */
    private Directory directory;
    /** Cached reader over {@link #directory}; reopened when the index changes. */
    private IndexReader reader;
    /** Directory last passed to {@link #getSearcher(Directory)}. */
    private Directory externalDirectory;
    /** Cached reader over {@link #externalDirectory}, kept apart from {@link #reader}. */
    private IndexReader externalReader;

    private String[] ids = { "1", "2", "3", "4", "5", "6" };
    private String[] emails = { "aa@itat.org", "bb@itat.org", "cc@cc.org",
            "dd@sina.org", "ee@zttc.edu", "ff@itat.org" };
    private String[] contents = { "welcome to visited the space,I like book",
            "hello boy,I like pingpeng ball", "my name is cc I like game",
            "I like football", "I like football and I like basketball too",
            "I like movie and swim" };
    /** Mail dates, filled in by {@link #setDates()}. */
    private Date[] dates = null;
    private int[] attachs = { 2, 3, 1, 4, 5, 5 };
    private String[] names = { "zhangsan", "lisi", "john", "jetty", "mike",
            "jake" };
    /** Document boost per e-mail domain; domains without an entry get 0.5. */
    private Map<String, Float> scores = new HashMap<String, Float>();

    /** Creates the in-memory directory and indexes the sample records into it. */
    public SearcherUtil() {
        directory = new RAMDirectory();
        setDates();
        index();
    }

    /**
     * Returns a searcher over the in-memory sample index, reopening the cached reader if
     * the index has changed since the last call.
     *
     * @return a new searcher, or {@code null} if the reader could not be opened
     */
    public IndexSearcher getSearcher() {
        try {
            reader = refresh(reader, directory);
            return new IndexSearcher(reader);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Returns a searcher over the given directory.
     *
     * <p>The reader for an external directory is cached separately from the reader of the
     * in-memory index, so calling both {@code getSearcher} variants on one instance never
     * hands back a searcher over the wrong index. Passing a different directory than on
     * the previous call closes the old cached reader and opens a new one.
     *
     * @param directory the index directory to search
     * @return a new searcher, or {@code null} if the reader could not be opened
     */
    public IndexSearcher getSearcher(Directory directory) {
        if (directory == this.directory) {
            return getSearcher();
        }
        try {
            if (externalReader != null && directory != externalDirectory) {
                externalReader.close();
                externalReader = null;
            }
            externalReader = refresh(externalReader, directory);
            externalDirectory = directory;
            return new IndexSearcher(externalReader);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Opens a reader on {@code dir}, or swaps {@code current} for a newer one if the index changed. */
    private static IndexReader refresh(IndexReader current, Directory dir)
            throws java.io.IOException {
        if (current == null) {
            return IndexReader.open(dir);
        }
        IndexReader newer = IndexReader.openIfChanged(current);
        if (newer == null) {
            return current;
        }
        current.close();
        return newer;
    }

    /**
     * Exact-match search on a single term.
     *
     * @param field field to search
     * @param name  exact term value to match
     * @param num   maximum number of hits to print
     */
    public void seachByTerm(String field, String name, int num) {
        runAndPrint(new TermQuery(new Term(field, name)), num);
    }

    /**
     * Searches for terms lying between {@code start} and {@code end} (both inclusive)
     * in term (string) order.
     *
     * @param field field to search
     * @param start lower bound
     * @param end   upper bound
     * @param num   maximum number of hits to print
     */
    public void searchByTermRange(String field, String start, String end,
            int num) {
        // The two booleans make the lower and upper bound inclusive.
        runAndPrint(new TermRangeQuery(field, start, end, true, true), num);
    }

    /**
     * Searches an integer field for values in {@code [start, end]}.
     *
     * @param field numeric field to search (e.g. {@code attach})
     * @param start lower bound, inclusive
     * @param end   upper bound, inclusive
     * @param num   maximum number of hits to print
     */
    public void searchByNumricRange(String field, int start, int end, int num) {
        runAndPrint(NumericRangeQuery.newIntRange(field, start, end, true, true), num);
    }

    /**
     * Searches for terms starting with the given prefix.
     *
     * @param field field to search
     * @param value prefix the term must start with
     * @param num   maximum number of hits to print
     */
    public void searchByPrefix(String field, String value, int num) {
        runAndPrint(new PrefixQuery(new Term(field, value)), num);
    }

    /**
     * Wildcard search: {@code *} matches any run of characters, {@code ?} exactly one.
     *
     * @param field field to search
     * @param value wildcard pattern
     * @param num   maximum number of hits to print
     */
    public void searchByWildcard(String field, String value, int num) {
        runAndPrint(new WildcardQuery(new Term(field, value)), num);
    }

    /**
     * Boolean search combining two mandatory clauses: {@code name} equals
     * {@code zhangsan} and {@code content} contains {@code like}.
     *
     * @param num maximum number of hits to print
     */
    public void searchByBoolean(int num) {
        BooleanQuery query = new BooleanQuery();
        // Occur.MUST: clause is required; MUST_NOT: clause must not match;
        // SHOULD: clause is optional.
        query.add(new TermQuery(new Term("name", "zhangsan")), Occur.MUST);
        query.add(new TermQuery(new Term("content", "like")), Occur.MUST);
        runAndPrint(query, num);
    }

    /**
     * Phrase search for {@code i ... football} in {@code content} with a slop of 1.
     *
     * @param num maximum number of hits to print
     */
    public void searchByPhrase(int num) {
        PhraseQuery query = new PhraseQuery();
        query.setSlop(1);
        // Terms are given in lower case, matching what the analyzer indexed.
        query.add(new Term("content", "i"));
        query.add(new Term("content", "football"));
        runAndPrint(query, num);
    }

    /**
     * Fuzzy search on {@code name} for terms similar to {@code mike}; a near miss such
     * as {@code make} would also find the record.
     *
     * @param num maximum number of hits to print
     */
    public void searchByFuzzy(int num) {
        runAndPrint(new FuzzyQuery(new Term("name", "mike")), num);
    }

    /**
     * Runs an already-built query (typically produced by a {@code QueryParser}).
     *
     * @param query query to run against the sample index
     * @param num   maximum number of hits to print
     */
    public void searchByQueryParse(Query query, int num) {
        runAndPrint(query, num);
    }

    /** Runs {@code query} on the sample index, prints the hits and always closes the searcher. */
    private void runAndPrint(Query query, int num) {
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            // getSearcher() has already reported why the reader could not be opened.
            return;
        }
        try {
            printHits(searcher, searcher.search(query, num));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(searcher);
        }
    }

    /** Prints the total hit count followed by one line per returned sample-index document. */
    private void printHits(IndexSearcher searcher, TopDocs tds)
            throws java.io.IOException {
        System.out.println("一共查询了:" + tds.totalHits);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            // NOTE: per the Lucene 3.x Document.getBoost() documentation the index-time
            // boost is not restored on documents loaded from the index, so this shows
            // the default rather than the value set in index().
            System.out.println("(文档号:" + sd.doc
                    + "权值:" + doc.getBoost()
                    + "评分:" + sd.score
                    + ")姓名:" + doc.get("name")
                    + "[邮箱:" + doc.get("email")
                    + "]编号:" + doc.get("id")
                    + "附件数:" + doc.get("attach")
                    + ",时间:"
                    + formateNumToDateString(Long.parseLong(doc.get("date"))));
        }
    }

    /** Closes {@code searcher} if non-null, reporting rather than propagating failures. */
    private static void closeQuietly(IndexSearcher searcher) {
        if (searcher == null) {
            return;
        }
        try {
            searcher.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Formats a millisecond timestamp as {@code yyyy-MM-dd} in the default time zone.
     *
     * @param num milliseconds since the epoch; must not be {@code null}
     * @return the formatted date
     */
    public String formateNumToDateString(Long num) {
        return new SimpleDateFormat("yyyy-MM-dd").format(new Date(num));
    }

    /**
     * Rebuilds the in-memory sample index: clears it and adds one document per sample
     * record with the fields {@code id}, {@code email}, {@code content} (analyzed, not
     * stored), {@code name}, {@code attach} and {@code date}.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new Field("id", ids[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email", emails[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", contents[i], Field.Store.NO,
                        Field.Index.ANALYZED));
                doc.add(new Field("name", names[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new NumericField("attach", Field.Store.YES, true)
                        .setIntValue(attachs[i]));
                doc.add(new NumericField("date", Field.Store.YES, true)
                        .setLongValue(dates[i].getTime()));
                // Boost by e-mail domain (the part after the last '@').
                String domain = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                if (scores.containsKey(domain)) {
                    doc.setBoost(scores.get(domain));
                } else {
                    doc.setBoost(0.5f);
                }
                writer.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Fills {@link #dates} with the fixed sample dates, one per sample record. */
    public void setDates() {
        String[] raw = { "2010-02-19", "2012-01-11", "2010-09-19",
                "2010-12-22", "2012-01-01", "2011-05-19" };
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        dates = new Date[ids.length];
        try {
            for (int i = 0; i < raw.length; i++) {
                dates[i] = sdf.parse(raw[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the file index and prints one page of hits.
     *
     * <p>Only as many hits as are needed to reach the end of the requested page are
     * collected. A page that lies partly or completely beyond the last hit prints the
     * hits that exist (possibly none) instead of failing.
     *
     * @param query     query string, parsed against the {@code content} field
     * @param pageIndex 1-based page number
     * @param pageSize  number of hits per page
     */
    public void searchPage(String query, int pageIndex, int pageSize) {
        if (pageIndex < 1 || pageSize < 1) {
            System.err.println("searchPage: pageIndex and pageSize must be >= 1");
            return;
        }
        IndexSearcher searcher = null;
        try {
            Directory dir = FileIndexUtil.getDirectory();
            searcher = getSearcher(dir);
            if (searcher == null) {
                return;
            }
            QueryParser parser = new QueryParser(Version.LUCENE_35, "content",
                    new StandardAnalyzer(Version.LUCENE_35));
            Query q = parser.parse(query);
            // long arithmetic so that large page numbers cannot overflow.
            long start = (long) (pageIndex - 1) * pageSize;
            long end = start + pageSize;
            int wanted = (int) Math.min(end, (long) Integer.MAX_VALUE);
            ScoreDoc[] sds = searcher.search(q, wanted).scoreDocs;
            // Never read past the hits actually returned.
            long stop = Math.min(end, (long) sds.length);
            for (long i = start; i < stop; i++) {
                printFileHit(searcher, sds[(int) i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(searcher);
        }
    }

    /**
     * Searches the file index without paging and prints up to the first 100 hits; meant
     * for comparing against the output of
     * {@link #searchPage(String, int, int)}.
     *
     * @param query query string, parsed against the {@code content} field
     */
    public void searchNoPage(String query) {
        IndexSearcher searcher = null;
        try {
            Directory dir = FileIndexUtil.getDirectory();
            searcher = getSearcher(dir);
            if (searcher == null) {
                return;
            }
            QueryParser parser = new QueryParser(Version.LUCENE_35, "content",
                    new StandardAnalyzer(Version.LUCENE_35));
            Query q = parser.parse(query);
            for (ScoreDoc sd : searcher.search(q, 100).scoreDocs) {
                printFileHit(searcher, sd);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(searcher);
        }
    }

    /** Prints document number, path and file name of one file-index hit. */
    private void printFileHit(IndexSearcher searcher, ScoreDoc sd)
            throws java.io.IOException {
        Document doc = searcher.doc(sd.doc);
        System.out.println(sd.doc + ":" + doc.get("path") + "-->"
                + doc.get("filename"));
    }
}
TestSeach.java
package org.itat.test;
import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.itat.index.FileIndexUtil;
import org.itat.index.SearcherUtil;
import org.junit.Before;
import org.junit.Test;
/**
 * JUnit 4 driver that exercises each {@link SearcherUtil} search method and the
 * {@link FileIndexUtil} indexer. The methods print results for manual inspection and do
 * not assert anything.
 */
public class TestSeach {
    private SearcherUtil su;

    /** Builds a fresh in-memory sample index before every test. */
    @Before
    public void init() {
        su = new SearcherUtil();
    }

    /**
     * Duplicates every regular file in {@code d:/lucene/example/} under the same base
     * name with the extension {@code .hhh}, to enlarge the set of files to index.
     */
    @Test
    public void testCopyFiles() {
        try {
            File file = new File("d:/lucene/example/");
            // listFiles() returns null when the directory is missing.
            File[] files = file.listFiles();
            if (files == null) {
                System.err.println("testCopyFiles: cannot list " + file.getAbsolutePath());
                return;
            }
            for (File f : files) {
                if (!f.isFile()) {
                    // copyFile rejects directories.
                    continue;
                }
                String desFileName = FilenameUtils.getFullPath(f
                        .getAbsolutePath())
                        + FilenameUtils.getBaseName(f.getName()) + ".hhh";
                File target = new File(desFileName);
                if (target.getAbsoluteFile().equals(f.getAbsoluteFile())) {
                    // Already a .hhh copy; copying a file onto itself would fail
                    // and abort the remaining copies.
                    continue;
                }
                FileUtils.copyFile(f, target);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Test
    public void searchByTerm() {
        su.seachByTerm("name", "mike", 3);
    }

    @Test
    public void seachByTermRange() {
        // su.searchByTermRange("id", "1", "3", 10);
        su.searchByTermRange("name", "a", "s", 10);
        // attach is indexed as a numeric field, so a TermRangeQuery finds nothing:
        // su.searchByTermRange("attach", "2", "10", 5);
    }

    @Test
    public void searchByNumricRange() {
        su.searchByNumricRange("attach", 2, 10, 5);
    }

    @Test
    public void searchByPrefix() {
        su.searchByPrefix("name", "j", 10);
        // su.searchByPrefix("content", "s", 10);
    }

    @Test
    public void searchByWildcard() {
        // '*' matches any number of characters, '?' exactly one.
        // su.searchByWildcard("name", "j*", 10);
        // Everything ending in @itat.org:
        // su.searchByWildcard("email", "*@itat.org", 10);
        // Names starting with j followed by exactly three more characters.
        su.searchByWildcard("name", "j???", 10);
    }

    @Test
    public void searchByBoolean() {
        su.searchByBoolean(10);
    }

    @Test
    public void searchByPhrase() {
        su.searchByPhrase(10);
    }

    @Test
    public void searchByFuzzy() {
        su.searchByFuzzy(10);
    }

    /**
     * Walks through the query-parser syntax. Every statement re-assigns {@code query};
     * only the last parsed query is actually executed, so move the one of interest to
     * the end to try it.
     */
    @Test
    public void searchByQueryParse() throws ParseException {
        // Create the parser; the default search field is content.
        QueryParser parser = new QueryParser(Version.LUCENE_35, "content",
                new StandardAnalyzer(Version.LUCENE_35));
        // Change the operator implied by a space between terms:
        // parser.setDefaultOperator(Operator.AND);
        // content contains "like"
        Query query = parser.parse("like");
        // contains both I and football
        // query = parser.parse("I AND football");
        // contains basketball or football (a space means OR by default)
        query = parser.parse("basketball football");
        // switch the field: name equals mike
        query = parser.parse("name:mike");
        // name starts with j; * and ? wildcards work here as well
        query = parser.parse("name:j*");
        // e-mail ending in itat.org; a leading wildcard is disabled by default
        // because it is slow, so it has to be enabled explicitly
        parser.setAllowLeadingWildcard(true);
        query = parser.parse("email:*itat.org");
        // name must not be mike, content must contain football
        query = parser.parse("-name:mike +football ");
        // id between 1 and 3, bounds included; TO must be upper case
        query = parser.parse("id:[1 TO 3]");
        // braces exclude the bounds (open interval), so only 2 matches
        query = parser.parse("id:{1 TO 3}");
        // exact phrase "I like football"
        query = parser.parse("\"I like football\"");
        // I and football at most one word apart; the slop number must be given
        query = parser.parse("\"I football\"~1");
        // fuzzy query
        query = parser.parse("name:make~");
        // numeric ranges cannot be matched this way (needs a customized parser)
        query = parser.parse("attach:[2 TO 10]");
        su.searchByQueryParse(query, 10);
    }

    @Test
    public void indexFile() {
        FileIndexUtil.index(true);
    }

    /** Prints page 2 (20 hits per page) and then the unpaged result for comparison. */
    @Test
    public void testSearchPage01() {
        su.searchPage("java", 2, 20);
        System.out.println("--------------------------------------------------------");
        su.searchNoPage("java");
    }
}
需要用到的jar(代码基于 Lucene 3.5 的 API 和 JUnit 4 的注解):
lucene.jar(lucene-core 3.5.x)
junit.jar(JUnit 4.x)
commons-io.jar