分享下自己的lucene工具类,不足之处欢迎吐槽!
0.2版
package com.jiuxing.qa.util.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.knife.Paoding;
import net.paoding.analysis.knife.PaodingMaker;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import com.jiuxing.qa.util.PropertyUtil;
/**
* Lucene utility class providing index creation and search; written against Lucene version 3.6.1
*
* @author jiaojun [junjiao.j@gmail.com]
* @version v0.0.2
* @param <T>
* @date 2012-08-20
*/
public class LuceneUtil<T> {
// Shared commons-logging logger for this utility.
private static Log log = LogFactory.getLog(LuceneUtil.class);
/**
 * Maximum segment count passed to IndexWriter.forceMerge(int) after index
 * changes (see createIndex and deleteIndex).
 */
private static final int DEFAULT_MAX_NUM_SEGMENTS = 3;
/**
 * How long, in milliseconds, a superseded IndexReader stays in stalereaderPool
 * before destroyIndexReader closes it.
 */
private static final long STALE_INDEXREADER_SURVIVAL_TIME = 60000;
// Open IndexWriter instances, keyed by pool directory name (poolDir).
private static Map<String, IndexWriter> writerPool = new HashMap<String, IndexWriter>();
// Open IndexReader instances, keyed by pool directory name (poolDir).
private static Map<String, IndexReader> readerPool = new HashMap<String, IndexReader>();
/**
 * Readers that were replaced by a reopened reader, keyed by the
 * System.currentTimeMillis() value at which they were replaced
 * (see refreshIndexReader).
 */
private static Map<Long, IndexReader> stalereaderPool = new HashMap<Long, IndexReader>();
// Lazily created singleton returned by getInstance().
private static LuceneUtil util = null;
/** Private constructor: the only instance is the one created by getInstance(). */
private LuceneUtil() {
}
/**
 * Returns the shared instance, creating it on first use.
 * The method is synchronized, so concurrent first calls create only one instance.
 *
 * @return the singleton LuceneUtil
 */
public synchronized static LuceneUtil getInstance() {
    if (util != null) {
        return util;
    }
    util = new LuceneUtil();
    return util;
}
// Runs init() once when the class is loaded, so the pools are populated before first use.
static {
init();
}
/**
 * Initializes the index pools.
 * <p>
 * Reads {@code lucene.index.dir} and the comma-separated {@code lucene.index.pool}
 * from {@code lucene.properties}. For every pool name it opens one IndexWriter
 * and one IndexReader and registers them in {@code writerPool} / {@code readerPool}
 * under that name.
 * <p>
 * The static initializer of this class already calls this method, so an explicit
 * call usually runs it a second time. Pools that already hold a writer or reader
 * are therefore skipped: createIndexWriter force-unlocks the directory, and opening
 * a second writer on a directory that a live writer still owns would put two
 * writers on the same index. Call destroy() first to force a full re-open.
 * <p>
 * Failures for one pool are logged (with stack trace) and do not stop the others.
 */
public static void init() {
    log.info("索引池初始化开始");
    String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.dir");
    String pool = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.pool");
    // A missing property used to surface as a NullPointerException inside the
    // static initializer, which makes the whole class unusable.
    if (indexDir == null || pool == null) {
        log.error("索引池初始化失败,原因:lucene.properties 中缺少 lucene.index.dir 或 lucene.index.pool 配置");
        return;
    }
    for (String poolDir : pool.split(",")) {
        synchronized (writerPool) {
            if (!writerPool.containsKey(poolDir)) {
                try {
                    IndexWriter iw = createIndexWriter(indexDir + poolDir);
                    if (iw != null) {
                        writerPool.put(poolDir, iw);
                    }
                } catch (IOException e) {
                    log.error("writerPool初始化失败,原因:" + e.getMessage(), e);
                }
            }
        }
        synchronized (readerPool) {
            if (!readerPool.containsKey(poolDir)) {
                try {
                    IndexReader ir = IndexReader.open(FSDirectory
                            .open(getIndexFile(indexDir + poolDir)), false);
                    if (ir != null) {
                        readerPool.put(poolDir, ir);
                    }
                } catch (Exception e) {
                    log.error("readerPool初始化失败,原因:" + e.getMessage(), e);
                }
            }
        }
    }
    log.info("索引池初始化完成");
}
/**
 * Initializes only the writer pool.
 * <p>
 * Reads {@code lucene.index.dir} and the comma-separated {@code lucene.index.pool}
 * from {@code lucene.properties} and opens one IndexWriter per pool name.
 * Pools that already hold a writer are skipped, because createIndexWriter
 * force-unlocks the directory and a second writer on a directory owned by a live
 * writer must be avoided. Call destroyIndexWriter() first to force a re-open.
 * Failures for one pool are logged (with stack trace) and do not stop the others.
 */
public static void initIndexWriter() {
    log.info("【创建索引池】初始化开始");
    String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.dir");
    String pool = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.pool");
    // Guard against missing configuration instead of failing with a NullPointerException.
    if (indexDir == null || pool == null) {
        log.error("【创建索引池】初始化失败,原因:lucene.properties 中缺少 lucene.index.dir 或 lucene.index.pool 配置");
        return;
    }
    for (String poolDir : pool.split(",")) {
        synchronized (writerPool) {
            if (writerPool.containsKey(poolDir)) {
                continue;
            }
            try {
                IndexWriter iw = createIndexWriter(indexDir + poolDir);
                if (iw != null) {
                    writerPool.put(poolDir, iw);
                }
            } catch (IOException e) {
                log.error("writerPool初始化失败,原因:" + e.getMessage(), e);
            }
        }
    }
    log.info("【创建索引池】初始化完成");
}
/**
 * Indexes the given objects; recommended to be run on a schedule.
 * Delegates to the six-argument overload with {@code isDel = false}.
 *
 * @param <T>      type of the indexed objects
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @param list     objects to index
 * @param clz      class the objects are instances of
 * @param fields   names of the properties to index (lower-case first letter)
 * @throws IOException
 * @throws NoSuchMethodException
 * @throws SecurityException
 * @throws InvocationTargetException
 * @throws IllegalAccessException
 * @throws IllegalArgumentException
 */
public static <T> void createIndex(String indexDir, String poolDir,
List<?> list, Class<T> clz, String[] fields) throws IOException,
SecurityException, NoSuchMethodException, IllegalArgumentException,
IllegalAccessException, InvocationTargetException {
createIndex(indexDir,poolDir,list,clz, fields,false);
}
/**
 * Indexes the given objects; recommended to be run on a schedule.
 * <p>
 * For every object in {@code list} a document is built from the properties named
 * in {@code fields}: each value is read through its public {@code getXxx()}
 * getter and stored as an analyzed, stored field. Null and empty values are
 * left out, and {@code java.util.Date} properties are formatted as
 * {@code yyyy-MM-dd HH:mm:ss} (24-hour clock).
 * <ul>
 * <li>{@code isDel == false}: the document replaces the existing one whose
 * {@code fields[0]} term equals the new value. Objects without a
 * {@code fields[0]} value are skipped.</li>
 * <li>{@code isDel == true}: the whole index is deleted first and every
 * non-empty document is added. Previously this mode added nothing.</li>
 * </ul>
 * After processing, {@code list} is cleared, the index is merged down to
 * {@code DEFAULT_MAX_NUM_SEGMENTS} segments and committed.
 * <p>
 * NOTE(review): the key field is indexed ANALYZED, so the exact-value Term used
 * for the update may not match the tokens the analyzer produced; verify that
 * updates really replace the old document for multi-token keys.
 *
 * @param <T>      type of the indexed objects
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @param list     objects to index; cleared when it was non-empty
 * @param clz      class declaring the indexed properties
 * @param fields   names of the properties to index; {@code fields[0]} is the update key
 * @param isDel    whether to delete the existing index and rebuild it from {@code list}
 * @throws IOException               if the index cannot be written
 * @throws NoSuchMethodException     if a requested property has no public getter
 * @throws SecurityException         if reflection is not permitted
 * @throws InvocationTargetException if a getter throws
 * @throws IllegalAccessException    if a getter is not accessible
 * @throws IllegalArgumentException  if {@code clz} or {@code fields} is null or {@code fields} is empty
 */
public static <T> void createIndex(String indexDir, String poolDir,
        List<?> list, Class<T> clz, String[] fields, boolean isDel) throws IOException,
        SecurityException, NoSuchMethodException, IllegalArgumentException,
        IllegalAccessException, InvocationTargetException {
    if (clz == null || fields == null || fields.length == 0) {
        throw new IllegalArgumentException("clz and fields must not be null or empty");
    }
    // Arrays.toString prints the names; fields.toString() only printed the array identity.
    log.info("索引开始创建,服务于 " + clz + " | " + Arrays.toString(fields));
    long start = System.currentTimeMillis();
    IndexWriter writer = getIndexWriter(indexDir, poolDir);
    if (null == writer) {
        log.error("IndexWriter获取失败");
        return;
    }
    if (isDel) {
        writer.deleteAll();
    }
    if (null != list && list.size() > 0) {
        // HH = 24-hour clock; the former hh pattern stored 13:00 and 01:00 identically.
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss");
        List<String> indexedNames = Arrays.asList(fields);
        java.lang.reflect.Field[] declaredFields = clz.getDeclaredFields();
        for (Object item : list) {
            Document doc = new Document();
            for (java.lang.reflect.Field cf : declaredFields) {
                String fieldName = cf.getName();
                // Only reflect on requested properties, so unrelated fields without a
                // getter (e.g. serialVersionUID) no longer abort the whole run.
                if (!indexedNames.contains(fieldName)) {
                    continue;
                }
                String getName = "get" + fieldName.substring(0, 1).toUpperCase()
                        + fieldName.substring(1);
                Method getMethod = clz.getMethod(getName);
                Object value = getMethod.invoke(item);
                if (value == null || "".equals(value.toString())) {
                    continue;
                }
                String text = cf.getType() == Date.class
                        ? simpleDateFormat.format(value)
                        : value.toString();
                doc.add(new Field(fieldName, text, Field.Store.YES,
                        Field.Index.ANALYZED));
            }
            if (isDel) {
                // Rebuild mode: the index was emptied above, so plain adds are enough.
                if (!doc.getFields().isEmpty()) {
                    writer.addDocument(doc);
                }
            } else {
                // Update mode: replace the document carrying the same fields[0] term.
                String key = doc.get(fields[0]);
                if (null != key) {
                    writer.updateDocument(new Term(fields[0], key), doc);
                }
            }
        }
        log.info("索引创建完成,保存目录:" + indexDir + poolDir + ",索引创建/记录:"
                + writer.maxDoc() + "/" + list.size() + "条,花费时间:"
                + (System.currentTimeMillis() - start) / 1000 + "秒!" + writer);
        list.clear();
    }
    writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS);
    writer.commit();
}
/**
 * Paged index search with highlighting enabled.
 * Delegates to the seven-argument overload with {@code isHighlighter = true}.
 *
 * @param indexDir    root location of the indexes
 * @param poolDir     name of the index pool below the root
 * @param keyWords    keywords to search for
 * @param fields      fields to search
 * @param pageSize    number of records per page
 * @param currentPage current page number
 * @return SearchResult holding the query results
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
public static SearchResult searchPage(String indexDir, String poolDir,
String[] keyWords, String[] fields, int pageSize, int currentPage)
throws IOException, InvalidTokenOffsetsException {
return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize,
currentPage);
}
/**
 * Paged index search with optional highlighting.
 * Delegates to the eight-argument overload with {@code isPage = true}.
 *
 * @param indexDir      root location of the indexes
 * @param poolDir       name of the index pool below the root
 * @param keyWords      keywords to search for
 * @param fields        fields to search
 * @param isHighlighter whether matches in {@code fields[0]} are highlighted
 * @param pageSize      number of records per page
 * @param currentPage   current page number
 * @return SearchResult holding the query results
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
public static SearchResult searchPage(String indexDir, String poolDir,
        String[] keyWords, String[] fields, boolean isHighlighter,
        int pageSize, int currentPage) throws IOException,
        InvalidTokenOffsetsException {
    // Forward the caller's flag; it used to be replaced by a hard-coded true,
    // so highlighting could never be switched off through this overload.
    return searchPage(indexDir, poolDir, keyWords, fields, isHighlighter,
            pageSize, currentPage, true);
}
/**
 * Paged index search.
 * <p>
 * Every keyword is first stripped of special characters with stringFilter.
 * The query is parsed with MultiFieldQueryParser (keyword i is searched in field i).
 * Results are sorted by score and then by {@code fields[0]} in reverse string order.
 * If the first keyword is shorter than two characters a StandardAnalyzer is used
 * instead of the default analyzer.
 * <p>
 * Invalid input no longer raises runtime exceptions: null or empty
 * {@code keyWords} / {@code fields} yield an empty result, an empty index yields a
 * result with zero documents, and a {@code currentPage} below 1 is treated as
 * page 1. I/O and query-parse errors are logged and the result built so far is
 * returned.
 * <p>
 * NOTE(review): sorting on {@code fields[0]} expects an un-tokenized field, while
 * createIndex indexes every field ANALYZED; confirm the secondary sort behaves
 * as intended.
 *
 * @param indexDir      root location of the indexes
 * @param poolDir       name of the index pool below the root
 * @param keyWords      keywords, one per entry of {@code fields}
 * @param fields        fields to search; {@code fields[0]} is sorted on and highlighted
 * @param isHighlighter whether matches in {@code fields[0]} are wrapped in highlight markup
 * @param pageSize      number of records per page
 * @param currentPage   current page number, starting at 1
 * @param isPage        {@code true} to collect all hits and page through them;
 *                      {@code false} to collect at most {@code pageSize} hits
 * @return the search result, or {@code null} when no IndexReader is available
 * @throws IOException                  declared for compatibility; I/O errors are logged
 * @throws InvalidTokenOffsetsException declared for compatibility
 */
public static SearchResult searchPage(String indexDir, String poolDir,
        String[] keyWords, String[] fields, boolean isHighlighter,
        int pageSize, int currentPage, boolean isPage) throws IOException,
        InvalidTokenOffsetsException {
    SearchResult searchResult = new SearchResult();
    if (keyWords == null || keyWords.length == 0
            || fields == null || fields.length == 0) {
        log.error("查询关键词或查询属性为空,请检查!");
        return searchResult;
    }
    // Strip special characters from the keywords.
    String[] filtered = new String[keyWords.length];
    for (int i = 0; i < keyWords.length; i++) {
        filtered[i] = keyWords[i] == null ? "" : stringFilter(keyWords[i]);
    }
    try {
        IndexReader reader = getIndexReader(indexDir, poolDir);
        if (null != reader) {
            reader = refreshIndexReader(poolDir, reader);
        }
        if (null == reader) {
            log.error("索引文件为空,请检查!");
            return null;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setDefaultFieldSortScoring(true, false);
        Analyzer analyzer = getAnalyzer();
        if (filtered[0].length() < 2) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        }
        Query query = MultiFieldQueryParser.parse(Version.LUCENE_36,
                filtered, fields, analyzer);
        // Primary order: relevance score; secondary: fields[0] as string, reversed.
        Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE,
                new SortField(fields[0], SortField.STRING, true) });
        int maxDoc = searcher.maxDoc();
        int limit = isPage ? maxDoc : Math.min(pageSize, maxDoc);
        List<Document> documents = new ArrayList<Document>();
        int totalCount = 0;
        // Lucene rejects a request for zero hits, so skip the search for an
        // empty index or a non-positive limit.
        if (limit > 0) {
            TopDocs topDocs = searcher.search(query, limit, sort);
            ScoreDoc[] hits = topDocs.scoreDocs;
            totalCount = hits.length;
            // Clamp to the first page so a page number below 1 cannot index hits[-n].
            int begin = Math.max(0, pageSize * (currentPage - 1));
            int end = Math.min(begin + pageSize, hits.length);
            for (int i = begin; i < end; i++) {
                Document document = searcher.doc(hits[i].doc);
                if (isHighlighter) {
                    Field target = document.getField(fields[0]);
                    // Documents without fields[0] have nothing to highlight.
                    if (target != null) {
                        String highlighted = toHighlighter(query, document,
                                fields[0], analyzer);
                        if (highlighted != null) {
                            target.setValue(highlighted);
                        }
                    }
                }
                documents.add(document);
            }
        }
        searchResult.setPageSize(pageSize);
        searchResult.setCurrentPage(currentPage);
        searchResult.setDocuments(documents);
        searchResult.setTotalCount(totalCount);
    } catch (IOException e) {
        // CorruptIndexException is an IOException and is handled here as well.
        log.error("索引查询失败,原因:" + e.getMessage(), e);
    } catch (ParseException e) {
        log.error("查询语句解析失败,原因:" + e.getMessage(), e);
    }
    return searchResult;
}
/**
 * Deletes every document containing the given term, then merges the index down
 * to {@code DEFAULT_MAX_NUM_SEGMENTS} segments and commits.
 * <p>
 * This method does not throw: a missing writer, null arguments and I/O errors
 * are logged (with stack trace) and the method returns.
 *
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @param field    name of the field the term belongs to
 * @param keyWord  term text; documents containing this exact term are deleted
 */
public static void deleteIndex(String indexDir, String poolDir, String field, String keyWord) {
    if (field == null || keyWord == null) {
        log.error("索引删除失败,原因:field 或 keyWord 为空");
        return;
    }
    try {
        IndexWriter writer = getIndexWriter(indexDir, poolDir);
        if (null == writer) {
            log.error("IndexWriter获取失败");
            return;
        }
        // Term = (field name, field value).
        writer.deleteDocuments(new Term(field, keyWord));
        writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS);
        writer.commit();
    } catch (IOException e) {
        // Covers CorruptIndexException and LockObtainFailedException as well.
        log.error("索引删除失败,原因:" + e.getMessage(), e);
    }
}
/**
 * Releases all pooled index resources: commits and closes every pooled
 * IndexWriter, closes every pooled IndexReader, and empties both pools.
 * <p>
 * Commit and close are attempted independently, so a failing commit no longer
 * leaves the writer (and its directory lock) open. Failures are logged with
 * their stack trace and do not stop the remaining resources from being released.
 */
public static void destroy() {
    synchronized (writerPool) {
        for (IndexWriter indexWriter : writerPool.values()) {
            try {
                indexWriter.commit();
            } catch (Exception e) {
                log.error("writerPool销毁失败,原因:" + e.getMessage(), e);
            }
            try {
                indexWriter.close();
            } catch (Exception e) {
                log.error("writerPool销毁失败,原因:" + e.getMessage(), e);
            }
        }
        writerPool.clear();
    }
    synchronized (readerPool) {
        for (IndexReader indexReader : readerPool.values()) {
            try {
                indexReader.close();
            } catch (Exception e) {
                log.error("readerPool销毁失败,原因:" + e.getMessage(), e);
            }
        }
        readerPool.clear();
    }
}
/**
 * Closes every pooled IndexWriter and empties the writer pool.
 * A writer that fails to close is logged and skipped.
 */
public static void destroyIndexWriter() {
    synchronized (writerPool) {
        for (IndexWriter pooledWriter : writerPool.values()) {
            try {
                pooledWriter.close();
            } catch (Exception e) {
                log.error("writerPool销毁失败,原因:" + e.getMessage());
            }
        }
        writerPool.clear();
    }
    log.info("【创建索引池】完成销毁");
}
/**
 * Closes and removes stale readers whose survival time has elapsed.
 * <p>
 * An entry is expired when its key (a millisecond timestamp) is at least
 * {@code STALE_INDEXREADER_SURVIVAL_TIME} older than the current time. Only
 * expired entries are closed and removed. Younger entries stay in the map so
 * they can be closed on a later call; previously the whole map was cleared,
 * which dropped unexpired readers without ever closing them.
 *
 * @param readerPool map from retirement timestamp to reader; a null map is ignored
 */
public static void destroyIndexReader(Map<Long, IndexReader> readerPool) {
    if (readerPool == null) {
        return;
    }
    synchronized (readerPool) {
        long now = System.currentTimeMillis();
        Iterator<Entry<Long, IndexReader>> iterator = readerPool.entrySet()
                .iterator();
        while (iterator.hasNext()) {
            Entry<Long, IndexReader> entry = iterator.next();
            if ((now - entry.getKey()) < STALE_INDEXREADER_SURVIVAL_TIME) {
                continue;
            }
            IndexReader indexReader = entry.getValue();
            try {
                indexReader.close();
                log.info("【查询索引池】完成销毁" + indexReader);
            } catch (Exception e) {
                log.error("readerPool销毁失败,原因:" + e.getMessage(), e);
            }
            // Drop the entry even when close() failed, so it is not retried forever.
            iterator.remove();
        }
    }
}
/**
 * Refreshes the reader of a pool so that it sees the latest index data.
 * <p>
 * Expired stale readers are closed first. The reader currently registered for
 * {@code poolDir} is then reopened; the passed-in reader is used only when the
 * pool has no entry. If reopening produced a new instance, the old one is parked
 * in {@code stalereaderPool} (to be closed after its survival time) and the new
 * one replaces it in {@code readerPool}.
 * <p>
 * Reopening the pooled reader rather than the caller's copy matters when two
 * searches fetched the same reader: the second caller would otherwise reopen
 * the already superseded instance and overwrite, and so leak, the reader
 * created by the first.
 *
 * @param poolDir     name of the index pool
 * @param indexReader reader the caller obtained for that pool
 * @return the newest reader registered for {@code poolDir}
 */
private synchronized static IndexReader refreshIndexReader(String poolDir,
        IndexReader indexReader) {
    try {
        destroyIndexReader(stalereaderPool);
        IndexReader current;
        synchronized (readerPool) {
            current = readerPool.get(poolDir);
        }
        if (current == null) {
            current = indexReader;
        }
        IndexReader newIndexReader = current.reopen(false);
        if (newIndexReader != current) {
            synchronized (stalereaderPool) {
                // Two readers retired within the same millisecond must not share a key.
                long retiredAt = System.currentTimeMillis();
                while (stalereaderPool.containsKey(retiredAt)) {
                    retiredAt++;
                }
                stalereaderPool.put(retiredAt, current);
            }
            synchronized (readerPool) {
                readerPool.put(poolDir, newIndexReader);
            }
        }
    } catch (Exception e) {
        log.error("刷新索引失败" + e.getMessage(), e);
    }
    // Return the newest IndexReader.
    synchronized (readerPool) {
        return readerPool.get(poolDir);
    }
}
/**
 * Character class of the ASCII and full-width punctuation, spaces, tabs and
 * line breaks removed by {@link #stringFilter(String)}. Compiled once instead
 * of on every call.
 */
private static final Pattern SPECIAL_CHARS = Pattern.compile(
        "[`~!@#$%^&*()+=|{ }':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?·\'\"\\-\t\n\r]");

/**
 * Removes special characters (punctuation and whitespace, including spaces
 * inside the text) from a string.
 *
 * @param str text to filter; {@code null} is treated as empty
 * @return the text without special characters, never {@code null}
 * @throws PatternSyntaxException declared for compatibility; the pattern is fixed
 */
public static String stringFilter(String str) throws PatternSyntaxException {
    if (str == null) {
        return "";
    }
    Matcher m = SPECIAL_CHARS.matcher(str);
    return m.replaceAll("").trim();
}
/**
 * Wraps the query matches inside a stored field value in red font tags.
 * <p>
 * The fragment size is set larger than the text, so the whole value is
 * returned as a single fragment.
 *
 * @param query    query whose terms are highlighted
 * @param doc      document holding the field
 * @param field    name of the field to highlight
 * @param analyzer analyzer used to tokenize the stored text
 * @return the highlighted text; the unchanged text when nothing matched or
 *         highlighting failed; {@code null} only when the document has no such field
 */
private static String toHighlighter(Query query, Document doc,
        String field, Analyzer analyzer) {
    String text = doc.get(field);
    if (text == null) {
        // Nothing stored for this field; previously this raised a NullPointerException.
        return null;
    }
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
                "<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(text.length() + 100));
        TokenStream tokenStream = analyzer.tokenStream(field,
                new StringReader(text));
        String highlighterStr = highlighter.getBestFragment(tokenStream, text);
        return highlighterStr == null ? text : highlighterStr;
    } catch (IOException e) {
        log.error(e.getMessage(), e);
    } catch (InvalidTokenOffsetsException e) {
        log.error(e.getMessage(), e);
    }
    // Fall back to the plain text so callers never overwrite the field with null.
    return text;
}
/**
 * Returns the pooled IndexWriter for a pool, creating and registering it on
 * first use.
 * <p>
 * Lookup and creation both happen under the {@code writerPool} lock.
 * Previously the pool was read before taking the lock and not re-read inside
 * it, so a caller that lost the race against a concurrent creator got
 * {@code null} back.
 * <p>
 * If creation fails, a leftover write lock on the directory is released, the
 * error is logged and destroy() is called, which closes ALL pooled writers and
 * readers (existing behavior, kept as is). {@code null} is returned in that case.
 *
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @return the writer for the pool, or {@code null} if it could not be created
 * @throws CorruptIndexException     if the lock check finds a corrupt index
 * @throws LockObtainFailedException if the lock cannot be handled
 * @throws IOException               if the directory cannot be opened during recovery
 */
@SuppressWarnings("static-access")
private static IndexWriter getIndexWriter(String indexDir, String poolDir)
        throws CorruptIndexException, LockObtainFailedException,
        IOException {
    synchronized (writerPool) {
        IndexWriter writer = writerPool.get(poolDir);
        if (writer != null) {
            return writer;
        }
        try {
            writer = createIndexWriter(indexDir + poolDir);
            if (writer != null) {
                writerPool.put(poolDir, writer);
            }
        } catch (IOException e) {
            // Open the directory once for both the lock check and the unlock.
            FSDirectory directory = FSDirectory.open(getIndexFile(indexDir + poolDir));
            try {
                if (IndexWriter.isLocked(directory)) {
                    IndexWriter.unlock(directory);
                }
            } finally {
                directory.close();
            }
            log.error(e.getMessage(), e);
            destroy();
        }
        return writer;
    }
}
/**
 * Returns the pooled IndexReader for a pool, opening and registering it on
 * first use.
 * <p>
 * The pool is read inside the {@code readerPool} lock. Previously it was read
 * before taking the lock, so a caller racing with a concurrent opener could
 * receive {@code null} although a reader had just been registered.
 * Open failures are logged with their stack trace.
 *
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @return the reader for the pool, or {@code null} if it could not be opened
 * @throws CorruptIndexException declared for compatibility; failures are logged
 * @throws IOException           declared for compatibility; failures are logged
 */
private static IndexReader getIndexReader(String indexDir, String poolDir)
        throws CorruptIndexException, IOException {
    synchronized (readerPool) {
        IndexReader reader = readerPool.get(poolDir);
        if (reader == null) {
            try {
                reader = IndexReader.open(FSDirectory
                        .open(getIndexFile(indexDir + poolDir)), false);
                if (reader != null) {
                    readerPool.put(poolDir, reader);
                }
            } catch (IOException e) {
                // Covers CorruptIndexException as well.
                log.error("IndexReader获取失败,原因:" + e.getMessage(), e);
            }
        }
        return reader;
    }
}
/**
 * Opens an IndexWriter on the given index directory in CREATE_OR_APPEND mode,
 * using the analyzer from getAnalyzer().
 * <p>
 * An existing write lock on the directory is removed before the writer is
 * opened. The directory is opened once and shared by the lock check, the
 * unlock and the writer; previously three separate FSDirectory instances were
 * opened for the same path.
 *
 * @param dir index directory, relative to the location resolved by getIndexFile
 * @return the newly opened writer
 * @throws CorruptIndexException     if the index is corrupt
 * @throws LockObtainFailedException if the write lock cannot be obtained
 * @throws IOException               on any other low-level I/O error
 */
private static IndexWriter createIndexWriter(String dir)
        throws CorruptIndexException, LockObtainFailedException,
        IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36,
            getAnalyzer());
    // CREATE would overwrite an existing index; CREATE_OR_APPEND keeps it.
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
    FSDirectory directory = FSDirectory.open(getIndexFile(dir));
    try {
        if (IndexWriter.isLocked(directory)) {
            IndexWriter.unlock(directory);
        }
        return new IndexWriter(directory, conf);
    } catch (IOException e) {
        try {
            directory.close();
        } catch (IOException ignored) {
            // Keep the original failure; the close error adds nothing.
        }
        throw e;
    }
}
/**
 * Supplies the analyzer used for indexing and searching: a PaodingAnalyzer
 * in writer mode built from the Paoding instance that PaodingMaker creates.
 * (The original note says writer mode supports both maximum and minimum word
 * segmentation; the Lucene alternative would be
 * {@code new StandardAnalyzer(Version.LUCENE_36)}.)
 *
 * @return the analyzer
 */
private static Analyzer getAnalyzer() {
    return PaodingAnalyzer.writerMode(PaodingMaker.make());
}
/**
 * Resolves an index directory on disk: the directory two levels above the
 * classpath root (presumably the web application root when classes live in
 * WEB-INF/classes; confirm for other layouts) followed by {@code dir}.
 * <p>
 * The classpath root URL is converted through its URI, so escaped characters
 * such as {@code %20} are decoded. The raw URL path is used only as a fallback.
 *
 * @param dir path appended to the resolved base directory
 * @return the index directory
 * @throws IllegalStateException if the classpath root cannot be located
 */
private static File getIndexFile(String dir) {
    java.net.URL classpathRoot = LuceneUtil.class.getResource("/");
    if (classpathRoot == null) {
        throw new IllegalStateException("无法定位classpath根目录,索引目录解析失败:" + dir);
    }
    File classesDir;
    try {
        classesDir = new File(classpathRoot.toURI());
    } catch (java.net.URISyntaxException e) {
        // URL contains characters that are not valid in a URI; use it as is.
        classesDir = new File(classpathRoot.getPath());
    } catch (IllegalArgumentException e) {
        // Not a plain file: URI; use the raw path as before.
        classesDir = new File(classpathRoot.getPath());
    }
    String baseDir = classesDir.getParentFile().getParentFile()
            .getPath().replace('\\', '/');
    return new File(baseDir + dir);
}
/**
 * Manual smoke test: prints the result of stringFilter for a sample sentence.
 * (A commented-out multi-threaded pool experiment that used to follow has been
 * dropped; it was never executed.)
 *
 * @param args unused
 */
public static void main(String[] args) {
    String sample = "[不懂就要问]请问 H6不能插u盘听歌吗 知道的说下 谢谢!";
    String filtered = stringFilter(sample);
    System.out.println(filtered);
}
}
0.1版
package com.junjiao.util.search;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.knife.Paoding;
import net.paoding.analysis.knife.PaodingMaker;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import com.junjiao.util.java.PropertyUtil;
/**
* Lucene utility class providing index creation and search; written against Lucene version 3.6.1
*
* 【索引的检查与修复】
* CheckIndex在lucene-core jar包的org.apache.lucene.index目录下。它的功能是检查索引的的健康情况和修复索引。<br/>
* 如果检查出某些segments有错误, 可以通过-fix参数执行修复操作,修复的过程就是创建一个新的segments,把所有引 <br/>
* 用错误segments的索引数据删除。
*
* cd /var/www/virtualhost/qa.51auto.cn/WEB-INF/lib
* java -cp /var/www/virtualhost/qa.51auto.cn/WEB-INF/lib/lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/all
* 检查
* java -cp lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/all
* 修复
* java -cp lucene-core-3.6.1.jar -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex /var/www/virtualhost/qa.51auto.cn/WEB-INF/index/tag -fix
* @author jiaojun [junjiao.j@gmail.com]
* @version v0.0.1
* @param <T>
* @date 2012-08-20
*/
public class LuceneUtil<T> {
// Shared commons-logging logger for this utility.
private static Log log = LogFactory.getLog(LuceneUtil.class);
/**
 * Maximum segment count passed to IndexWriter.forceMerge(int) after index
 * changes (see createIndex).
 */
private static final int DEFAULT_MAX_NUM_SEGMENTS = 3;
/**
 * Age, in milliseconds, after which an entry of stalereaderPool is picked up
 * by destroyIndexReader.
 */
private static final long STALE_INDEXREADER_SURVIVAL_TIME = 60000;
// Open IndexWriter instances, keyed by pool directory name (poolDir).
private static Map<String, IndexWriter> writerPool = new HashMap<String, IndexWriter>();
// Open IndexReader instances, keyed by pool directory name (poolDir).
private static Map<String, IndexReader> readerPool = new HashMap<String, IndexReader>();
/**
 * Already-instantiated IndexReader objects kept under a Long key;
 * destroyIndexReader treats that key as a millisecond timestamp.
 */
private static Map<Long, IndexReader> stalereaderPool = new HashMap<Long, IndexReader>();
// Lazily created singleton returned by getInstance().
private static LuceneUtil util = null;
/** Private constructor: the only instance is the one created by getInstance(). */
private LuceneUtil() {
}
/**
 * Returns the shared instance, creating it on first use.
 * The method is synchronized, so concurrent first calls create only one instance.
 *
 * @return the singleton LuceneUtil
 */
public synchronized static LuceneUtil getInstance() {
    if (util != null) {
        return util;
    }
    util = new LuceneUtil();
    return util;
}
// Runs init() once when the class is loaded, so the pools are populated before first use.
static {
init();
}
/**
 * Initializes the index pools.
 * <p>
 * Reads {@code lucene.index.dir} and the comma-separated {@code lucene.index.pool}
 * from {@code lucene.properties}. For every pool name it opens one IndexWriter
 * and one IndexReader and registers them in {@code writerPool} / {@code readerPool}
 * under that name.
 * <p>
 * The reader pool is now guarded by its own lock; it used to be modified while
 * holding the {@code writerPool} lock, which does not protect it against the
 * methods that synchronize on {@code readerPool}.
 * <p>
 * The static initializer of this class already calls this method, so an explicit
 * call usually runs it a second time. Pools that already hold a writer or reader
 * are therefore skipped, to avoid opening a second writer on a directory that a
 * live writer still owns. Call destroy() first to force a full re-open.
 * Failures for one pool are logged (with stack trace) and do not stop the others.
 */
public static void init() {
    log.info("索引池初始化开始");
    String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.dir");
    String pool = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.pool");
    // A missing property used to surface as a NullPointerException inside the
    // static initializer, which makes the whole class unusable.
    if (indexDir == null || pool == null) {
        log.error("索引池初始化失败,原因:lucene.properties 中缺少 lucene.index.dir 或 lucene.index.pool 配置");
        return;
    }
    for (String poolDir : pool.split(",")) {
        synchronized (writerPool) {
            if (!writerPool.containsKey(poolDir)) {
                try {
                    IndexWriter iw = createIndexWriter(indexDir + poolDir);
                    if (iw != null) {
                        writerPool.put(poolDir, iw);
                    }
                } catch (IOException e) {
                    log.error("writerPool初始化失败,原因:" + e.getMessage(), e);
                }
            }
        }
        synchronized (readerPool) {
            if (!readerPool.containsKey(poolDir)) {
                try {
                    IndexReader ir = IndexReader.open(FSDirectory
                            .open(getIndexFile(indexDir + poolDir)));
                    if (ir != null) {
                        readerPool.put(poolDir, ir);
                    }
                } catch (Exception e) {
                    log.error("readerPool初始化失败,原因:" + e.getMessage(), e);
                }
            }
        }
    }
    log.info("索引池初始化完成");
}
/**
 * Initializes only the writer pool.
 * <p>
 * Reads {@code lucene.index.dir} and the comma-separated {@code lucene.index.pool}
 * from {@code lucene.properties} and opens one IndexWriter per pool name.
 * Pools that already hold a writer are skipped, so that no second writer is
 * opened on a directory owned by a live writer. Call destroyIndexWriter() first
 * to force a re-open. Failures for one pool are logged (with stack trace) and do
 * not stop the others.
 */
public static void initIndexWriter() {
    log.info("【创建索引池】初始化开始");
    String indexDir = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.dir");
    String pool = PropertyUtil.getPropertiesByKey("lucene.properties",
            "lucene.index.pool");
    // Guard against missing configuration instead of failing with a NullPointerException.
    if (indexDir == null || pool == null) {
        log.error("【创建索引池】初始化失败,原因:lucene.properties 中缺少 lucene.index.dir 或 lucene.index.pool 配置");
        return;
    }
    for (String poolDir : pool.split(",")) {
        synchronized (writerPool) {
            if (writerPool.containsKey(poolDir)) {
                continue;
            }
            try {
                IndexWriter iw = createIndexWriter(indexDir + poolDir);
                if (iw != null) {
                    writerPool.put(poolDir, iw);
                }
            } catch (IOException e) {
                log.error("writerPool初始化失败,原因:" + e.getMessage(), e);
            }
        }
    }
    log.info("【创建索引池】初始化完成");
}
/**
 * Indexes the given objects; recommended to be run on a schedule.
 * Delegates to the six-argument overload with {@code isDel = false}.
 *
 * @param <T>      type of the indexed objects
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @param list     objects to index
 * @param clz      class the objects are instances of
 * @param fields   names of the properties to index (lower-case first letter)
 * @throws IOException
 * @throws NoSuchMethodException
 * @throws SecurityException
 * @throws InvocationTargetException
 * @throws IllegalAccessException
 * @throws IllegalArgumentException
 */
public static <T> void createIndex(String indexDir, String poolDir,
List<?> list, Class<T> clz, String[] fields) throws IOException,
SecurityException, NoSuchMethodException, IllegalArgumentException,
IllegalAccessException, InvocationTargetException {
createIndex(indexDir,poolDir,list,clz, fields,false);
}
/**
 * Indexes the given objects; recommended to be run on a schedule.
 * <p>
 * For every object in {@code list} a document is built from the properties named
 * in {@code fields}: each value is read through its public {@code getXxx()}
 * getter and stored as an analyzed, stored field. Null and empty values are
 * left out, and {@code java.util.Date} properties are formatted as
 * {@code yyyy-MM-dd HH:mm:ss} (24-hour clock).
 * <ul>
 * <li>{@code isDel == false}: the document replaces the existing one whose
 * {@code fields[0]} term equals the new value. Objects without a
 * {@code fields[0]} value are skipped.</li>
 * <li>{@code isDel == true}: the whole index is deleted first and every
 * non-empty document is added. Previously the index was deleted but nothing
 * was added, leaving it empty.</li>
 * </ul>
 * After processing, {@code list} is cleared, the index is merged down to
 * {@code DEFAULT_MAX_NUM_SEGMENTS} segments and committed.
 * <p>
 * NOTE(review): the key field is indexed ANALYZED, so the exact-value Term used
 * for the update may not match the tokens the analyzer produced; verify that
 * updates really replace the old document for multi-token keys.
 *
 * @param <T>      type of the indexed objects
 * @param indexDir root location of the indexes
 * @param poolDir  name of the index pool below the root
 * @param list     objects to index; cleared when it was non-empty
 * @param clz      class declaring the indexed properties
 * @param fields   names of the properties to index; {@code fields[0]} is the update key
 * @param isDel    whether to delete the existing index and rebuild it from {@code list}
 * @throws IOException               if the index cannot be written
 * @throws NoSuchMethodException     if a requested property has no public getter
 * @throws SecurityException         if reflection is not permitted
 * @throws InvocationTargetException if a getter throws
 * @throws IllegalAccessException    if a getter is not accessible
 * @throws IllegalArgumentException  if {@code clz} or {@code fields} is null or {@code fields} is empty
 */
public static <T> void createIndex(String indexDir, String poolDir,
        List<?> list, Class<T> clz, String[] fields, boolean isDel) throws IOException,
        SecurityException, NoSuchMethodException, IllegalArgumentException,
        IllegalAccessException, InvocationTargetException {
    if (clz == null || fields == null || fields.length == 0) {
        throw new IllegalArgumentException("clz and fields must not be null or empty");
    }
    // Arrays.toString prints the names; fields.toString() only printed the array identity.
    log.info("索引开始创建,服务于 " + clz + " | " + Arrays.toString(fields));
    long start = System.currentTimeMillis();
    IndexWriter writer = getIndexWriter(indexDir, poolDir);
    if (null == writer) {
        log.error("IndexWriter获取失败");
        return;
    }
    // Rebuild mode: drop everything that is currently indexed.
    if (isDel) {
        writer.deleteAll();
    }
    if (null != list && list.size() > 0) {
        // HH = 24-hour clock; the former hh pattern stored 13:00 and 01:00 identically.
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss");
        List<String> indexedNames = Arrays.asList(fields);
        java.lang.reflect.Field[] declaredFields = clz.getDeclaredFields();
        for (Object item : list) {
            Document doc = new Document();
            for (java.lang.reflect.Field cf : declaredFields) {
                String fieldName = cf.getName();
                // Only reflect on requested properties, so unrelated fields without a
                // getter (e.g. serialVersionUID) no longer abort the whole run.
                if (!indexedNames.contains(fieldName)) {
                    continue;
                }
                String getName = "get" + fieldName.substring(0, 1).toUpperCase()
                        + fieldName.substring(1);
                Method getMethod = clz.getMethod(getName);
                Object value = getMethod.invoke(item);
                if (value == null || "".equals(value.toString())) {
                    continue;
                }
                String text = cf.getType() == Date.class
                        ? simpleDateFormat.format(value)
                        : value.toString();
                doc.add(new Field(fieldName, text, Field.Store.YES,
                        Field.Index.ANALYZED));
            }
            if (isDel) {
                // The index was emptied above, so plain adds are enough.
                if (!doc.getFields().isEmpty()) {
                    writer.addDocument(doc);
                }
            } else {
                // Update mode: replace the document carrying the same fields[0] term.
                String key = doc.get(fields[0]);
                if (null != key) {
                    writer.updateDocument(new Term(fields[0], key), doc);
                }
            }
        }
        log.info("索引创建完成,保存目录:" + indexDir + poolDir + ",索引创建/记录:"
                + writer.maxDoc() + "/" + list.size() + "条,花费时间:"
                + (System.currentTimeMillis() - start) / 1000 + "秒!" + writer);
        list.clear();
    }
    writer.forceMerge(DEFAULT_MAX_NUM_SEGMENTS);
    writer.commit();
}
/**
 * Paged index search with highlighting enabled.
 * Delegates to the seven-argument overload with {@code isHighlighter = true}.
 *
 * @param indexDir    root location of the indexes
 * @param poolDir     name of the index pool below the root
 * @param keyWords    keywords to search for
 * @param fields      fields to search
 * @param pageSize    number of records per page
 * @param currentPage current page number
 * @return SearchResult holding the query results
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
public static SearchResult searchPage(String indexDir, String poolDir,
String[] keyWords, String[] fields, int pageSize, int currentPage)
throws IOException, InvalidTokenOffsetsException {
return searchPage(indexDir, poolDir, keyWords, fields, true, pageSize,
currentPage);
}
/**
 * Paged index search with optional highlighting.
 * Delegates to the eight-argument overload with {@code isPage = true}.
 *
 * @param indexDir      root location of the indexes
 * @param poolDir       name of the index pool below the root
 * @param keyWords      keywords to search for
 * @param fields        fields to search
 * @param isHighlighter whether matches in {@code fields[0]} are highlighted
 * @param pageSize      number of records per page
 * @param currentPage   current page number
 * @return SearchResult holding the query results
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
public static SearchResult searchPage(String indexDir, String poolDir,
        String[] keyWords, String[] fields, boolean isHighlighter,
        int pageSize, int currentPage) throws IOException,
        InvalidTokenOffsetsException {
    // Forward the caller's flag; it used to be replaced by a hard-coded true,
    // so highlighting could never be switched off through this overload.
    return searchPage(indexDir, poolDir, keyWords, fields, isHighlighter,
            pageSize, currentPage, true);
}
/**
 * Paged index search.
 * <p>
 * Every keyword is first stripped of special characters with stringFilter.
 * The query is parsed with MultiFieldQueryParser (keyword i is searched in field i).
 * Results are sorted by score and then by {@code fields[0]} in reverse string order.
 * If the first keyword is shorter than two characters a StandardAnalyzer is used
 * instead of the default analyzer.
 * <p>
 * Invalid input no longer raises runtime exceptions: null or empty
 * {@code keyWords} / {@code fields} yield an empty result, an empty index yields a
 * result with zero documents, and a {@code currentPage} below 1 is treated as
 * page 1. I/O and query-parse errors are logged and the result built so far is
 * returned.
 * <p>
 * NOTE(review): sorting on {@code fields[0]} expects an un-tokenized field, while
 * createIndex indexes every field ANALYZED; confirm the secondary sort behaves
 * as intended.
 *
 * @param indexDir      root location of the indexes
 * @param poolDir       name of the index pool below the root
 * @param keyWords      keywords, one per entry of {@code fields}
 * @param fields        fields to search; {@code fields[0]} is sorted on and highlighted
 * @param isHighlighter whether matches in {@code fields[0]} are wrapped in highlight markup
 * @param pageSize      number of records per page
 * @param currentPage   current page number, starting at 1
 * @param isPage        {@code true} to collect all hits and page through them;
 *                      {@code false} to collect at most {@code pageSize} hits
 * @return the search result, or {@code null} when no IndexReader is available
 * @throws IOException                  declared for compatibility; I/O errors are logged
 * @throws InvalidTokenOffsetsException declared for compatibility
 */
public static SearchResult searchPage(String indexDir, String poolDir,
        String[] keyWords, String[] fields, boolean isHighlighter,
        int pageSize, int currentPage, boolean isPage) throws IOException,
        InvalidTokenOffsetsException {
    SearchResult searchResult = new SearchResult();
    if (keyWords == null || keyWords.length == 0
            || fields == null || fields.length == 0) {
        log.error("查询关键词或查询属性为空,请检查!");
        return searchResult;
    }
    // Strip special characters from the keywords.
    String[] filtered = new String[keyWords.length];
    for (int i = 0; i < keyWords.length; i++) {
        filtered[i] = keyWords[i] == null ? "" : stringFilter(keyWords[i]);
    }
    try {
        IndexReader reader = getIndexReader(indexDir, poolDir);
        if (null != reader) {
            reader = refreshIndexReader(poolDir, reader);
        }
        if (null == reader) {
            log.error("索引文件为空,请检查!");
            return null;
        }
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setDefaultFieldSortScoring(true, false);
        Analyzer analyzer = getAnalyzer();
        if (filtered[0].length() < 2) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        }
        Query query = MultiFieldQueryParser.parse(Version.LUCENE_36,
                filtered, fields, analyzer);
        // Primary order: relevance score; secondary: fields[0] as string, reversed.
        Sort sort = new Sort(new SortField[] { SortField.FIELD_SCORE,
                new SortField(fields[0], SortField.STRING, true) });
        int maxDoc = searcher.maxDoc();
        int limit = isPage ? maxDoc : Math.min(pageSize, maxDoc);
        List<Document> documents = new ArrayList<Document>();
        int totalCount = 0;
        // Lucene rejects a request for zero hits, so skip the search for an
        // empty index or a non-positive limit.
        if (limit > 0) {
            TopDocs topDocs = searcher.search(query, limit, sort);
            ScoreDoc[] hits = topDocs.scoreDocs;
            totalCount = hits.length;
            // Clamp to the first page so a page number below 1 cannot index hits[-n].
            int begin = Math.max(0, pageSize * (currentPage - 1));
            int end = Math.min(begin + pageSize, hits.length);
            for (int i = begin; i < end; i++) {
                Document document = searcher.doc(hits[i].doc);
                if (isHighlighter) {
                    Field target = document.getField(fields[0]);
                    // Documents without fields[0] have nothing to highlight.
                    if (target != null) {
                        String highlighted = toHighlighter(query, document,
                                fields[0], analyzer);
                        if (highlighted != null) {
                            target.setValue(highlighted);
                        }
                    }
                }
                documents.add(document);
            }
        }
        searchResult.setPageSize(pageSize);
        searchResult.setCurrentPage(currentPage);
        searchResult.setDocuments(documents);
        searchResult.setTotalCount(totalCount);
    } catch (IOException e) {
        // CorruptIndexException is an IOException and is handled here as well.
        log.error("索引查询失败,原因:" + e.getMessage(), e);
    } catch (ParseException e) {
        log.error("查询语句解析失败,原因:" + e.getMessage(), e);
    }
    return searchResult;
}
/**
 * Releases all pooled index resources.
 * <p>
 * Every {@link IndexWriter} in {@code writerPool} is committed and closed,
 * every {@link IndexReader} in {@code readerPool} is closed, and both pools
 * are emptied afterwards. A failure on one entry is logged and does not stop
 * the remaining entries from being processed.
 */
public static void destroy() {
    synchronized (writerPool) {
        for (IndexWriter pooledWriter : writerPool.values()) {
            try {
                pooledWriter.commit();
                pooledWriter.close();
            } catch (Exception e) {
                log.error("writerPool销毁失败,原因:" + e.getMessage());
            }
        }
        writerPool.clear();
    }
    synchronized (readerPool) {
        for (IndexReader pooledReader : readerPool.values()) {
            try {
                pooledReader.close();
            } catch (Exception e) {
                log.error("readerPool销毁失败,原因:" + e.getMessage());
            }
        }
        readerPool.clear();
    }
}
/**
 * Releases the index writers only.
 * <p>
 * Closes every {@link IndexWriter} held in {@code writerPool}, empties the
 * pool and logs completion. A writer that fails to close is logged and
 * skipped so the remaining writers are still closed.
 */
public static void destroyIndexWriter() {
    synchronized (writerPool) {
        for (IndexWriter pooledWriter : writerPool.values()) {
            try {
                pooledWriter.close();
            } catch (Exception e) {
                log.error("writerPool销毁失败,原因:" + e.getMessage());
            }
        }
        writerPool.clear();
    }
    log.info("【创建索引池】完成销毁");
}
/**
 * Closes retired index readers whose grace period has elapsed.
 * <p>
 * The map key is the time (milliseconds since the epoch) at which the reader
 * was retired. An entry is closed and removed once it is at least
 * {@code STALE_INDEXREADER_SURVIVAL_TIME} milliseconds old. Younger entries
 * are left in the map so a later call can close them; previously the whole
 * map was cleared, which dropped unexpired readers without ever closing them.
 *
 * @param readerPool retired readers keyed by retirement timestamp; a
 *                   {@code null} map is ignored
 */
public static void destroyIndexReader(Map<Long, IndexReader> readerPool) {
    if (readerPool == null) {
        return;
    }
    synchronized (readerPool) {
        long now = System.currentTimeMillis();
        Iterator<Entry<Long, IndexReader>> iterator = readerPool.entrySet()
                .iterator();
        while (iterator.hasNext()) {
            Entry<Long, IndexReader> entry = iterator.next();
            if ((now - entry.getKey()) < STALE_INDEXREADER_SURVIVAL_TIME) {
                // Still inside its grace period: keep it for a later sweep.
                continue;
            }
            IndexReader indexReader = entry.getValue();
            try {
                indexReader.close();
                log.info("【查询索引池】完成销毁" + indexReader);
            } catch (Exception e) {
                log.error("readerPool销毁失败,原因:" + e.getMessage(), e);
            }
            // Remove the expired entry even if close() failed, so the same
            // broken reader is not retried on every sweep.
            iterator.remove();
        }
    }
}
/**
 * Refreshes the given reader so that it sees the latest index data.
 * <p>
 * Expired readers in {@code stalereaderPool} are swept first. If the index
 * has changed, the new reader replaces the old one in {@code readerPool} and
 * the old reader is parked in {@code stalereaderPool} under its retirement
 * time instead of being closed immediately. Failures are logged and the
 * currently pooled reader is returned.
 *
 * @param poolDir     key of the reader inside {@code readerPool}
 * @param indexReader the reader to refresh; must not be {@code null}
 * @return the reader currently registered for {@code poolDir}
 *         ({@link IndexReader})
 */
private synchronized static IndexReader refreshIndexReader(String poolDir,
        IndexReader indexReader) {
    try {
        destroyIndexReader(stalereaderPool);
        // openIfChanged replaces the deprecated reopen(): it returns null
        // when the index is unchanged instead of returning the same reader.
        IndexReader newIndexReader = IndexReader.openIfChanged(indexReader);
        if (newIndexReader != null && newIndexReader != indexReader) {
            // The retirement time is the map key. Bump it past any existing
            // key so put() never replaces a reader that is still waiting to
            // be closed.
            long retiredAt = System.currentTimeMillis();
            while (stalereaderPool.containsKey(retiredAt)) {
                retiredAt++;
            }
            stalereaderPool.put(retiredAt, indexReader);
            readerPool.put(poolDir, newIndexReader);
        }
    } catch (Exception e) {
        log.error("刷新索引失败" + e.getMessage(), e);
    }
    // Return the newest reader registered for this pool.
    return readerPool.get(poolDir);
}
/**
 * Characters removed by {@link #stringFilter(String)}: ASCII and CJK
 * punctuation, quotes, the space character, tab, line feed and carriage
 * return. Compiled once because {@link Pattern} is immutable and thread-safe.
 */
private static final Pattern SPECIAL_CHARS_PATTERN = Pattern
        .compile("[`~!@#$%^&*()+=|{ }':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?·\'\"\\-\t\n\r]");

/**
 * Removes special characters from a string.
 * <p>
 * Every character matched by {@code SPECIAL_CHARS_PATTERN} is deleted
 * (including embedded spaces) and the result is trimmed.
 *
 * @param str the text to clean; may be {@code null}
 * @return the cleaned text, or an empty string when {@code str} is
 *         {@code null}
 * @throws PatternSyntaxException kept for signature compatibility; the
 *         pattern is now compiled when the class is initialised
 */
public static String stringFilter(String str) throws PatternSyntaxException {
    if (str == null) {
        return "";
    }
    return SPECIAL_CHARS_PATTERN.matcher(str).replaceAll("").trim();
}
/**
 * Builds the highlighted form of one stored field.
 * <p>
 * Matches of {@code query} inside the field text are wrapped in
 * {@code <font color="red">…</font>}. The fragment size is the text length
 * plus 100, so the whole value fits in a single fragment.
 *
 * @param query    the query whose terms are highlighted
 * @param doc      the document holding the field
 * @param field    name of the field to highlight
 * @param analyzer analyzer used to tokenize the field text
 * @return the highlighted text; the unmodified field text when nothing
 *         matched or highlighting failed; {@code null} when the document has
 *         no string value for {@code field}
 */
private static String toHighlighter(Query query, Document doc,
        String field, Analyzer analyzer) {
    // Read the stored value once; it is null when the field is absent.
    String text = doc.get(field);
    if (text == null) {
        return null;
    }
    try {
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
                "<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(
                text.length() + 100));
        TokenStream tokenStream = analyzer.tokenStream(field,
                new StringReader(text));
        String highlighterStr = highlighter.getBestFragment(tokenStream,
                text);
        return highlighterStr == null ? text : highlighterStr;
    } catch (IOException e) {
        log.error(e.getMessage(), e);
    } catch (InvalidTokenOffsetsException e) {
        log.error(e.getMessage(), e);
    }
    // Fall back to the raw text so a highlighting failure never blanks the
    // field value the caller is about to overwrite.
    return text;
}
/**
 * Returns the pooled {@link IndexWriter} for {@code poolDir}, creating and
 * registering it on first use.
 * <p>
 * If creation fails with an {@link IOException}, any write lock left on the
 * index directory is released, the error is logged and {@link #destroy()} is
 * called to reset all pools; {@code null} is returned in that case.
 *
 * @param indexDir base path of the index directories
 * @param poolDir  pool key, also appended to {@code indexDir} to form the
 *                 index location
 * @return the pooled writer, or {@code null} when it could not be created
 */
@SuppressWarnings("static-access")
private static IndexWriter getIndexWriter(String indexDir, String poolDir)
        throws CorruptIndexException, LockObtainFailedException,
        IOException {
    // NOTE(review): this unlocked read is only safe if writerPool is a
    // concurrent map - confirm against its declaration.
    IndexWriter writer = writerPool.get(poolDir);
    if (writer == null) {
        synchronized (writerPool) {
            // Re-read under the lock. Another thread may have registered the
            // writer in the meantime; checking containsKey alone used to
            // leave 'writer' null and return null to the caller.
            writer = writerPool.get(poolDir);
            if (writer == null) {
                try {
                    writer = createIndexWriter(indexDir + poolDir);
                    if (writer != null) {
                        writerPool.put(poolDir, writer);
                    }
                } catch (IOException e) {
                    FSDirectory directory = FSDirectory
                            .open(getIndexFile(indexDir + poolDir));
                    if (IndexWriter.isLocked(directory)) {
                        IndexWriter.unlock(directory);
                    }
                    log.error(e.getMessage(), e);
                    destroy();
                }
            }
        }
    }
    return writer;
}
/**
 * Returns the pooled {@link IndexReader} for {@code poolDir}, opening and
 * registering it on first use.
 *
 * @param indexDir base path of the index directories
 * @param poolDir  pool key, also appended to {@code indexDir} to form the
 *                 index location
 * @return the pooled reader, or {@code null} when the index could not be
 *         opened (the failure is logged)
 */
private static IndexReader getIndexReader(String indexDir, String poolDir)
        throws CorruptIndexException, IOException {
    synchronized (readerPool) {
        // Look the reader up under the lock. The lookup used to happen
        // before the lock, so a reader registered by another thread in
        // between was missed and null was returned.
        IndexReader reader = readerPool.get(poolDir);
        if (reader == null) {
            try {
                reader = IndexReader.open(FSDirectory
                        .open(getIndexFile(indexDir + poolDir)));
                if (reader != null) {
                    readerPool.put(poolDir, reader);
                }
            } catch (IOException e) {
                // Also covers CorruptIndexException, which is an IOException.
                log.error("Failed to open index reader for " + poolDir
                        + ": " + e.getMessage(), e);
            }
        }
        return reader;
    }
}
/**
 * Creates an {@link IndexWriter} for the index stored under {@code dir}.
 * <p>
 * The writer uses the analyzer from {@link #getAnalyzer()} and
 * {@code OpenMode.CREATE_OR_APPEND}. A write lock already present on the
 * directory is released before the writer is opened.
 *
 * @param dir index location, resolved through {@code getIndexFile}
 * @return a new writer for that index
 */
private static IndexWriter createIndexWriter(String dir)
        throws CorruptIndexException, LockObtainFailedException,
        IOException {
    /*
     * Original author's note: mmseg4j's ComplexAnalyzer suits Chinese text
     * that needs high match accuracy; the Lucene default is StandardAnalyzer.
     */
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36,
            getAnalyzer());
    /*
     * Open modes: OpenMode.CREATE overwrites an existing index,
     * OpenMode.CREATE_OR_APPEND appends to it.
     */
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);

    // Open the directory once and reuse it for the lock check, the unlock
    // and the writer, instead of opening it three separate times.
    FSDirectory directory = FSDirectory.open(getIndexFile(dir));
    // NOTE(review): this force-releases the lock even if another process is
    // actually writing to the index - confirm that cannot happen here.
    if (IndexWriter.isLocked(directory)) {
        IndexWriter.unlock(directory);
    }
    return new IndexWriter(directory, conf);
}
/**
 * Returns the analyzer used for indexing and searching: a
 * {@link PaodingAnalyzer} in writer mode, built from
 * {@code PaodingMaker.make()}.
 * <p>
 * Original author's note: writer mode means both maximum and minimum word
 * segmentation are supported. The Lucene default alternative would be
 * {@code new StandardAnalyzer(Version.LUCENE_36)}.
 *
 * @return a writer-mode Paoding analyzer
 */
private static Analyzer getAnalyzer() {
    return PaodingAnalyzer.writerMode(PaodingMaker.make());
}
/**
 * Resolves an index directory relative to the application root.
 * <p>
 * The root is the directory two levels above the classpath root of
 * {@code LuceneUtil} (presumably the web application root when classes live
 * in {@code WEB-INF/classes} - verify against the deployment layout).
 * Backslashes in the root path are normalised to forward slashes before
 * {@code dir} is appended.
 *
 * @param dir path to append to the application root
 * @return the resulting file location
 */
private static File getIndexFile(String dir) {
    // NOTE(review): URL.getPath() is not URL-decoded, so a root path
    // containing spaces or non-ASCII characters may not resolve - confirm.
    String classpathRoot = LuceneUtil.class.getResource("/").getPath();
    File applicationRoot = new File(classpathRoot).getParentFile()
            .getParentFile();
    String normalizedRoot = applicationRoot.getPath().replace('\\', '/');
    return new File(normalizedRoot + dir);
}
/**
 * Manual smoke test: prints the result of {@code stringFilter} for a sample
 * sentence.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String sample = "[不懂就要问]请问 H6不能插u盘听歌吗 知道的说下 谢谢!";
    String filtered = stringFilter(sample);
    System.out.println(filtered);
    // A commented-out stress test used to follow here. It called init(),
    // started 50 threads that each slept 500 ms and then requested the
    // "close", "noClose" and "searchLog" writers and readers under
    // "/WEB-INF/index/", printing a marker line whenever a writer came back
    // null, and optionally ended with destroy().
}
}