Introduction to Lucene Development in Java
Lucene version 5.3.2; index viewer Luke version 5.5.0. This is not a Maven project, so the jar packages must be downloaded manually. The ansj_seg tokenizer is used for Chinese word segmentation.
Create the class library.java under src. ansj reads its configuration from a ResourceBundle named library, so this class supplies the dictionary paths and the recognition switches:
import java.nio.file.Paths;
import java.util.ListResourceBundle;

public class library extends ListResourceBundle {
    // The placeholder values "1111"/"2222"/"3333" are overwritten with the real
    // dictionary paths in getContents() below.
    private final Object myData[][] = {
        { "dic", "1111" },
        { "ambiguity", "2222" },
        { "ambiguityLibrary", "3333" },
        { "isRealName", "true" },
        { "isNameRecognition", "true" },
        { "isNumRecognition", "true" },
        { "isQuantifierRecognition", "true" }
    };

    @Override
    protected Object[][] getContents() {
        // Locate the library folder on the classpath.
        String path = library.class.getResource("/").getPath() + "library";
        // Drop the leading "/" that getResource().getPath() produces on Windows
        // (e.g. "/D:/project/bin/" becomes "D:/project/bin/").
        path = path.substring(1);
        myData[0][1] = Paths.get(path, "default.dic").toString();
        myData[1][1] = Paths.get(path, "ambiguity.dic").toString();
        myData[2][1] = Paths.get(path, "ambiguity.dic").toString();
        return myData;
    }
}
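With the configuration class in place, a quick way to confirm that ansj picks up the dictionaries is a small segmentation smoke test. This is a minimal sketch; the class name and sample sentence are arbitrary, and it assumes ansj_seg's ToAnalysis (imported by AnsjAnalyzer below) is on the classpath:

import org.ansj.domain.Result;
import org.ansj.splitWord.analysis.ToAnalysis;

public class SegmentSmokeTest {
    public static void main(String[] args) {
        // Parse a sample sentence; the result prints each term with its nature (POS tag)
        Result result = ToAnalysis.parse("中国人民站起来了");
        System.out.println(result);
    }
}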
Create a library folder under src and place the segmentation dictionaries (*.dic) in it:
ambiguity.dic
default.dic
regex.dic
stop.dic
synonyms.dic
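For reference, a user dictionary line in ansj commonly holds the word, optionally followed by a tab-separated nature (part of speech) and frequency; the entries below are illustrative samples only, not from the original project:

云计算	n	1000
区块链	userDefine	1000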
Create a resource file lucene.properties under src:
# lucene index location
lucene.index.path=D:/lucene/index
Because this older version of the tokenizer requires implementing some adapter classes yourself, the following three classes wire ansj into Lucene's Tokenizer/Analyzer API.
AnsjTokenizerFactory.java
package com.lucene.utils;

import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjTokenizerFactory extends TokenizerFactory {

    public final Log logger = LogFactory.getLog();
    private Map<String, String> args;

    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        this.args = args;
    }

    @Override
    public Tokenizer create(AttributeFactory factory) {
        // Delegate creation to AnsjAnalyzer; the input reader is supplied later
        // through Tokenizer.setReader(), so null is passed here.
        return AnsjAnalyzer.getTokenizer(null, args);
    }
}
AnsjTokenizer.java
package com.lucene.utils;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

public final class AnsjTokenizer extends Tokenizer {

    // current term text
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // character offsets
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // position increment
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    // part of speech of the term
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected Analysis ta = null;
    private LinkedList<Object> result;
    private List<StopRecognition> stops; // stop-word recognizers
    private List<SynonymsRecgnition> synonyms; // synonym dictionaries

    public AnsjTokenizer(Analysis ta, List<StopRecognition> stops, List<SynonymsRecgnition> synonyms) {
        this.ta = ta;
        this.stops = stops;
        this.synonyms = synonyms;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        int position = 0;
        if (result == null) {
            parse();
        }
        Object obj = result.pollFirst();
        if (obj == null) {
            result = null;
            return false;
        }
        if (obj instanceof Term) {
            clearAttributes();
            Term term = (Term) obj;
            while (filterTerm(term)) { // skip stop words
                term = (Term) result.pollFirst();
                if (term == null) {
                    result = null;
                    return false;
                }
                position++;
            }
            List<String> synonyms = term.getSynonyms(); // fetch synonyms for this term
            String rName = null;
            if (synonyms != null) {
                // queue the remaining synonyms so they are emitted at the same position
                for (int i = 1; i < synonyms.size(); i++) {
                    result.addFirst(synonyms.get(i));
                }
                rName = synonyms.get(0);
            } else {
                rName = term.getName();
            }
            position++;
            offsetAtt.setOffset(term.getOffe(), term.getOffe() + term.getName().length());
            typeAtt.setType(term.getNatureStr());
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(rName);
        } else {
            // a queued synonym: position increment 0 stacks it on the original term
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(obj.toString());
        }
        return true;
    }

    private boolean filterTerm(Term term) {
        if (stops != null && stops.size() > 0) {
            for (StopRecognition filterRecognition : stops) {
                if (filterRecognition.filter(term)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * This method must be overridden; otherwise batch indexing of files will fail.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        ta.resetContent(new AnsjReader(this.input));
        parse();
    }

    private void parse() throws IOException {
        Result parse = ta.parse();
        if (synonyms != null) {
            for (SynonymsRecgnition sr : synonyms) {
                parse.recognition(sr);
            }
        }
        result = new LinkedList<Object>(parse.getTerms());
    }
}
AnsjAnalyzer.java
package com.lucene.utils;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.ansj.library.AmbiguityLibrary;
import org.ansj.library.CrfLibrary;
import org.ansj.library.DicLibrary;
import org.ansj.library.StopLibrary;
import org.ansj.library.SynonymsLibrary;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.BaseAnalysis;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjAnalyzer extends Analyzer {

    public static final Log LOG = LogFactory.getLog();

    /**
     * dic_ansj maps to DicAnalysis (user dictionary first); query_ansj maps to ToAnalysis.
     * @author ansj
     */
    public static enum TYPE {
        base_ansj, index_ansj, query_ansj, dic_ansj, nlp_ansj
    }

    /**
     * tokenizer type and other arguments
     */
    private Map<String, String> args;

    public AnsjAnalyzer(Map<String, String> args) {
        this.args = args;
    }

    public AnsjAnalyzer(TYPE type, String dics) {
        this.args = new HashMap<String, String>();
        args.put("type", type.name());
        args.put(DicLibrary.DEFAULT, dics);
    }

    public AnsjAnalyzer(TYPE type) {
        this.args = new HashMap<String, String>();
        args.put("type", type.name());
    }

    @Override
    protected TokenStreamComponents createComponents(String text) {
        // The actual input is supplied later through Tokenizer.setReader()/reset(),
        // so the reader built here is only a placeholder.
        BufferedReader reader = new BufferedReader(new StringReader(text));
        Tokenizer tokenizer = null;
        tokenizer = getTokenizer(reader, this.args);
        return new TokenStreamComponents(tokenizer);
    }

    /**
     * Build a tokenizer from the given arguments.
     *
     * @param reader input reader, may be null
     * @return the configured AnsjTokenizer
     */
    public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("to create tokenizer " + args);
        }
        Analysis analysis = null;
        String temp = null;
        String type = args.get("type");
        if (type == null) {
            type = AnsjAnalyzer.TYPE.base_ansj.name();
        }
        switch (AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
        }
        if (reader != null) {
            analysis.resetContent(reader);
        }
        if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) { // user-defined dictionaries
            String[] split = temp.split(",");
            Forest[] forests = new Forest[split.length];
            for (int i = 0; i < forests.length; i++) {
                if (StringUtil.isBlank(split[i])) {
                    continue;
                }
                forests[i] = DicLibrary.get(split[i]);
            }
            analysis.setForests(forests);
        }
        List<StopRecognition> filters = null;
        if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) { // stop-word dictionaries
            String[] split = temp.split(",");
            filters = new ArrayList<StopRecognition>();
            for (String key : split) {
                StopRecognition stop = StopLibrary.get(key.trim());
                if (stop != null)
                    filters.add(stop);
            }
        }
        List<SynonymsRecgnition> synonyms = null;
        if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) { // synonym dictionaries
            String[] split = temp.split(",");
            synonyms = new ArrayList<SynonymsRecgnition>();
            for (String key : split) {
                SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
                if (sf != null)
                    synonyms.add(new SynonymsRecgnition(sf));
            }
        }
        if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) { // ambiguity dictionary
            analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
        }
        if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) { // enable person-name recognition
            analysis.setIsNameRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) { // enable number recognition
            analysis.setIsNumRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) { // enable quantifier recognition
            analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isRealName"))) { // keep the original characters
            analysis.setIsRealName(Boolean.parseBoolean(temp));
        }
        return new AnsjTokenizer(analysis, filters, synonyms);
    }
}
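A short usage sketch (the field name "text" and the sample string are arbitrary) showing how to pull tokens out of the analyzer with the standard Lucene 5.x TokenStream loop:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
        // tokenStream() builds the components; reset() pushes the real input into ansj
        try (TokenStream ts = analyzer.tokenStream("text", "中国人民站起来了")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();
        }
    }
}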
PropertiesLucene.java reads the Lucene configuration file:
package com.lucene.utils;

import java.util.ResourceBundle;

public class PropertiesLucene {

    private static ResourceBundle resource = ResourceBundle.getBundle("lucene");

    public final static String getValue(String key) {
        String result = null;
        try {
            result = resource.getString(key);
        } catch (Exception e) {
            result = null;
        }
        return result;
    }
}
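Usage is a single call; for example, reading the index path configured above:

String lucene_index_path = PropertiesLucene.getValue("lucene.index.path");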
LuceneUtils.java creates a single shared writer/reader instance per index path:
package com.lucene.utils;

import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class LuceneUtils {

    // Commit after this many documents; committing flushes buffers and releases memory.
    public static final int commitNumber = 10000;

    // volatile guarantees visibility across threads; getInstance is synchronized
    // so the check-then-create is atomic under concurrency.
    private static volatile LuceneUtils luceneUtils;
    private static Map<String, IndexWriter> writeMap = new HashMap<>();
    private static Map<String, IndexReader> readMap = new HashMap<>();

    public static Analyzer analyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);

    public static synchronized LuceneUtils getInstance(String lucene_index_path) {
        try {
            if (luceneUtils == null) {
                luceneUtils = new LuceneUtils();
                // Directory object pointing at the on-disk index location
                Directory dir = FSDirectory.open(Paths.get(lucene_index_path));
                // Create the IndexWriterConfig object
                IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
                // Create a fresh index, discarding any previous one
                iwc.setOpenMode(OpenMode.CREATE);
                IndexWriter writer = new IndexWriter(dir, iwc);
                writeMap.put(lucene_index_path, writer);
                // Create an IndexReader over the same Directory
                IndexReader reader = DirectoryReader.open(dir);
                readMap.put(lucene_index_path, reader);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return luceneUtils;
    }

    public Map<String, IndexWriter> getWriteMap() {
        return writeMap;
    }

    public Map<String, IndexReader> getReadMap() {
        return readMap;
    }
}
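Obtaining the shared writer and reader is then a map lookup keyed by the index path (lucene_index_path read from lucene.properties as shown above):

LuceneUtils utils = LuceneUtils.getInstance(lucene_index_path);
IndexWriter writer = utils.getWriteMap().get(lucene_index_path);
IndexReader reader = utils.getReadMap().get(lucene_index_path);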
AccessibleHitCollector.java, an abstract hit collector for Lucene queries:
package com.lucene.utils;

import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

public abstract class AccessibleHitCollector implements Collector {

    protected Scorer scorer;
    protected boolean shouldScore;
    protected int docBase;
    protected boolean outOfOrder;

    public abstract int getTotalHits();

    public abstract int getDocId(int pos);

    public abstract float getScore(int pos);

    public abstract void reset();
}
AllHitsCollector.java
package com.lucene.utils;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;

public class AllHitsCollector extends AccessibleHitCollector {

    private ArrayList<AllHit> hits = new ArrayList<AllHit>();

    public AllHitsCollector(boolean outOfOrder, boolean shouldScore) {
        this.outOfOrder = outOfOrder;
        this.shouldScore = shouldScore;
    }

    public int getTotalHits() {
        return hits.size();
    }

    public int getDocId(int i) {
        return (hits.get(i)).docId;
    }

    public float getScore(int i) {
        return (hits.get(i)).score;
    }

    @Override
    public LeafCollector getLeafCollector(LeafReaderContext leafReaderContext) throws IOException {
        this.docBase = leafReaderContext.docBase;
        return new LeafCollector() {
            private Scorer scorer;

            @Override
            public void setScorer(Scorer scorer) throws IOException {
                this.scorer = scorer;
            }

            @Override
            public void collect(int doc) throws IOException {
                float score = 1.0f;
                if (shouldScore) {
                    try {
                        score = scorer.score();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // doc is segment-local; add docBase to get the global doc id
                hits.add(new AllHit(docBase + doc, score));
            }
        };
    }

    @Override
    public boolean needsScores() {
        // must return true when scores are requested, otherwise scorer.score() is unusable
        return shouldScore;
    }

    private static class AllHit {
        public int docId;
        public float score;

        public AllHit(int docId, float score) {
            this.docId = docId;
            this.score = score;
        }
    }

    @Override
    public void reset() {
        hits.clear();
    }
}
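The collector replaces a TopDocs-based search when every hit is needed; a minimal usage sketch (the searcher and query are assumed to exist already):

AccessibleHitCollector collector = new AllHitsCollector(false, false);
searcher.search(query, collector);
for (int i = 0; i < collector.getTotalHits(); i++) {
    Document doc = searcher.doc(collector.getDocId(i));
    // ... read stored fields from doc
}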
Lucene write and search code:
public synchronized void write() {
    IndexWriter writer = null;
    try {
        log.debug("start index for lucene");
        LuceneUtils utils = LuceneUtils.getInstance(lucene_index_path);
        // Why pause here? Without the pause, Lucene's initialization may not have
        // finished yet and errors can occur.
        Thread.sleep(10000);
        Map<String, IndexWriter> map = utils.getWriteMap();
        writer = map.get(lucene_index_path);
        log.debug("delete index for lucene");
        writer.deleteAll(); // delete the existing index
        // write documents into Lucene
        menuDao.writeMenuAll(writer);
        writer.forceMerge(1);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (writer != null) {
                writer.commit();
                log.debug("create index for lucene is success");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
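menuDao.writeMenuAll(writer) is project-specific and not shown in the original. A hedged sketch of what such a method might do, using field names that match the search code below (MenuBean and menuList are assumptions; requires org.apache.lucene.document.Document/Field/StringField/TextField):

for (MenuBean menu : menuList) {
    Document doc = new Document();
    // StringField: indexed as a single token, suitable for exact TermQuery matches
    doc.add(new StringField("productId", menu.getProductId(), Field.Store.YES));
    doc.add(new StringField("date2", menu.getDate(), Field.Store.YES));
    // TextField: analyzed by AnsjAnalyzer, suitable for full-text search and highlighting
    doc.add(new TextField("menuNameCn", menu.getMenuNameCn(), Field.Store.YES));
    writer.addDocument(doc);
}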
public Map<String, List<LuceneBean>> search(String managerId, String userId, String searchText) throws ParseException {
    Map<String, List<LuceneBean>> map = new HashMap<>();
    try {
        IndexReader reader = LuceneUtils.getInstance(lucene_index_path).getReadMap().get(lucene_index_path);
        // Create an IndexSearcher from the IndexReader
        IndexSearcher searcher = new IndexSearcher(reader);
        List<LuceneBean> menuList = searchMenu(LuceneUtils.analyzer, searcher, userId, searchText);
        map.put("menu", menuList);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
    return map;
}
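searchMenu is not shown in the original. A hedged sketch of how such a method could parse the user's text with the classic QueryParser (org.apache.lucene.queryparser.classic.QueryParser); the field name "menuNameCn" is taken from the highlighting call below, and the method shape is an assumption:

private List<LuceneBean> searchMenu(Analyzer analyzer, IndexSearcher searcher, String userId, String searchText)
        throws ParseException, IOException, InvalidTokenOffsetsException {
    List<LuceneBean> list = new ArrayList<>();
    // Parse the raw search text against the analyzed menuNameCn field
    Query query = new QueryParser("menuNameCn", analyzer).parse(QueryParser.escape(searchText));
    AccessibleHitCollector collector = new AllHitsCollector(false, false);
    searcher.search(query, collector);
    for (int i = 0; i < collector.getTotalHits(); i++) {
        Document doc = searcher.doc(collector.getDocId(i));
        LuceneBean bean = new LuceneBean();
        bean.setMenuNameCn(toHighlighter(analyzer, query, doc, "menuNameCn"));
        list.add(bean);
    }
    return list;
}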
/**
 * Highlight the matched fragments of a field.
 * @param analyzer the analyzer used to re-tokenize the stored text
 * @param query the query whose terms should be highlighted
 * @param doc the matched document
 * @param hightField the stored field to highlight
 * @return the highlighted fragment, or the raw field value if nothing matched
 * @throws InvalidTokenOffsetsException
 * @throws IOException
 */
private String toHighlighter(Analyzer analyzer, Query query, Document doc, String hightField)
        throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font style='color:red;'>", "</font>");
    Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
    TokenStream tokenStream = analyzer.tokenStream("text", doc.get(hightField));
    // String[] highlighterStr = highlighter.getBestFragments(tokenStream, doc.get(hightField), 15);
    String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(hightField));
    return highlighterStr == null ? doc.get(hightField) : highlighterStr;
}
// The actual query logic
public List<LuceneBean> searchDetail(IndexSearcher balchangeSearcher, IndexSearcher fundinfoSearcher, IndexSearcher balancehistSearcher, LuceneReaderPO po) throws IOException, InvalidTokenOffsetsException {
    List<LuceneBean> menuList = new ArrayList<>();
    AccessibleHitCollector collector = new AllHitsCollector(false, false);
    Query dateQuery = new TermQuery(new Term("date2", po.getDate()));
    Query productIdQuery = new TermQuery(new Term("productId", po.getProductId()));
    BooleanQuery builder = new BooleanQuery.Builder()
            .add(dateQuery, BooleanClause.Occur.MUST)
            .build();
    // Wrap the date clause as a filter: it restricts results without affecting scoring
    QueryWrapperFilter filter = new QueryWrapperFilter(builder);
    BooleanQuery booleanQuery = new BooleanQuery.Builder()
            .add(filter, Occur.FILTER)
            .add(productIdQuery, BooleanClause.Occur.MUST)
            .build();
    balancehistSearcher.search(booleanQuery, collector);
    Document doc = null;
    LuceneReaderPO newPo = null;
    for (int i = 0, size = collector.getTotalHits(); i < size; i++) {
        int docid = collector.getDocId(i);
        doc = balancehistSearcher.doc(docid);
        newPo = new LuceneReaderPO();
        newPo.setProductId(po.getProductId());
        // the original called this on an undefined "bean" with an undefined "analyzer";
        // assumed here to mean newPo and the shared LuceneUtils.analyzer
        newPo.setMenuNameCn(toHighlighter(LuceneUtils.analyzer, booleanQuery, doc, "menuNameCn"));
        newPo.setMinDate(po.getMinDate());
        newPo.setDate(po.getDate());
        menuList.add(newPo); // assumes LuceneReaderPO extends LuceneBean
    }
    collector.reset();
    collector = null;
    return menuList;
}