Introduction to Lucene Development in Java
Lucene version 5.3.2; index viewer Luke version 5.5.0. This is not a Maven project, so the jar packages must be downloaded manually. The ansj_seg tokenizer is used for Chinese word segmentation.
Create the class library.java under src. ansj reads its configuration from a ResourceBundle named library, so this class supplies the dictionary paths and the recognition switches:
import java.nio.file.Paths;
import java.util.ListResourceBundle;

public class library extends ListResourceBundle {
    // The placeholder values "1111"/"2222"/"3333" are overwritten with the real
    // dictionary paths in getContents() below.
    private final Object myData[][] = {
        { "dic", "1111" },
        { "ambiguity", "2222" },
        { "ambiguityLibrary", "3333" },
        { "isRealName", "true" },
        { "isNameRecognition", "true" },
        { "isNumRecognition", "true" },
        { "isQuantifierRecognition", "true" }
    };

    @Override
    protected Object[][] getContents() {
        // Locate the library folder on the classpath.
        String path = library.class.getResource("/").getPath() + "library";
        // Drop the leading "/" that getResource().getPath() produces on Windows
        // (e.g. "/D:/project/bin/" becomes "D:/project/bin/").
        path = path.substring(1);
        myData[0][1] = Paths.get(path, "default.dic").toString();
        myData[1][1] = Paths.get(path, "ambiguity.dic").toString();
        myData[2][1] = Paths.get(path, "ambiguity.dic").toString();
        return myData;
    }
}
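With the configuration class in place, a quick way to confirm that ansj picks up the dictionaries is a small segmentation smoke test. This is a minimal sketch; the class name and sample sentence are arbitrary, and it assumes ansj_seg's ToAnalysis (imported by AnsjAnalyzer below) is on the classpath:

import org.ansj.domain.Result;
import org.ansj.splitWord.analysis.ToAnalysis;

public class SegmentSmokeTest {
    public static void main(String[] args) {
        // Parse a sample sentence; the result prints each term with its nature (POS tag)
        Result result = ToAnalysis.parse("中国人民站起来了");
        System.out.println(result);
    }
}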
Create a library folder under src and place the segmentation dictionaries (*.dic) in it:
ambiguity.dic
default.dic
regex.dic
stop.dic
synonyms.dic
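For reference, a user dictionary line in ansj commonly holds the word, optionally followed by a tab-separated nature (part of speech) and frequency; the entries below are illustrative samples only, not from the original project:

云计算	n	1000
区块链	userDefine	1000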
Create a resource file lucene.properties under src:
# lucene index location
lucene.index.path=D:/lucene/index
Because this older version of the tokenizer requires implementing some adapter classes yourself, the following three classes wire ansj into Lucene's Tokenizer/Analyzer API.
AnsjTokenizerFactory.java
package com.lucene.utils;

import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjTokenizerFactory extends TokenizerFactory {

    public final Log logger = LogFactory.getLog();
    private Map<String, String> args;

    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        this.args = args;
    }

    @Override
    public Tokenizer create(AttributeFactory factory) {
        // Delegate creation to AnsjAnalyzer; the input reader is supplied later
        // through Tokenizer.setReader(), so null is passed here.
        return AnsjAnalyzer.getTokenizer(null, args);
    }
}
AnsjTokenizer.java
package com.lucene.utils;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

public final class AnsjTokenizer extends Tokenizer {

    // current term text
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // character offsets
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // position increment
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    // part of speech of the term
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected Analysis ta = null;
    private LinkedList<Object> result;
    private List<StopRecognition> stops; // stop-word recognizers
    private List<SynonymsRecgnition> synonyms; // synonym dictionaries

    public AnsjTokenizer(Analysis ta, List<StopRecognition> stops, List<SynonymsRecgnition> synonyms) {
        this.ta = ta;
        this.stops = stops;
        this.synonyms = synonyms;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        int position = 0;
        if (result == null) {
            parse();
        }
        Object obj = result.pollFirst();
        if (obj == null) {
            result = null;
            return false;
        }
        if (obj instanceof Term) {
            clearAttributes();
            Term term = (Term) obj;
            while (filterTerm(term)) { // skip stop words
                term = (Term) result.pollFirst();
                if (term == null) {
                    result = null;
                    return false;
                }
                position++;
            }
            List<String> synonyms = term.getSynonyms(); // fetch synonyms for this term
            String rName = null;
            if (synonyms != null) {
                // queue the remaining synonyms so they are emitted at the same position
                for (int i = 1; i < synonyms.size(); i++) {
                    result.addFirst(synonyms.get(i));
                }
                rName = synonyms.get(0);
            } else {
                rName = term.getName();
            }
            position++;
            offsetAtt.setOffset(term.getOffe(), term.getOffe() + term.getName().length());
            typeAtt.setType(term.getNatureStr());
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(rName);
        } else {
            // a queued synonym: position increment 0 stacks it on the original term
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(obj.toString());
        }
        return true;
    }

    private boolean filterTerm(Term term) {
        if (stops != null && stops.size() > 0) {
            for (StopRecognition filterRecognition : stops) {
                if (filterRecognition.filter(term)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * This method must be overridden; otherwise batch indexing of files will fail.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        ta.resetContent(new AnsjReader(this.input));
        parse();
    }

    private void parse() throws IOException {
        Result parse = ta.parse();
        if (synonyms != null) {
            for (SynonymsRecgnition sr : synonyms) {
                parse.recognition(sr);
            }
        }
        result = new LinkedList<Object>(parse.getTerms());
    }
}
AnsjAnalyzer.java
package com.lucene.utils;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.ansj.library.AmbiguityLibrary;
import org.ansj.library.CrfLibrary;
import org.ansj.library.DicLibrary;
import org.ansj.library.StopLibrary;
import org.ansj.library.SynonymsLibrary;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.BaseAnalysis;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjAnalyzer extends Analyzer {

    public static final Log LOG = LogFactory.getLog();

    /**
     * dic_ansj maps to DicAnalysis (user dictionary first); query_ansj maps to ToAnalysis.
     * @author ansj
     */
    public static enum TYPE {
        base_ansj, index_ansj, query_ansj, dic_ansj, nlp_ansj
    }

    /**
     * tokenizer type and other arguments
     */
    private Map<String, String> args;

    public AnsjAnalyzer(Map<String, String> args) {
        this.args = args;
    }

    public AnsjAnalyzer(TYPE type, String dics) {
        this.args = new HashMap<String, String>();
        args.put("type", type.name());
        args.put(DicLibrary.DEFAULT, dics);
    }

    public AnsjAnalyzer(TYPE type) {
        this.args = new HashMap<String, String>();
        args.put("type", type.name());
    }

    @Override
    protected TokenStreamComponents createComponents(String text) {
        // The actual input is supplied later through Tokenizer.setReader()/reset(),
        // so the reader built here is only a placeholder.
        BufferedReader reader = new BufferedReader(new StringReader(text));
        Tokenizer tokenizer = null;
        tokenizer = getTokenizer(reader, this.args);
        return new TokenStreamComponents(tokenizer);
    }

    /**
     * Build a tokenizer from the given arguments.
     *
     * @param reader input reader, may be null
     * @return the configured AnsjTokenizer
     */
    public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("to create tokenizer " + args);
        }
        Analysis analysis = null;
        String temp = null;
        String type = args.get("type");
        if (type == null) {
            type = AnsjAnalyzer.TYPE.base_ansj.name();
        }
        switch (AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
        }
        if (reader != null) {
            analysis.resetContent(reader);
        }
        if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) { // user-defined dictionaries
            String[] split = temp.split(",");
            Forest[] forests = new Forest[split.length];
            for (int i = 0; i < forests.length; i++) {
                if (StringUtil.isBlank(split[i])) {
                    continue;
                }
                forests[i] = DicLibrary.get(split[i]);
            }
            analysis.setForests(forests);
        }
        List<StopRecognition> filters = null;
        if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) { // stop-word dictionaries
            String[] split = temp.split(",");
            filters = new ArrayList<StopRecognition>();
            for (String key : split) {
                StopRecognition stop = StopLibrary.get(key.trim());
                if (stop != null)
                    filters.add(stop);
            }
        }
        List<SynonymsRecgnition> synonyms = null;
        if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) { // synonym dictionaries
            String[] split = temp.split(",");
            synonyms = new ArrayList<SynonymsRecgnition>();
            for (String key : split) {
                SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
                if (sf != null)
                    synonyms.add(new SynonymsRecgnition(sf));
            }
        }
        if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) { // ambiguity dictionary
            analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
        }
        if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) { // enable person-name recognition
            analysis.setIsNameRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) { // enable number recognition
            analysis.setIsNumRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) { // enable quantifier recognition
            analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
        }
        if (StringUtil.isNotBlank(temp = args.get("isRealName"))) { // keep the original characters
            analysis.setIsRealName(Boolean.parseBoolean(temp));
        }
        return new AnsjTokenizer(analysis, filters, synonyms);
    }
}
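A short usage sketch (the field name "text" and the sample string are arbitrary) showing how to pull tokens out of the analyzer with the standard Lucene 5.x TokenStream loop:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);
        // tokenStream() builds the components; reset() pushes the real input into ansj
        try (TokenStream ts = analyzer.tokenStream("text", "中国人民站起来了")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();
        }
    }
}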
PropertiesLucene.java reads the Lucene configuration file:
package com.lucene.utils;

import java.util.ResourceBundle;

public class PropertiesLucene {

    private static ResourceBundle resource = ResourceBundle.getBundle("lucene");

    public final static String getValue(String key) {
        String result = null;
        try {
            result = resource.getString(key);
        } catch (Exception e) {
            result = null;
        }
        return result;
    }
}
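Usage is a single call; for example, reading the index path configured above:

String lucene_index_path = PropertiesLucene.getValue("lucene.index.path");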
LuceneUtils.java creates a single shared writer/reader instance per index path:
package com.lucene.utils;

import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class LuceneUtils {

    // Commit after this many documents; committing flushes buffers and releases memory.
    public static final int commitNumber = 10000;

    // volatile guarantees visibility across threads; getInstance is synchronized
    // so the check-then-create is atomic under concurrency.
    private static volatile LuceneUtils luceneUtils;
    private static Map<String, IndexWriter> writeMap = new HashMap<>();
    private static Map<String, IndexReader> readMap = new HashMap<>();

    public static Analyzer analyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.index_ansj);

    public static synchronized LuceneUtils getInstance(String lucene_index_path) {
        try {
            if (luceneUtils == null) {
                luceneUtils = new LuceneUtils();
                // Directory object pointing at the on-disk index location
                Directory dir = FSDirectory.open(Paths.get(lucene_index_path));
                // Create the IndexWriterConfig object
                IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
                // Create a fresh index, discarding any previous one
                iwc.setOpenMode(OpenMode.CREATE);
                IndexWriter writer = new IndexWriter(dir, iwc);
                writeMap.put(lucene_index_path, writer);
                // Create an IndexReader over the same Directory
                IndexReader reader = DirectoryReader.open(dir);
                readMap.put(lucene_index_path, reader);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return luceneUtils;
    }

    public Map<String, IndexWriter> getWriteMap() {
        return writeMap;
    }

    public Map<String, IndexReader> getReadMap() {
        return readMap;
    }
}
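Obtaining the shared writer and reader is then a map lookup keyed by the index path (lucene_index_path read from lucene.properties as shown above):

LuceneUtils utils = LuceneUtils.getInstance(lucene_index_path);
IndexWriter writer = utils.getWriteMap().get(lucene_index_path);
IndexReader reader = utils.getReadMap().get(lucene_index_path);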
AccessibleHitCollector.java, an abstract hit collector for Lucene queries:
package com.lucene.utils;

import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

public abstract class AccessibleHitCollector implements Collector {

    protected Scorer scorer;
    protected boolean shouldScore;
    protected int docBase;
    protected boolean outOfOrder;

    public abstract int getTotalHits();

    public abstract int getDocId(int pos);

    public abstract float getScore(int pos);

    public abstract void reset();
}
AllHitsCollector.java
package com.lucene.utils;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;

public class AllHitsCollector extends AccessibleHitCollector {

    private ArrayList<AllHit> hits = new ArrayList<AllHit>();

    public AllHitsCollector(boolean outOfOrder, boolean shouldScore) {
        this.outOfOrder = outOfOrder;
        this.shouldScore = shouldScore;
    }

    public int getTotalHits() {
        return hits.size();
    }

    public int getDocId(int i) {
        return (hits.get(i)).docId;
    }

    public float getScore(int i) {
        return (hits.get(i)).score;
    }

    @Override
    public LeafCollector getLeafCollector(LeafReaderContext leafReaderContext) throws IOException {
        this.docBase = leafReaderContext.docBase;
        return new LeafCollector() {
            private Scorer scorer;

            @Override
            public void setScorer(Scorer scorer) throws IOException {
                this.scorer = scorer;
            }

            @Override
            public void collect(int doc) throws IOException {
                float score = 1.0f;
                if (shouldScore) {
                    try {
                        score = scorer.score();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // doc is segment-local; add docBase to get the global doc id
                hits.add(new AllHit(docBase + doc, score));
            }
        };
    }

    @Override
    public boolean needsScores() {
        // must return true when scores are requested, otherwise scorer.score() is unusable
        return shouldScore;
    }

    private static class AllHit {
        public int docId;
        public float score;

        public AllHit(int docId, float score) {
            this.docId = docId;
            this.score = score;
        }
    }

    @Override
    public void reset() {
        hits.clear();
    }
}
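The collector replaces a TopDocs-based search when every hit is needed; a minimal usage sketch (the searcher and query are assumed to exist already):

AccessibleHitCollector collector = new AllHitsCollector(false, false);
searcher.search(query, collector);
for (int i = 0; i < collector.getTotalHits(); i++) {
    Document doc = searcher.doc(collector.getDocId(i));
    // ... read stored fields from doc
}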
Lucene write and search code:
public synchronized void write() {
    IndexWriter writer = null;
    try {
        log.debug("start index for lucene");
        LuceneUtils utils = LuceneUtils.getInstance(lucene_index_path);
        // Why pause here? Without the pause, Lucene's initialization may not have
        // finished yet and errors can occur.
        Thread.sleep(10000);
        Map<String, IndexWriter> map = utils.getWriteMap();
        writer = map.get(lucene_index_path);
        log.debug("delete index for lucene");
        writer.deleteAll(); // delete the existing index
        // write documents into Lucene
        menuDao.writeMenuAll(writer);
        writer.forceMerge(1);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (writer != null) {
                writer.commit();
                log.debug("create index for lucene is success");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
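menuDao.writeMenuAll(writer) is project-specific and not shown in the original. A hedged sketch of what such a method might do, using field names that match the search code below (MenuBean and menuList are assumptions; requires org.apache.lucene.document.Document/Field/StringField/TextField):

for (MenuBean menu : menuList) {
    Document doc = new Document();
    // StringField: indexed as a single token, suitable for exact TermQuery matches
    doc.add(new StringField("productId", menu.getProductId(), Field.Store.YES));
    doc.add(new StringField("date2", menu.getDate(), Field.Store.YES));
    // TextField: analyzed by AnsjAnalyzer, suitable for full-text search and highlighting
    doc.add(new TextField("menuNameCn", menu.getMenuNameCn(), Field.Store.YES));
    writer.addDocument(doc);
}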
public Map<String, List<LuceneBean>> search(String managerId, String userId, String searchText) throws ParseException {
    Map<String, List<LuceneBean>> map = new HashMap<>();
    try {
        IndexReader reader = LuceneUtils.getInstance(lucene_index_path).getReadMap().get(lucene_index_path);
        // Create an IndexSearcher from the IndexReader
        IndexSearcher searcher = new IndexSearcher(reader);
        List<LuceneBean> menuList = searchMenu(LuceneUtils.analyzer, searcher, userId, searchText);
        map.put("menu", menuList);
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
    return map;
}
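searchMenu is not shown in the original. A hedged sketch of how such a method could parse the user's text with the classic QueryParser (org.apache.lucene.queryparser.classic.QueryParser); the field name "menuNameCn" is taken from the highlighting call below, and the method shape is an assumption:

private List<LuceneBean> searchMenu(Analyzer analyzer, IndexSearcher searcher, String userId, String searchText)
        throws ParseException, IOException, InvalidTokenOffsetsException {
    List<LuceneBean> list = new ArrayList<>();
    // Parse the raw search text against the analyzed menuNameCn field
    Query query = new QueryParser("menuNameCn", analyzer).parse(QueryParser.escape(searchText));
    AccessibleHitCollector collector = new AllHitsCollector(false, false);
    searcher.search(query, collector);
    for (int i = 0; i < collector.getTotalHits(); i++) {
        Document doc = searcher.doc(collector.getDocId(i));
        LuceneBean bean = new LuceneBean();
        bean.setMenuNameCn(toHighlighter(analyzer, query, doc, "menuNameCn"));
        list.add(bean);
    }
    return list;
}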
/**
 * Highlight the matched fragments of a field.
 * @param analyzer the analyzer used to re-tokenize the stored text
 * @param query the query whose terms should be highlighted
 * @param doc the matched document
 * @param hightField the stored field to highlight
 * @return the highlighted fragment, or the raw field value if nothing matched
 * @throws InvalidTokenOffsetsException
 * @throws IOException
 */
private String toHighlighter(Analyzer analyzer, Query query, Document doc, String hightField)
        throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font style='color:red;'>", "</font>");
    Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
    TokenStream tokenStream = analyzer.tokenStream("text", doc.get(hightField));
    // String[] highlighterStr = highlighter.getBestFragments(tokenStream, doc.get(hightField), 15);
    String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(hightField));
    return highlighterStr == null ? doc.get(hightField) : highlighterStr;
}
// The actual query logic
public List<LuceneBean> searchDetail(IndexSearcher balchangeSearcher, IndexSearcher fundinfoSearcher, IndexSearcher balancehistSearcher, LuceneReaderPO po) throws IOException, InvalidTokenOffsetsException {
    List<LuceneBean> menuList = new ArrayList<>();
    AccessibleHitCollector collector = new AllHitsCollector(false, false);
    Query dateQuery = new TermQuery(new Term("date2", po.getDate()));
    Query productIdQuery = new TermQuery(new Term("productId", po.getProductId()));
    BooleanQuery builder = new BooleanQuery.Builder()
            .add(dateQuery, BooleanClause.Occur.MUST)
            .build();
    // Wrap the date clause as a filter: it restricts results without affecting scoring
    QueryWrapperFilter filter = new QueryWrapperFilter(builder);
    BooleanQuery booleanQuery = new BooleanQuery.Builder()
            .add(filter, Occur.FILTER)
            .add(productIdQuery, BooleanClause.Occur.MUST)
            .build();
    balancehistSearcher.search(booleanQuery, collector);
    Document doc = null;
    LuceneReaderPO newPo = null;
    for (int i = 0, size = collector.getTotalHits(); i < size; i++) {
        int docid = collector.getDocId(i);
        doc = balancehistSearcher.doc(docid);
        newPo = new LuceneReaderPO();
        newPo.setProductId(po.getProductId());
        // the original called this on an undefined "bean" with an undefined "analyzer";
        // assumed here to mean newPo and the shared LuceneUtils.analyzer
        newPo.setMenuNameCn(toHighlighter(LuceneUtils.analyzer, booleanQuery, doc, "menuNameCn"));
        newPo.setMinDate(po.getMinDate());
        newPo.setDate(po.getDate());
        menuList.add(newPo); // assumes LuceneReaderPO extends LuceneBean
    }
    collector.reset();
    collector = null;
    return menuList;
}