<dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>5.3.1</version> </dependency> <!--一般分词器,适用于英文分词--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>5.3.1</version> </dependency> <!--中文分词器--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-smartcn</artifactId> <version>5.3.1</version> </dependency> <!--对分词索引查询解析--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>5.3.1</version> </dependency> <!--检索关键字高亮显示--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-highlighter</artifactId> <version>5.3.1</version> </dependency> <!--IK分词器--> <dependency> <groupId>com.janeluo</groupId> <artifactId>ikanalyzer</artifactId> <version>2012_u6</version> </dependency>
IKAnalyzer不支持5.x的lucene 添加下面两个类即可
import org.apache.lucene.analysis.Analyzer; public class IKAnalyzer5x extends Analyzer { private boolean useSmart; public boolean useSmart() { return this.useSmart; } public void setUseSmart(boolean useSmart) { this.useSmart = useSmart; } public IKAnalyzer5x() { this(false); } public IKAnalyzer5x(boolean useSmart) { this.useSmart = useSmart; } @Override protected TokenStreamComponents createComponents(String fieldName) { IKTokenizer5x _IKTokenizer = new IKTokenizer5x(this.useSmart); return new TokenStreamComponents(_IKTokenizer); } }
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeFactory; import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.Lexeme; import java.io.IOException; public class IKTokenizer5x extends Tokenizer { private IKSegmenter _IKImplement; private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class); private final TypeAttribute typeAtt = (TypeAttribute)this.addAttribute(TypeAttribute.class); private int endPosition; public IKTokenizer5x() { this._IKImplement = new IKSegmenter(this.input, true); } public IKTokenizer5x(boolean useSmart) { this._IKImplement = new IKSegmenter(this.input, useSmart); } public IKTokenizer5x(AttributeFactory factory) { super(factory); this._IKImplement = new IKSegmenter(this.input, true); } public boolean incrementToken() throws IOException { this.clearAttributes(); Lexeme nextLexeme = this._IKImplement.next(); if(nextLexeme != null) { this.termAtt.append(nextLexeme.getLexemeText()); this.termAtt.setLength(nextLexeme.getLength()); this.offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); this.endPosition = nextLexeme.getEndPosition(); this.typeAtt.setType(nextLexeme.getLexemeTypeString()); return true; } else { return false; } } public void reset() throws IOException { super.reset(); this._IKImplement.reset(this.input); } public final void end() { int finalOffset = this.correctOffset(this.endPosition); this.offsetAtt.setOffset(finalOffset, finalOffset); } }
调用测试分词器
Analyzer analyzer = new IKAnalyzer5x(true); TokenStream tokenStream = analyzer.tokenStream("test", "一个新款韩版长袖羊驼绒羊毛皮草外套女海宁皮草"); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { System.out.println(offsetAttribute.toString()); }