package com.essearch.core.analyzer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
// To write your own Chinese analyzer, you can refer to the Chinese analyzers in the cn package of Lucene-analyzers-commons-4.10.2.jar and adapt them to your particular requirements. Suppose we need to tokenize a document character by character; the core code is as follows:
public class MyNGramAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Emit n-grams of 1 to 15 characters, so every single character of the
        // input becomes a token (along with all longer grams up to 15 characters)
        NGramTokenizer nGramTokenizer = new NGramTokenizer(reader, 1, 15);
        // Normalize all tokens to lower case
        TokenStream result = new LowerCaseFilter(nGramTokenizer);
        return new TokenStreamComponents(nGramTokenizer, result);
    }
    private static void testTokenizer(Tokenizer tokenizer) {
        try {
            // Add (or fetch) the attributes once, before reset(); addAttribute()
            // returns the live instance already registered with the tokenizer
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
            // Byte form of the term as it would be indexed (retrieved for completeness, unused below)
            TermToBytesRefAttribute termToBytesRefAttribute = tokenizer.addAttribute(TermToBytesRefAttribute.class);
            PositionIncrementAttribute positionIncrementAttribute = tokenizer.addAttribute(PositionIncrementAttribute.class);
            PositionLengthAttribute positionLengthAttribute = tokenizer.addAttribute(PositionLengthAttribute.class);
            OffsetAttribute offsetAttribute = tokenizer.addAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenizer.addAttribute(TypeAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println("term=" + charTermAttribute.toString() + ","
                        + offsetAttribute.startOffset() + "-"
                        + offsetAttribute.endOffset() + ",type="
                        + typeAttribute.type() + ",PositionIncrement="
                        + positionIncrementAttribute.getPositionIncrement()
                        + ",PositionLength="
                        + positionLengthAttribute.getPositionLength());
            }
            tokenizer.end();
            tokenizer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        // Sample Chinese text (a passage about coding standards) used to
        // demonstrate per-character tokenization
        String s = "编码规范从根本上解决了程序维护员的难题;规范的编码阅读和理解起来更容易,也可以快速的不费力气的借鉴别人的编码。对将来维护你编码的人来说,你的编码越优化,他们就越喜欢你的编码,理解起来也就越快。";
        StringReader sr = new StringReader(s);
        // First drive the tokenizer directly...
        NGramTokenizer nGramTokenizer = new NGramTokenizer(sr, 1, 15);
        testTokenizer(nGramTokenizer);
        // ...then run the same text through the complete analyzer
        Analyzer analyzer = new MyNGramAnalyzer();
        testAnalyzer(analyzer, s);
    }
    private static void testAnalyzer(Analyzer analyzer, String data) {
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("myfield", new StringReader(data));
            // Token offset attribute
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            // Token text attribute
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // Token type attribute
            TypeAttribute type = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            // Iterate over the tokenization results
            while (ts.incrementToken()) {
                System.out.println(offset.startOffset() + "-" + offset.endOffset()
                        + ":" + term.toString() + "|" + type.type());
            }
            // Perform end-of-stream operations, e.g. record the final offset
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release all resources held by the TokenStream (also closes the StringReader)
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
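Once the analyzer compiles, registering it for indexing works the same way as for any built-in analyzer. The sketch below is a minimal, illustrative example assuming Lucene 4.10.x on the classpath; the class name MyNGramAnalyzerUsage, the in-memory RAMDirectory, and the field name "content" are assumptions for the demonstration, not part of the original code:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class MyNGramAnalyzerUsage {
    public static void main(String[] args) throws IOException {
        // Keep the index in memory for this demonstration
        Directory directory = new RAMDirectory();
        // Register the custom analyzer with the IndexWriter (Lucene 4.x constructor)
        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, new MyNGramAnalyzer());
        IndexWriter writer = new IndexWriter(directory, config);
        Document doc = new Document();
        // "content" is an illustrative field name; TextField values are analyzed at index time
        doc.add(new TextField("content", "编码规范", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
    }
}

Note that the same analyzer must also be applied at query time (for example via QueryParser), otherwise the query terms will not match the n-gram terms stored in the index.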