package org.lucene.util;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Helpers that run a Lucene {@link Analyzer} over a piece of text and print the
 * resulting tokens to standard output, for inspecting how different analyzers
 * split the same input.
 *
 * <p>Every {@link TokenStream} obtained here is consumed following the
 * reset / incrementToken / end / close workflow and is always closed, so the
 * same analyzer instance can be passed to these helpers repeatedly.
 */
public class AnalyzerUtils {

    /**
     * Prints every term produced by the analyzer for the given text on a single
     * line, each term wrapped in square brackets, followed by a line break.
     *
     * <p>An {@link IOException} raised while tokenizing is printed via
     * {@code printStackTrace()} and not rethrown.
     *
     * @param str the text to analyze
     * @param a   the analyzer used to tokenize {@code str}
     */
    public static void displayToken(String str, Analyzer a) {
        // try-with-resources guarantees close() runs even when tokenizing fails;
        // without close() the analyzer cannot hand out another stream later.
        try (TokenStream stream = a.tokenStream("content", new StringReader(str))) {
            // The attribute is registered on the stream and is updated in place
            // each time the stream advances to the next token.
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            // reset() must be called before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            // end() must be called once incrementToken() has returned false.
            stream.end();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one line per token produced by the analyzer for the given text, in
     * the form {@code positionIncrement:term[startOffset-endOffset]-->type}.
     *
     * <p>Any exception raised while tokenizing is printed via
     * {@code printStackTrace()} and not rethrown.
     *
     * @param str the text to analyze
     * @param a   the analyzer used to tokenize {@code str}
     */
    public static void displayAllTokenInfo(String str, Analyzer a) {
        // try-with-resources guarantees close() runs even when tokenizing fails;
        // without close() the analyzer cannot hand out another stream later.
        try (TokenStream stream = a.tokenStream("content", new StringReader(str))) {
            // Position increment: the distance between this token and the previous one.
            PositionIncrementAttribute pia =
                    stream.addAttribute(PositionIncrementAttribute.class);
            // Start and end character offsets of the token.
            OffsetAttribute oa =
                    stream.addAttribute(OffsetAttribute.class);
            // The text of the token itself.
            CharTermAttribute cta =
                    stream.addAttribute(CharTermAttribute.class);
            // The type assigned to the token by the analyzer.
            TypeAttribute ta =
                    stream.addAttribute(TypeAttribute.class);
            // reset() must be called before the first incrementToken().
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print(pia.getPositionIncrement() + ":");
                System.out.print(cta + "[" + oa.startOffset() + "-" + oa.endOffset() + "]-->" + ta.type() + "\n");
            }
            // end() must be called once incrementToken() has returned false.
            stream.end();
        } catch (Exception e) {
            // Deliberately broad: callers have always had every failure reported
            // here rather than propagated.
            e.printStackTrace();
        }
    }

    /**
     * Runs the same Chinese sample sentence through five analyzers and prints
     * the terms each one produces, one line per analyzer.
     *
     * <p>NOTE(review): this method references {@code @Test}, {@code Version},
     * {@code StandardAnalyzer}, {@code StopAnalyzer}, {@code SimpleAnalyzer},
     * {@code WhitespaceAnalyzer} and {@code MMSegAnalyzer}, none of which are
     * imported in this file; it looks like it belongs in a separate test class.
     */
    @Test
    public void test02() {
        // MMSegAnalyzer can alternatively be given an explicit dictionary directory:
        // new MMSegAnalyzer(new File("D:\\tools\\javaTools\\lucene\\mmseg4j-1.8.5\\data"))
        Analyzer[] analyzers = {
            new StandardAnalyzer(Version.LUCENE_4_9),
            new StopAnalyzer(Version.LUCENE_4_9),
            new SimpleAnalyzer(Version.LUCENE_4_9),
            new WhitespaceAnalyzer(Version.LUCENE_4_9),
            new MMSegAnalyzer()
        };
        String txt = "系统建设采用代表当今云计算、大数据和互联网主流并成熟的技术进行架构设计,相应的软件开发和产品选型应充分考虑未来发展方向,同时保证平台在技术先进和可靠性。";
        try {
            for (Analyzer analyzer : analyzers) {
                AnalyzerUtils.displayToken(txt, analyzer);
            }
        } finally {
            // Release the resources held by each analyzer.
            for (Analyzer analyzer : analyzers) {
                analyzer.close();
            }
        }
    }
}