一、通过TokenStream查看分词的详细信息
package com.wsy;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
/**
 * Helpers for inspecting what an {@link Analyzer} produces for a piece of text.
 */
public class AnalyzerUtils {

    /**
     * Tokenizes {@code string} with {@code analyzer} and prints one line per token to
     * standard output in the form
     * {@code positionIncrement:term[startOffset-endOffset]-->type}, followed by a
     * separator line of dashes once the stream has been fully consumed.
     *
     * <p>An {@link IOException} raised while reading the stream is not propagated;
     * its stack trace is printed and the separator line is skipped.
     *
     * @param string   the text to analyze
     * @param analyzer the analyzer used to build the token stream
     */
    public static void displayAllToken(String string, Analyzer analyzer) {
        try {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(string));
            try {
                // Register the attributes we want to read back from the stream.
                // Position increment: distance between this token and the previous one.
                PositionIncrementAttribute positionIncrementAttribute =
                        tokenStream.addAttribute(PositionIncrementAttribute.class);
                // Start/end character offsets of each token.
                OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
                // The term text of each token.
                CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                // The type reported for each token.
                TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

                // The TokenStream consumer workflow requires reset() before the first
                // incrementToken() and end() after the last one.
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    System.out.println(positionIncrementAttribute.getPositionIncrement() + ":" + charTermAttribute
                            + "[" + offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "]-->"
                            + typeAttribute.type());
                }
                tokenStream.end();
            } finally {
                // Always release the stream, even if iteration failed.
                tokenStream.close();
            }
            System.out.println("----------------------------");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs the same sentence through four analyzers and prints the tokens each one emits,
     * so their output can be compared side by side.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_35);
        Analyzer analyzer2 = new StopAnalyzer(Version.LUCENE_35);
        Analyzer analyzer3 = new SimpleAnalyzer(Version.LUCENE_35);
        Analyzer analyzer4 = new WhitespaceAnalyzer(Version.LUCENE_35);
        String text = "how are you, thank you.";
        AnalyzerUtils.displayAllToken(text, analyzer1);
        AnalyzerUtils.displayAllToken(text, analyzer2);
        AnalyzerUtils.displayAllToken(text, analyzer3);
        AnalyzerUtils.displayAllToken(text, analyzer4);
    }
}
二、总结
要想查看TokenStream中的详细信息,需要先通过addAttribute把想查看的属性(位置增量、偏移量、词项文本、类型等)注册到TokenStream中,然后在遍历语汇单元时按需输出即可。从输出结果依然可以发现,不同的分词器表现出了不同的分词形态。