Lucene版本:5.2.1
java版本:1.8
测试时间:2016-3-3 16:04:57
显示分词效果
public static void displayToken(String str,Analyzer a) {try {TokenStream stream = a.tokenStream("content",new StringReader(str));//创建一个属性,这个属性会添加流中,随着这个TokenStream增加CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);stream.reset();while(stream.incrementToken()) {System.out.print("["+cta+"]");}stream.end();System.out.println();} catch (IOException e) {e.printStackTrace();}}
//显示所有token过程
<span style="white-space:pre"> </span>public static void displayAllTokenInfo(String str,Analyzer a) {
try {
TokenStream stream = a.tokenStream("content",new StringReader(str));
//位置增量的属性,存储语汇单元之间的距离
PositionIncrementAttribute pia =
stream.addAttribute(PositionIncrementAttribute.class);
//每个语汇单元的位置偏移量
OffsetAttribute oa =
stream.addAttribute(OffsetAttribute.class);
//存储每一个语汇单元的信息(分词单元信息)
CharTermAttribute cta =
stream.addAttribute(CharTermAttribute.class);
//使用的分词器的类型信息
TypeAttribute ta =
stream.addAttribute(TypeAttribute.class);
stream.reset();
for(;stream.incrementToken();) {
System.out.print(pia.getPositionIncrement()+":");
System.out.print(cta+"["+oa.startOffset()+"-"+oa.endOffset()+"]-->"+ta.type()+"\n");
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Runs one sample sentence through four analyzers and prints each
 * tokenization with {@code AnalyzerUtils.displayToken}, one line per analyzer,
 * so the results can be compared by eye. Makes no assertions.
 */
@Test
public void test01() {
    String txt = "this is GS/MC " +
            "My email is info@163.com,My QQ is 65356435";
    // Analyzers are Closeable; try-with-resources releases them once the demo
    // finishes (the original left all four open).
    try (Analyzer a1 = new StandardAnalyzer();
         Analyzer a2 = new StopAnalyzer();
         Analyzer a3 = new SimpleAnalyzer();
         Analyzer a4 = new WhitespaceAnalyzer()) {
        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
        AnalyzerUtils.displayToken(txt, a3);
        AnalyzerUtils.displayToken(txt, a4);
    }
}