// A simple experiment comparing how different Lucene analyzers tokenize the same text.
package cn.tedu.test2;
import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Test;
/**
 * Prints the terms that several Lucene analyzers produce for the same input
 * text, so their tokenization can be compared side by side on standard output.
 */
public class AnalyzerTest {

    /**
     * Tokenizes {@code msg} with {@code analyzer} and prints every resulting
     * term on its own line to standard output.
     *
     * @param analyzer analyzer used to split the text into terms
     * @param msg      text to tokenize
     * @throws UncheckedIOException if the token stream fails while being consumed
     */
    public void printTerm(Analyzer analyzer, String msg) {
        // try-with-resources guarantees close() is called on the token stream,
        // which the TokenStream consumer contract requires even on failure.
        try (TokenStream token = analyzer.tokenStream("test", new StringReader(msg))) {
            // Attribute instance through which the text of the current term is read.
            CharTermAttribute attribute = token.addAttribute(CharTermAttribute.class);

            // The stream must be reset before the first incrementToken() call.
            token.reset();
            while (token.incrementToken()) {
                System.out.println(attribute.toString());
            }
            // Signals end-of-stream to the tokenizer chain once all terms are consumed.
            token.end();
        } catch (IOException e) {
            // Propagate instead of swallowing so the calling test fails visibly.
            throw new UncheckedIOException("Failed to tokenize text: " + msg, e);
        }
    }

    /**
     * Runs the same sample sentence through the standard, simple, whitespace
     * and smart-Chinese analyzers, printing a header line before each result.
     */
    @Test
    public void run() {
        // Source text shared by every analyzer.
        String msgString = "那么B+树到底有什么优势呢?";

        // Analyzers are Closeable; release them when the comparison is done.
        try (StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
                SimpleAnalyzer simpleAnalyzer = new SimpleAnalyzer();
                WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
                SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer()) {
            System.out.println("**********标准**********");
            printTerm(standardAnalyzer, msgString);
            System.out.println("**********简单**********");
            printTerm(simpleAnalyzer, msgString);
            System.out.println("**********空格**********");
            printTerm(whitespaceAnalyzer, msgString);
            System.out.println("**********智能中文**********");
            printTerm(smartChineseAnalyzer, msgString);
        }
    }
}