IKAnalyzer 需要自行下载与所用 Lucene 版本对应的 jar 包。
package com.team.lucene;
import java.io.StringReader;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
* @ClassName:AnalyzerTest.java
* @Description: 分词器学习
* @author gaoguangjin
* @Date 2015-7-14 上午9:49:40
*/
@Slf4j
public class AnalyzerTest {
public static void main(String[] args) {
String content = "gaoguangjin is handsome boy 刘德华没有高广金帅气";
// 标准分词器
Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_40);
// 简单分词器
Analyzer analyzer2 = new SimpleAnalyzer(Version.LUCENE_40);
// 二元切分
Analyzer analyzer3 = new CJKAnalyzer(Version.LUCENE_40);
// 中文语意分词 需要自己引入IKAnalyzer架包
Analyzer analyzer4 = new IKAnalyzer(true);
display(analyzer1, "StandardAnalyzer", content);
display(analyzer2, "SimpleAnalyzer", content);
display(analyzer3, "CJKAnalyzer", content);
display(analyzer4, "IKAnalyzer", content);
}
/**
* @Description: 展示
* @param analyzer
* @param analyzerName
* @return:void
*/
private static void display(Analyzer analyzer, String analyzerName, String content) {
try {
TokenStream tokenstream = analyzer.tokenStream("content", new StringReader(content));
// 为token设置属性类
CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);
// 重新设置
tokenstream.reset();
log.info("*******************" + analyzerName + "开始分词********************");
// 遍历得到token
while (tokenstream.incrementToken()) {
log.info(new String(termAttribute.buffer(), 0, termAttribute.length()) + " ");
}
} catch (Exception e) {
log.error(analyzerName + "分词失败!" + e.getLocalizedMessage());
}
}
}
运行后打印的日志如下:
2015-07-14 13:30:42,623 INFO [main] (AnalyzerTest.java:55) - *******************StandardAnalyzer开始分词********************
2015-07-14 13:30:42,627 INFO [main] (AnalyzerTest.java:58) - gaoguangjin
2015-07-14 13:30:42,628 INFO [main] (AnalyzerTest.java:58) - handsome
2015-07-14 13:30:42,628 INFO [main] (AnalyzerTest.java:58) - boy
2015-07-14 13:30:42,630 INFO [main] (AnalyzerTest.java:58) - 刘
2015-07-14 13:30:42,630 INFO [main] (AnalyzerTest.java:58) - 德
2015-07-14 13:30:42,631 INFO [main] (AnalyzerTest.java:58) - 华
2015-07-14 13:30:42,631 INFO [main] (AnalyzerTest.java:58) - 没
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 有
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 高
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 广
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 金
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 帅
2015-07-14 13:30:42,632 INFO [main] (AnalyzerTest.java:58) - 气
2015-07-14 13:30:42,634 INFO [main] (AnalyzerTest.java:55) - *******************SimpleAnalyzer开始分词********************
2015-07-14 13:30:42,634 INFO [main] (AnalyzerTest.java:58) - gaoguangjin
2015-07-14 13:30:42,637 INFO [main] (AnalyzerTest.java:58) - is
2015-07-14 13:30:42,637 INFO [main] (AnalyzerTest.java:58) - handsome
2015-07-14 13:30:42,637 INFO [main] (AnalyzerTest.java:58) - boy
2015-07-14 13:30:42,637 INFO [main] (AnalyzerTest.java:58) - 刘德华没有高广金帅气
2015-07-14 13:30:42,643 INFO [main] (AnalyzerTest.java:55) - *******************CJKAnalyzer开始分词********************
2015-07-14 13:30:42,643 INFO [main] (AnalyzerTest.java:58) - gaoguangjin
2015-07-14 13:30:42,643 INFO [main] (AnalyzerTest.java:58) - handsome
2015-07-14 13:30:42,644 INFO [main] (AnalyzerTest.java:58) - boy
2015-07-14 13:30:42,644 INFO [main] (AnalyzerTest.java:58) - 刘德
2015-07-14 13:30:42,644 INFO [main] (AnalyzerTest.java:58) - 德华
2015-07-14 13:30:42,644 INFO [main] (AnalyzerTest.java:58) - 华没
2015-07-14 13:30:42,644 INFO [main] (AnalyzerTest.java:58) - 没有
2015-07-14 13:30:42,645 INFO [main] (AnalyzerTest.java:58) - 有高
2015-07-14 13:30:42,645 INFO [main] (AnalyzerTest.java:58) - 高广
2015-07-14 13:30:42,645 INFO [main] (AnalyzerTest.java:58) - 广金
2015-07-14 13:30:42,645 INFO [main] (AnalyzerTest.java:58) - 金帅
2015-07-14 13:30:42,645 INFO [main] (AnalyzerTest.java:58) - 帅气
useSmart = true
2015-07-14 13:30:42,871 INFO [main] (AnalyzerTest.java:55) - *******************IKAnalyzer开始分词********************
2015-07-14 13:30:42,884 INFO [main] (AnalyzerTest.java:58) - gaoguangjin
2015-07-14 13:30:42,884 INFO [main] (AnalyzerTest.java:58) - is
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - handsome
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - boy
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - **刘德华**
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - 没
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - 有
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - 高
2015-07-14 13:30:42,885 INFO [main] (AnalyzerTest.java:58) - 广
2015-07-14 13:30:42,886 INFO [main] (AnalyzerTest.java:58) - 金
2015-07-14 13:30:42,886 INFO [main] (AnalyzerTest.java:58) - 帅
2015-07-14 13:30:42,886 INFO [main] (AnalyzerTest.java:58) - 气