IKAnalyzer:
jar: IKAnalyzer2012FF_u1.jar
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public static void printOut(String keyword) {
    // No-arg constructor: useSmart defaults to false (finest-grained mode)
    Analyzer analyzer = new IKAnalyzer();
    StringReader reader = new StringReader(keyword);
    try {
        TokenStream ts = analyzer.tokenStream("", reader);
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        ts.reset();                       // must be called before incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
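A minimal driver (a sketch; assumes printOut lives in a runnable class):
public static void main(String[] args) {
    printOut("中华人民共和国");
}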
Output of new IKAnalyzer() (useSmart defaults to false):
中华人民共和国
中华人民
中华
华人
人民共和国
人民
共和国
共和
国
Output of new IKAnalyzer(true):
中华人民共和国
IKAnalyzer supports two segmentation modes: false = finest-grained segmentation, true = smart segmentation.
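The flag is passed to the constructor (comments reflect the outputs above):
Analyzer fine  = new IKAnalyzer();      // same as new IKAnalyzer(false): finest-grained
Analyzer smart = new IKAnalyzer(true);  // smart mode: "中华人民共和国" stays one token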
============================================
Custom analyzer (splits on a custom delimiter, here a comma):
import java.io.Reader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
/**
 * Regex-based analyzer (written against Lucene 4.10.x; on older 4.x
 * releases the LowerCaseFilter constructor also takes a Version argument)
 * @author bowen_bao
 */
public class PatternAnalyzer extends Analyzer {
    private Pattern pattern;
    private boolean toLowerCase;

    public PatternAnalyzer(String regex, boolean toLowerCase) {
        this.pattern = Pattern.compile(regex);
        this.toLowerCase = toLowerCase;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // group = -1: split on the pattern rather than matching it
        PatternTokenizer tokenizer = new PatternTokenizer(reader, pattern, -1);
        TokenStream result = toLowerCase ? new LowerCaseFilter(tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, result);
    }
}
new PatternAnalyzer(",", true);
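A usage sketch (the sample input string is made up; reuses the token loop from printOut, inside a method that throws IOException):
Analyzer analyzer = new PatternAnalyzer(",", true);
TokenStream ts = analyzer.tokenStream("", new StringReader("Apple,Banana,CHERRY"));
CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term.toString());  // apple, banana, cherry (lower-cased)
}
ts.end();
ts.close();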
======================================================
IKAnalyzer.cfg.xml is IKAnalyzer's configuration file; put it under resources.
ext_dict        — extension dictionary
ext_stopwords   — stopword dictionary
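A sketch of the file, following the standard IKAnalyzer template (the .dic file names are placeholders):
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- placeholder file names; multiple files are separated by ';' -->
    <entry key="ext_dict">ext.dic;</entry>
    <entry key="ext_stopwords">stopword.dic;</entry>
</properties>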
=================================================