Lucene自带了几个分词器WhitespaceAnalyzer,SimpleAnalyzer,StopAnalyzer,StandardAnalyzer,ChineseAnalyzer,CJKAnalyzer等。前面三个只适用于英文分词,StandardAnalyzer可以最简单地实现中文分词,即二分法,每个字都作为一个词,这样分出来虽然全面,但有很多缺点,比如,索引文件过大,检索时速度慢等。ChineseAnalyzer是按字分的,与StandardAnalyzer对中文的分词没有大的区别。 CJKAnalyzer是按两字切分的, 比较武断,并且会产生垃圾Token,影响索引大小。以上分词器过于简单,无法满足现实的需求,所以我们需要实现自己的分词算法。
由于MMAnalyzer分词器还没有真正地实现,所以这里先放着,明天再研究。
其中代码如下:
package com.mylucene;
import java.io.IOException;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
/**
 * Demo that runs several Lucene analyzers over the same English and Chinese
 * samples and prints each emitted token as [startOffset,endOffset,term],
 * so the tokenization strategies can be compared side by side.
 */
public class SimpleAnalyzerTest {

    /**
     * Tokenizes {@code text} with the given analyzer and prints every token
     * with its character offsets.
     *
     * @param analyzer the Lucene analyzer to exercise
     * @param text     the input text to tokenize
     */
    public static void analyze(Analyzer analyzer, String text) {
        System.out.println("所选择的分词器类:" + analyzer.getClass());
        // TokenStream is Closeable in Lucene 4.x; try-with-resources guarantees
        // close() runs even when incrementToken() throws (the original leaked
        // the stream on any exception).
        try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text))) {
            // Attribute instances are per-stream singletons — fetch them once
            // up front instead of on every loop iteration.
            CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println("[" + offsetAttribute.startOffset() + ","
                        + offsetAttribute.endOffset() + "," + termAttribute.toString() + "]");
            }
            // TokenStream contract: end() must be called after the final
            // incrementToken() and before close().
            tokenStream.end();
        } catch (IOException e) {
            // Demo code: narrow to the checked exception the API declares.
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        String enText = "My name is liguofeng!";
        String chText = "我是中国人,我爱中国!";
        // Same analyzers, same order as before — collapsed into a loop so each
        // one is also closed after use (Analyzer is Closeable).
        Analyzer[] analyzers = {
                new SimpleAnalyzer(Version.LUCENE_47),
                new CJKAnalyzer(Version.LUCENE_47),
                new ChineseAnalyzer(),
                new StopAnalyzer(Version.LUCENE_47),
                new StandardAnalyzer(Version.LUCENE_47),
                new WhitespaceAnalyzer(Version.LUCENE_47)
        };
        for (Analyzer analyzer : analyzers) {
            analyze(analyzer, enText);
            analyze(analyzer, chText);
            analyzer.close();
        }
        // MMAnalyzer is not yet wired up — kept commented out pending study.
        /*MMAnalyzer an= new MMAnalyzer();
        try {
            System.out.println(an.segment(chText, "|"));
        } catch (IOException e) {
            e.printStackTrace();
        }*/
    }
}