lucene的中文分词器到现在还没有好的解决办法。下边介绍了两个lucene自己提供的分词器和一个javaeye上的网友实现的分词器。关于各个分词器的不同见代码中的print信息。直接运行得到console的输出结果更容易对比不同。 package analyzer; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.cn.ChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.mira.lucene.analysis.IK_CAnalyzer; import org.mira.lucene.analysis.MIK_CAnalyzer; import com.sohospace.lucene.analysis.xanalyzer.XAnalyzer; import com.sohospace.lucene.analysis.xanalyzer.XFactory; import com.sohospace.lucene.analysis.xanalyzer.XTokenizer; // 中文分词使用了Paoding的分词技术,特表示感谢 public class TestCJKAnalyzer { private static String testString1 = "中华人民共和国在1949年建立,从此开始了新中国的伟大篇章"; private static String testString2 = "比尔盖茨从事餐饮业和服务业方面的工作"; public static void testStandard(String testString) throws Exception{ Analyzer analyzer = new StandardAnalyzer(); Reader r = new StringReader(testString); StopFilter sf = (StopFilter) analyzer.tokenStream("", r); System.err.println("=====standard analyzer===="); System.err.println("分析方法:默认没有词只有字"); Token t; while ((t = sf.next()) != null) { System.out.println(t.termText()); } } public static void testCJK(String testString) throws Exception{ Analyzer analyzer = new CJKAnalyzer(); Reader r = new StringReader(testString); StopFilter sf = (StopFilter) analyzer.tokenStream("", r); System.err.println("=====cjk analyzer===="); System.err.println("分析方法:交叉双字分割"); Token t; while ((t = sf.next()) != null) { System.out.println(t.termText()); } } public static void testChiniese(String testString) throws Exception{ Analyzer analyzer = new ChineseAnalyzer(); Reader r = new StringReader(testString); TokenFilter tf = (TokenFilter) analyzer.tokenStream("", r); System.err.println("=====chinese analyzer===="); System.err.println("分析方法:基本等同StandardAnalyzer"); Token t; while ((t = tf.next()) != null) { System.out.println(t.termText()); } } public static void testPaoding(String testString) throws Exception{ XAnalyzer analyzer = XFactory.getQueryAnalyzer(); Reader r = new StringReader(testString); XTokenizer ts = (XTokenizer) analyzer.tokenStream("", r); System.err.println("=====paoding analyzer===="); System.err.println("分析方法:字典分词,去掉停止词。在字典不能匹配的情况下使用CJKAnalyzer的分割发。"); Token t; while ((t = ts.next()) != null) { System.out.println(t.termText()); } } public static void testJe(String testString) throws Exception{// Analyzer analyzer = new MIK_CAnalyzer(); Analyzer analyzer = new IK_CAnalyzer(); Reader r = new StringReader(testString); TokenStream ts = (TokenStream)analyzer.tokenStream("", r); System.err.println("=====je analyzer===="); System.err.println("分析方法:字典分词,正反双向搜索,具体不明"); Token t; while ((t = ts.next()) != null) { System.out.println(t.termText()); } } public static void main(String[] args) throws Exception{// String testString = testString1; String testString = testString1; System.out.println(testString); testStandard(testString); testCJK(testString); testPaoding(testString); // testChiniese(testString);// testJe(testString); }}