突然说起中文分词,真是无从下手。这东西涉及中文,肯定需要引入其它类包。查了查资料,需要用到开源全文检索引擎工具包 Lucene,再配合基于 Lucene 的中文分词器 IKAnalyzer(原文写的是 3.0 版,但下面列出并实际使用的 JAR 是 2.0.20 版)。
需要的JAR如下
lucene-analyzers-2.4.1.jar 下载
lucene-core-2.4.1.jar 下载
IKAnalyzer2.0.20BF.jar 下载
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.mira.lucene.analysis.MIK_CAnalyzer;
/**
 * Small demo that runs a Chinese sample sentence through several Lucene
 * analyzers (Standard, CJK, Chinese) and the IKAnalyzer {@code MIK_CAnalyzer},
 * printing or returning the tokens each one produces.
 */
public class JeAnalyzer
{
    /** Sample sentence segmented by {@link #main(String[])}. */
    private static final String testString1 = "我喜欢看电视视频,不喜欢看电影。";

    /**
     * Prints the tokens produced by {@code StandardAnalyzer} for the given text.
     * Any exception is reported on stderr via {@code printStackTrace()}.
     *
     * @param testString text to tokenize
     */
    public static void testStandard(String testString)
    {
        try
        {
            printTokens(new StandardAnalyzer(), "=====standard analyzer====", testString);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Prints the tokens produced by {@code CJKAnalyzer} for the given text.
     * Any exception is reported on stderr via {@code printStackTrace()}.
     *
     * @param testString text to tokenize
     */
    public static void testCJK(String testString)
    {
        try
        {
            printTokens(new CJKAnalyzer(), "=====cjk analyzer====", testString);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Prints the tokens produced by {@code ChineseAnalyzer} for the given text.
     * Any exception is reported on stderr via {@code printStackTrace()}.
     * (The method name keeps its original spelling so existing callers still link.)
     *
     * @param testString text to tokenize
     */
    public static void testChiniese(String testString)
    {
        try
        {
            printTokens(new ChineseAnalyzer(), "=====chinese analyzer====", testString);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Segments the text with {@code MIK_CAnalyzer} and returns the tokens joined
     * by commas, with a trailing comma after every token (e.g. {@code "a,b,"}).
     * If tokenizing fails, the stack trace is printed and whatever was collected
     * up to that point is returned (possibly the empty string).
     *
     * @param testString text to segment
     * @param c1 not used by this method; kept for interface compatibility
     * @param c2 not used by this method; kept for interface compatibility
     * @return the comma-terminated tokens, never {@code null}
     */
    public static String transJe(String testString, String c1, String c2)
    {
        StringBuilder joined = new StringBuilder();
        TokenStream ts = null;
        try
        {
            Analyzer analyzer = new MIK_CAnalyzer();
            Reader r = new StringReader(testString);
            ts = analyzer.tokenStream("", r);
            Token t;
            while ((t = ts.next()) != null)
            {
                joined.append(t.termText()).append(',');
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            closeQuietly(ts);
        }
        return joined.toString();
    }

    /**
     * Prints the sample sentence, then each token that
     * {@link #transJe(String, String, String)} produced for it, one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        try
        {
            String testString = testString1;
            System.out.println(testString);
            String[] tokens = transJe(testString, "gb2312", "utf-8").split(",");
            for (int i = 0; i < tokens.length; i++)
            {
                System.out.println(tokens[i]);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Writes the banner to stderr, then every token of the text to stdout.
     * The stream is handled through the {@code TokenStream} type that
     * {@code Analyzer.tokenStream} declares, so no downcast to a concrete
     * filter class is needed, and it is closed when done.
     */
    private static void printTokens(Analyzer analyzer, String banner, String text)
            throws java.io.IOException
    {
        Reader r = new StringReader(text);
        TokenStream ts = analyzer.tokenStream("", r);
        try
        {
            System.err.println(banner);
            Token t;
            while ((t = ts.next()) != null)
            {
                System.out.println(t.termText());
            }
        }
        finally
        {
            ts.close();
        }
    }

    /** Closes the stream if it was opened; a failure to close is only reported. */
    private static void closeQuietly(TokenStream ts)
    {
        if (ts == null)
        {
            return;
        }
        try
        {
            ts.close();
        }
        catch (java.io.IOException e)
        {
            e.printStackTrace();
        }
    }
}