Java课程设计-文档相似性检查系统-字符串中文分词类

3 篇文章 0 订阅
1 篇文章 0 订阅

突然说起中文分词,真是无从下手。这东西涉及中文,肯定需要引入其它类包吧。查了查资料,需要用到开源全文检索引擎工具包 Lucene,再配合基于 Lucene 的中文分词器 IKAnalyzer(IKAnalyzer 3.0 中文分词器)。

需要的JAR如下

lucene-analyzers-2.4.1.jar  下载

lucene-core-2.4.1.jar  下载

IKAnalyzer2.0.2OBF.jar 下载

import java.io.Reader; 
import java.io.StringReader; 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.StopFilter; 
import org.apache.lucene.analysis.Token; 
import org.apache.lucene.analysis.TokenFilter; 
import org.apache.lucene.analysis.TokenStream; 
import org.apache.lucene.analysis.cjk.CJKAnalyzer; 
import org.apache.lucene.analysis.cn.ChineseAnalyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.mira.lucene.analysis.MIK_CAnalyzer;


/**
 * Small demo of several Lucene analyzers applied to Chinese text, plus
 * {@link #transJe(String, String, String)}, which segments a string and
 * returns the terms as one comma-separated string.
 *
 * <p>All token streams are consumed through the base {@code TokenStream}
 * type, so the code does not depend on which concrete filter class an
 * analyzer happens to return, and every stream is closed after use.
 */
public class JeAnalyzer
{

	/** Sample sentence segmented by {@link #main(String[])}. */
	private static final String SAMPLE_TEXT = "我喜欢看电视视频,不喜欢看电影。";

	/**
	 * Prints the terms produced by {@code StandardAnalyzer} for the given text.
	 * The banner goes to {@code System.err}, one term per line to {@code System.out}.
	 * Any exception is reported with {@code printStackTrace()} and not rethrown.
	 *
	 * @param testString text to analyze; {@code null} is treated as empty text
	 */
	public static void testStandard(String testString)
	{
		try
		{
			printTokens(new StandardAnalyzer(), "=====standard analyzer====", testString);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Prints the terms produced by {@code CJKAnalyzer} for the given text.
	 * The banner goes to {@code System.err}, one term per line to {@code System.out}.
	 * Any exception is reported with {@code printStackTrace()} and not rethrown.
	 *
	 * @param testString text to analyze; {@code null} is treated as empty text
	 */
	public static void testCJK(String testString)
	{
		try
		{
			printTokens(new CJKAnalyzer(), "=====cjk analyzer====", testString);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Prints the terms produced by {@code ChineseAnalyzer} for the given text.
	 * The banner goes to {@code System.err}, one term per line to {@code System.out}.
	 * Any exception is reported with {@code printStackTrace()} and not rethrown.
	 *
	 * <p>The misspelt method name is kept so existing callers keep compiling.
	 *
	 * @param testString text to analyze; {@code null} is treated as empty text
	 */
	public static void testChiniese(String testString)
	{
		try
		{
			printTokens(new ChineseAnalyzer(), "=====chinese analyzer====", testString);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Segments {@code testString} with {@code MIK_CAnalyzer} and joins the terms.
	 *
	 * <p>Every term is followed by a comma, so a non-empty result ends with a
	 * trailing comma (for example {@code "a,b,"}); callers are expected to
	 * {@code split(",")} it. If analysis fails, the exception is printed and the
	 * terms collected so far are returned.
	 *
	 * @param testString text to segment; {@code null} is treated as empty text
	 * @param c1 unused by this method; retained so the signature stays compatible
	 * @param c2 unused by this method; retained so the signature stays compatible
	 * @return the terms, each followed by {@code ","}; the empty string when no
	 *         term was produced
	 */
	public static String transJe(String testString,String c1,String c2)
	{
		// StringBuilder avoids re-copying the whole result for every appended term.
		StringBuilder terms = new StringBuilder();
		try
		{
			TokenStream stream = openStream(new MIK_CAnalyzer(), testString);
			try
			{
				Token token;
				while ((token = stream.next()) != null)
				{
					terms.append(token.termText()).append(',');
				}
			}
			finally
			{
				stream.close();
			}
		}
		catch(Exception e)
		{
			// Best effort: report the failure and return what was collected so far.
			e.printStackTrace();
		}
		return terms.toString();
	}

	/**
	 * Prints the sample sentence, then each term {@link #transJe} finds in it,
	 * one per line.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)
	{
		System.out.println(SAMPLE_TEXT);
		// The two charset-looking arguments are ignored by transJe.
		String[] terms = transJe(SAMPLE_TEXT,"gb2312","utf-8").split(",");
		for (String term : terms)
		{
			System.out.println(term);
		}
	}

	/**
	 * Opens a token stream over {@code text} using an empty field name.
	 * {@code StringReader} rejects {@code null}, so a null text is analyzed as "".
	 */
	private static TokenStream openStream(Analyzer analyzer, String text)
	{
		Reader reader = new StringReader(text == null ? "" : text);
		return analyzer.tokenStream("", reader);
	}

	/**
	 * Writes {@code banner} to {@code System.err}, then every term the analyzer
	 * yields for {@code text} to {@code System.out}, and closes the stream.
	 */
	private static void printTokens(Analyzer analyzer, String banner, String text)
			throws java.io.IOException
	{
		TokenStream stream = openStream(analyzer, text);
		try
		{
			System.err.println(banner);
			Token token;
			while ((token = stream.next()) != null)
			{
				System.out.println(token.termText());
			}
		}
		finally
		{
			stream.close();
		}
	}

}


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值