C#中文分词

using System;

using System.IO;

using System.Text;

using System.Collections;

using System.Collections.Generic;

using System.Text.RegularExpressions;



/* 作者:卢楚风  个人网站:www.ccdodo.com

 * 本程序仅供学习参考使用,欢迎热衷于搜索引擎技术与中文分词技术的同仁一起讨论

 *  需要下载源码(含词典)可到http://download.csdn.net/source/519252下载

 * 转载请保留此信息,谢谢!

 * 2008-6-28

---------------------------------------------------------*/

namespace Ccdodo.WordSpliter

{

    /// <summary>
    /// Chinese word splitter. A keyword is first coarse-split by character
    /// class (blank / ASCII word / Chinese / symbol); every pure-Chinese run is
    /// then scanned for words that occur in the dictionary file.
    /// </summary>
    public class Spliter
    {
        #region Fields

        // Separator placed between coarse tokens by FormatStr and used by DoSplit to split them again.
        private static readonly string SplitChar = " ";

        // Application-state key under which the loaded dictionary is cached.
        private const string DictCacheKey = "dict";

        // Chinese tokens shorter than this are ignored; dictionary candidates are at least this long.
        private const int MinWordLength = 2;

        // Longest substring that is looked up in the dictionary.
        private const int MaxWordLength = 10;

        // A pure-Chinese token up to this length is also emitted as a whole.
        private const int MaxWholeTokenLength = 7;

        // NOTE(review): the regex literals in the published source were mangled by HTML
        // extraction ("\" became "/", and the closing `$"` became "___FCKpd___0quot;").
        // The patterns below are the reconstructed originals - confirm against the
        // downloadable source if available.

        // ASCII characters that are NOT allowed inside a word (anything but letters, digits and @ . % # : / & _ -).
        private static readonly Regex AsciiSymbolRegex = new Regex(@"[^0-9a-zA-Z@\.%#:\/&_-]", RegexOptions.Compiled);

        // Word characters that are nevertheless classified as symbols when they start a run.
        private static readonly Regex LeadingSymbolRegex = new Regex("[@%#:]", RegexOptions.Compiled);

        // A string made only of CJK ideographs in the U+4E00..U+9FA5 range.
        private static readonly Regex ChineseRegex = new Regex(@"^[\u4e00-\u9fa5]+$", RegexOptions.Compiled);

        // A token made of letters, digits, dots and Chinese characters, but not a single dot.
        private static readonly Regex TokenRegex = new Regex(@"^(?!^\.$)([a-zA-Z0-9\.\u4e00-\u9fa5]+)$", RegexOptions.Compiled);

        // A token consisting only of dots.
        private static readonly Regex AllDotsRegex = new Regex(@"^(\.*)$", RegexOptions.Compiled);

        // Class of the previously emitted character while coarse-splitting.
        private enum CharKind
        {
            Blank = 0,
            English = 1,
            Chinese = 2,
            Symbol = 3
        }

        #endregion

        #region Cache helpers

        /// <summary>
        /// Stores a value in the ASP.NET application state.
        /// Does nothing when there is no current HTTP context.
        /// </summary>
        /// <param name="key">Cache key.</param>
        /// <param name="val">Value to cache; <c>null</c> is stored as a single space.</param>
        private static void SetCache(string key, object val)
        {
            System.Web.HttpContext context = System.Web.HttpContext.Current;
            if (context == null)
            {
                return;
            }

            if (val == null)
            {
                val = " ";
            }

            context.Application.Lock();
            try
            {
                context.Application.Set(key, val);
            }
            finally
            {
                // Always release the application lock, even if Set throws.
                context.Application.UnLock();
            }
        }

        /// <summary>
        /// Reads a value from the ASP.NET application state.
        /// </summary>
        /// <param name="key">Cache key.</param>
        /// <returns>The cached value, or <c>null</c> if it is absent or there is no current HTTP context.</returns>
        private static object GetCache(string key)
        {
            System.Web.HttpContext context = System.Web.HttpContext.Current;
            return context == null ? null : context.Application.Get(key);
        }

        #endregion

        #region Dictionary loading

        /// <summary>
        /// Loads the dictionary file (one word per line, UTF-8) and caches it in
        /// application state. Subsequent calls return the cached instance.
        /// </summary>
        /// <param name="FilePath">Virtual path of the dictionary file; mapped with <c>Server.MapPath</c>.</param>
        /// <returns>
        /// The dictionary keyed by word. If the file is missing, cannot be read, or there
        /// is no current HTTP context, a (possibly partial) list is returned and NOT cached,
        /// so that a later call retries the load.
        /// </returns>
        private static SortedList ReadTxtFile(string FilePath)
        {
            SortedList cached = GetCache(DictCacheKey) as SortedList;
            if (cached != null)
            {
                return cached;
            }

            // Ordinal comparison: dictionary words are matched exactly, independent of the thread culture.
            SortedList words = new SortedList(StringComparer.Ordinal);

            System.Web.HttpContext context = System.Web.HttpContext.Current;
            if (context == null)
            {
                return words;
            }

            try
            {
                string physicalPath = context.Server.MapPath(FilePath);
                if (!File.Exists(physicalPath))
                {
                    System.Diagnostics.Trace.TraceWarning("Spliter: dictionary file '" + physicalPath + "' does not exist.");
                    return words;
                }

                using (StreamReader reader = new StreamReader(physicalPath, Encoding.UTF8))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // Indexer instead of Add: a duplicated line must not abort the whole load.
                        words[line] = line;
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: splitting still works without dictionary words.
                System.Diagnostics.Trace.TraceError("Spliter: failed to load dictionary '" + FilePath + "': " + ex);
                return words;
            }

            SetCache(DictCacheKey, words);
            return words;
        }

        /// <summary>
        /// The default dictionary, loaded from <c>~/dict/default.dic</c>.
        /// </summary>
        private static SortedList LoadDict
        {
            get { return ReadTxtFile("~/dict/default.dic"); }
        }

        #endregion

        #region Coarse split

        /// <summary>
        /// Coarse-splits the input: walks it character by character and inserts
        /// <see cref="SplitChar"/> whenever the character class changes
        /// (ASCII word / Chinese / symbol). Whitespace and control characters are dropped.
        /// </summary>
        /// <param name="val">Text to format.</param>
        /// <returns>
        /// The text with separators inserted; an empty string for <c>null</c> or empty input.
        /// Consecutive separators may occur; the resulting empty tokens are filtered out later.
        /// </returns>
        private static string FormatStr(string val)
        {
            if (string.IsNullOrEmpty(val))
            {
                return "";
            }

            StringBuilder result = new StringBuilder(val.Length * 2);
            CharKind previous = CharKind.Blank;

            foreach (char c in val)
            {
                string s = c.ToString();

                if (c < 0x81)
                {
                    if (c < 33)
                    {
                        // Whitespace / control character: acts as a separator, except for
                        // line breaks, which are removed without splitting the current run.
                        if (previous != CharKind.Blank && c != '\n' && c != '\r')
                        {
                            result.Append(SplitChar);
                            previous = CharKind.Blank;
                        }
                        continue;
                    }

                    if (AsciiSymbolRegex.IsMatch(s))
                    {
                        // ASCII symbol: always a token of its own.
                        if (previous != CharKind.Blank)
                        {
                            result.Append(SplitChar);
                        }
                        result.Append(s);
                        previous = CharKind.Symbol;
                    }
                    else if (previous == CharKind.Chinese || previous == CharKind.Symbol)
                    {
                        // A word character after Chinese text or a symbol starts a new token.
                        result.Append(SplitChar).Append(s);
                        previous = CharKind.English;
                    }
                    else
                    {
                        // Continue (or start) an ASCII word.
                        result.Append(s);
                        previous = LeadingSymbolRegex.IsMatch(s) ? CharKind.Symbol : CharKind.English;
                    }
                }
                else
                {
                    // Non-ASCII after an ASCII word or a symbol: close the previous token.
                    if (previous != CharKind.Blank && previous != CharKind.Chinese)
                    {
                        result.Append(SplitChar);
                    }

                    if (!ChineseRegex.IsMatch(s))
                    {
                        // Non-ASCII character outside the CJK range (e.g. Chinese punctuation).
                        if (previous != CharKind.Blank)
                        {
                            result.Append(SplitChar);
                        }
                        result.Append(s);
                        previous = CharKind.Symbol;
                    }
                    else
                    {
                        result.Append(s);
                        previous = CharKind.Chinese;
                    }
                }
            }

            return result.ToString();
        }

        #endregion

        #region Word extraction

        /// <summary>
        /// Extracts keywords from the coarse tokens.
        /// </summary>
        /// <param name="key">Coarse tokens produced by <see cref="FormatStr"/>.</param>
        /// <returns>
        /// For every pure-Chinese token of at least 2 characters: the token itself (when at most
        /// 7 characters long) followed by every substring of 2 to 10 characters found in the
        /// dictionary, scanned left to right, longest first. Any other token made of letters,
        /// digits, dots and Chinese characters is returned unchanged, unless it consists only
        /// of dots. Tokens containing other characters are dropped. Duplicates are not removed here.
        /// </returns>
        private static List<string> StringSpliter(string[] key)
        {
            List<string> words = new List<string>();
            SortedList dict = LoadDict;

            foreach (string token in key)
            {
                if (!TokenRegex.IsMatch(token))
                {
                    continue;
                }

                if (!ChineseRegex.IsMatch(token))
                {
                    // English, numbers or a mix: keep as-is unless it is nothing but dots.
                    if (!AllDotsRegex.IsMatch(token))
                    {
                        words.Add(token);
                    }
                    continue;
                }

                int tokenLength = token.Length;
                if (tokenLength < MinWordLength)
                {
                    continue;
                }

                if (tokenLength <= MaxWholeTokenLength)
                {
                    words.Add(token);
                }

                // Try every start position, longest candidate first.
                for (int start = 0; start < tokenLength; start++)
                {
                    int longest = Math.Min(tokenLength - start, MaxWordLength);
                    for (int length = longest; length >= MinWordLength; length--)
                    {
                        string candidate = token.Substring(start, length);
                        if (dict.Contains(candidate))
                        {
                            words.Add(candidate);
                        }
                    }
                }
            }

            return words;
        }

        #endregion

        #region Public API

        /// <summary>
        /// Splits a search keyword into words.
        /// </summary>
        /// <param name="key">The keyword to split.</param>
        /// <returns>
        /// The original keyword as the first element, followed by the extracted words.
        /// Duplicates are removed, keeping the first occurrence and the original order.
        /// Returns an empty array when <paramref name="key"/> is <c>null</c>.
        /// </returns>
        public static string[] DoSplit(string key)
        {
            if (key == null)
            {
                return new string[0];
            }

            List<string> words = StringSpliter(FormatStr(key).Split(SplitChar.ToCharArray()));

            // Order-preserving de-duplication; the raw keyword always comes first.
            List<string> unique = new List<string>(words.Count + 1);
            Dictionary<string, bool> seen = new Dictionary<string, bool>(StringComparer.Ordinal);

            seen[key] = true;
            unique.Add(key);

            foreach (string word in words)
            {
                if (!seen.ContainsKey(word))
                {
                    seen[word] = true;
                    unique.Add(word);
                }
            }

            return unique.ToArray();
        }

        #endregion
    }

    //

}
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值