简单的中文分词程序(C#源码)

出于兴趣，在参考了网上的一些资料后，我自己写了一个简单的中文分词程序。
程序加入了对词语权重的支持，可以解决一部分歧义词的切分问题。


using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.IO;
using System.Windows.Forms;

using System.Text.RegularExpressions;

using System.Runtime.Serialization.Formatters.Binary;

namespace EasySeg
{

    public class EasySeger
    {
        //dict path
        private static string DictPath = Application.StartupPath + "//data//dict2.txt";
  
        //store dict
        private static Hashtable chartable = new Hashtable();

        //weather the dict has been loaded
        private static bool DictLoaded = false;
        //the timespan of loading dict 
        public static double DictLoad_Span = 0;

        /// <summary>
        /// RegularExpressions's pattern
        /// </summary>
        private string strChinese = "[/u4e00-/u9fa5]";
        private string strNumber = "[0-9]";
        private string strEnglish = "[a-zA-Z]";
        private string strSymbol = "[//.]";

       
        //the timespan of Seging text 
        public double TextSeg_Span = 0;
              

        /// <summary>
        /// GetCharType
        /// </summary>
        /// <param name="Char"></param>
        /// <returns>
        /// 0: Chinese,1:English,2:Number,3:Symbol
        ///</returns>
        private int GetCharType(string Char)
        {
            if (new Regex(strChinese).IsMatch(Char))
                return 0;
            if (new Regex(strEnglish).IsMatch(Char))
                return 1;
            if (new Regex(strNumber).IsMatch(Char))
                return 2;
            if (new Regex(strSymbol).IsMatch(Char))
                return 3;
           
            return -1;

        }
                      

        //load dict
        private void LoadDict()
        {
            if (DictLoaded) return;
                       
            //if (File.Exists(DictHTPath))
            //{
            //    //load DictHT if exists
            //    chartable = Deserializer(DictHTPath);
            //}
            //else {
            //    //build DictHT
            //    BuidDictTree();
               
            //    //save DictHT
            //    Serializer(chartable, DictHTPath);
            //}
            BuidDictTree();         

            DictLoaded = true;
            return;
          
        }

        //buid a tree
        private void BuidDictTree()
        {
            //record start time
            long dt_s = DateTime.Now.Ticks;

            string char_s;
            StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.UTF8);
                      
            while (!reader.EndOfStream)
            {
                string word = reader.ReadLine();

                if (word == null || word.Trim() == "")
                {
                   
                    continue;
                }
                               
                word = word.Trim();

                //get word's weight
                double weight = 0;
                if (word.IndexOf(" ") != -1)
                {
                    weight = double.Parse(word.Split(' ')[1].ToString());
                    word = word.Split(' ')[0];
                }

                Hashtable t_chartable = chartable;
                //add word
                for (int i = 0; i < word.Length; i++)
                {
                    char_s = word.Substring(i, 1);
                    if (!t_chartable.Contains(char_s))
                    {
                        t_chartable.Add(char_s, new Hashtable());
                    }
                   
                    t_chartable = (Hashtable)t_chartable[char_s];

                    //set word's weight
                    if (i == word.Length - 1)
                    {
                        if(t_chartable.Contains("WT"))
                            t_chartable["WT"] = weight;
                        else
                            t_chartable.Add("WT", weight);

                    }
                                     
                }
                
              
            }
            reader.Close();

            //record time span
            DictLoad_Span = (double)(DateTime.Now.Ticks - dt_s) / 10000;

        }

        //parse a text section
        public string SegText(string text)
        {
            text = text.Trim();

            //load Dict only once
            LoadDict();

            //record start time
            long dt_s = DateTime.Now.Ticks;

            string ReText = "";
            Hashtable t_chartable = chartable;

            bool flag = false;          
            for (int i = 0; i < text.Length; i++)
            {
                //get a word
                double weight;
                string word;
                FindWord(text.Substring(i),out weight,out word);

                if (flag)
                    ReText += " ";
                else
                    flag = true;

                ReText += word ;

                //add word length,subtract autodesc
                i += word.Length - 1;//.Replace(" ","")
                               
                //end
               
            }
           
            //record time span
            TextSeg_Span = (double)(DateTime.Now.Ticks - dt_s) / 10000;

            return ReText;

 

        }
       
        private void FindWord(string text,out double Weight,out string Word)
        {
            Hashtable t_chartable = chartable;
            Hashtable last_chartable = chartable;

            string ReWord = "";
            double weight = -1;

            string char_s;

            for (int i = 0; i < text.Length; i++)
            {
                //next charactor
                char_s = text.Substring(i, 1);
               
                //get a word
                if (!t_chartable.Contains(char_s))
                {
                    if (ReWord == "")
                    {
                        #region NoMathed Single Charactor
                        int j = i + 1;
                        switch (GetCharType(char_s))
                        {
                            case 0://single Chinese word
                                ReWord += char_s;
                                break;
                            case 1://single English word
                                j = i + 1;
                                while (j < text.Length)
                                {
                                    int _t = GetCharType(text.Substring(j, 1));
                                    if (_t != 1 && _t != 3)
                                        break;

                                    j++;
                                }
                                ReWord += text.Substring(i, j - i);

                                break;
                            case 2://single Number word
                                j = i + 1;
                                while (j < text.Length)
                                {
                                    int _t = GetCharType(text.Substring(j, 1));
                                    if (_t != 2 && _t != 3)
                                        break;

                                    j++;
                                }
                                ReWord += text.Substring(i, j - i);

                                break;

                            default:
                                ReWord += char_s;//single other charactor word
                                break;

                        }
                        #endregion

                    }
                    else
                    {
                                              
                        //if is a word
                        if (last_chartable.Contains("WT"))
                        {
                            weight = double.Parse(last_chartable["WT"].ToString());
                            //break;

                            //check if has another mathed word and return that's weitht
                            //get a word
                            //top level if is a single chractor word
                            if (i < 2)
                                break;

                            double weight2;
                            string word2;
                            FindWord(text.Substring(i - 1), out weight2, out word2);
                            //weather it is multi meanings word
                            if (weight2 <= weight || word2.Trim().Length < 2)
                                break;
                           
                        }
                       
                        //do these single chars
                        if (ReWord.Length < 2)
                            break;
                        ReWord = ReWord.Substring(0,ReWord.Length - 1);

                        #region insert space
                        //string _reword = "";
                        //bool flag = false;
                        -1 back a charactor
                        //for (int k = 0; k < ReWord.Length -1; k++)
                        //{
                        //    if(flag)
                        //        _reword += ' ';
                        //    else
                        //        flag = true;

                        //    _reword += ReWord[k];
                        //}
                        //ReWord = _reword;
                        #endregion


                    }            
                    break;
                }
                              


                ReWord += char_s;

                //end
                if (i == text.Length - 1)
                {
                   break;
                }

               
                t_chartable = (Hashtable)t_chartable[char_s];
                last_chartable = t_chartable;

            }

            //return word and weight
            Word = ReWord;
            Weight = weight;

            return;
                       
           
        }
           


    }
}

 

点击进入下载页面:
http://download.csdn.net/source/174318

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页