简单的中文信息处理C#实现代码片段

最新推荐文章于 2024-10-26 11:13:04 发布

Zoohua

最新推荐文章于 2024-10-26 11:13:04 发布

阅读量834

点赞数

分类专栏：中文信息处理文章标签： c# string dictionary class path 算法

本文链接：https://blog.csdn.net/zoohua/article/details/4430270

版权

中文信息处理专栏收录该内容

4 篇文章

订阅专栏

首先定义我们的词典类，最长词的长度和词出现的频率由此类处理：

namespace CNWordSegment
{
    /// <summary>
    /// Word-frequency dictionary: stores each word with an occurrence count and
    /// tracks the length of the longest word and the highest count seen so far.
    /// </summary>
    [Serializable]
    public class Dictionary
    {
        // Field names are kept as-is because the type is [Serializable];
        // renaming them could break previously serialized instances.
        private int maxWordLength = 0;
        private int maxWordFreq = 0;
        private Dictionary<string, int> dict = new Dictionary<string, int>();

        public Dictionary()
        {
        }

        /// <summary>
        /// Adds one occurrence of <paramref name="word"/>; a word not seen before starts at 1.
        /// </summary>
        /// <param name="word">The word to record.</param>
        public void AddWord(string word)
        {
            AddWord(word, 1);
        }

        /// <summary>
        /// Adds <paramref name="word"/> to the dictionary. A word that is already present
        /// has its count incremented by one; a new word is stored with
        /// <paramref name="defaultFreq"/> as its initial count.
        /// </summary>
        /// <param name="word">The word to record.</param>
        /// <param name="defaultFreq">Initial count used only when the word is new.</param>
        public void AddWord(string word, int defaultFreq)
        {
            if (word.Length > maxWordLength)
            {
                maxWordLength = word.Length;
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            int freq;
            if (dict.TryGetValue(word, out freq))
            {
                freq = freq + 1;
            }
            else
            {
                freq = defaultFreq;
            }
            dict[word] = freq;

            if (freq > maxWordFreq)
            {
                maxWordFreq = freq;
            }
        }

        /// <summary>
        /// Returns the stored count for <paramref name="word"/>, or 0 when the word is absent.
        /// </summary>
        public int GetFrequency(string word)
        {
            int freq;
            // TryGetValue leaves freq at default(int) == 0 on a miss.
            dict.TryGetValue(word, out freq);
            return freq;
        }

        /// <summary>
        /// Loads a dictionary file, reading it as UTF-8 and adding every line as one word.
        /// </summary>
        /// <param name="path">Path of the dictionary file.</param>
        public void Load(string path)
        {
            // 'using' guarantees the file handle is released even when reading
            // or AddWord throws part-way through the file.
            using (StreamReader reader = new StreamReader(path, System.Text.Encoding.UTF8))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    AddWord(line);
                }
            }
        }

        /// <summary>
        /// Returns true when <paramref name="word"/> is present in the dictionary.
        /// </summary>
        public bool ContainsWord(string word)
        {
            return dict.ContainsKey(word);
        }

        /// <summary>
        /// Returns the length, in characters, of the longest word added so far (0 if none).
        /// </summary>
        public int GetMaxWordLength()
        {
            return maxWordLength;
        }

        /// <summary>
        /// Returns the highest count reached by any word so far (0 if none).
        /// </summary>
        public int GetMaxWordFreq()
        {
            return maxWordFreq;
        }
    }
}

接下来给出我们的最大匹配法：
namespace CNWordSegment
{
    /// <summary>
    /// Forward maximum matching (FMM) segmentation: starting at the left of the
    /// sentence, repeatedly takes the longest prefix found in the dictionary and
    /// falls back to a single character when nothing matches.
    /// </summary>
    /// <remarks>
    /// SegmentStrategy is the segmentation-strategy base class; deriving from it
    /// lets other segmentation algorithms be plugged in alongside this one.
    /// </remarks>
    public class FMMSegment : SegmentStrategy
    {
        /// <summary>
        /// Segments <paramref name="sentence"/> into words using <paramref name="dict"/>.
        /// </summary>
        /// <param name="sentence">Text to segment.</param>
        /// <param name="dict">Dictionary supplying the known words and the longest word length.</param>
        /// <returns>The words in their original order; their concatenation equals the input.</returns>
        public override List<string> Segment(string sentence, Dictionary dict)
        {
            // An empty dictionary reports a max word length of 0. Without the clamp the
            // window would shrink from 0 to -1 and Substring would throw (or, if "" were
            // a dictionary entry, the loop would never advance).
            int maxLength = dict.GetMaxWordLength();
            if (maxLength < 1)
            {
                maxLength = 1;
            }

            int pos = 0;
            int targetLength = maxLength;
            int restLength = sentence.Length;

            List<string> seged = new List<string>();

            while (restLength > 0)
            {
                // Never look past the end of the sentence.
                if (targetLength > restLength)
                {
                    targetLength = restLength;
                }

                string word = sentence.Substring(pos, targetLength);
                if (dict.ContainsWord(word) || targetLength == 1)
                {
                    // Match found, or a single unknown character: emit it and
                    // restart with the full window after it.
                    seged.Add(word);
                    pos += targetLength;
                    targetLength = maxLength;
                    restLength = sentence.Length - pos;
                }
                else
                {
                    // No match at this length; try a prefix one character shorter.
                    targetLength--;
                }
            }

            return seged;
        }
    }
}
    完成以上两步后我们就要实现分词过程了。下面给出函数实现：
/// <summary>
/// Splits <paramref name="stext"/> into tokens. The text is scanned character by
/// character; whenever the character type reported by Utility.GetCharType changes
/// (or the last character is reached) the pending run is emitted according to the
/// type of the character just before the boundary: digit and letter runs are split
/// on spaces, whitespace and punctuation runs are added as-is, CNWord runs are
/// passed to segmentStrategy.Segment, and Other runs are split into single
/// characters. A digit run followed by '年', '月' or '日' is emitted as a date token.
/// </summary>
/// <param name="stext">Text to segment.</param>
/// <returns>The list of tokens in order of appearance.</returns>
public List<string> Segment(string stext)
{
List<string> wordList = new List<string>();
wordList.Capacity = stext.Length;

// Index of the first character of the run that has not been emitted yet.
int start = 0;

char currWord, prevWord;
// Walk the text one character at a time.
// NOTE(review): for a one-character input the i > 0 branch never runs, so the
// character is only emitted if it is punctuation (see the end of the loop body).
for (int i = 0; i < stext.Length; i++)
{
currWord = stext[i];

#region ==分词过程==
    if (i > 0)
    {
      prevWord = stext[i - 1];

      // A run ends when the current character's type differs from the previous
      // character's type, or when the last character of the text is reached.
      if (Utility.GetCharType(currWord) != Utility.GetCharType(prevWord) || i == stext.Length - 1)
      {
        // The pending run [start, i), excluding the current character.
        string words = stext.Substring(start, i - start);

        // Last character: unless it is punctuation, include it in the run.
        // NOTE(review): this also happens when its type differs from the run's type.
        if (i == stext.Length - 1)
        {
          if (Utility.GetCharType(currWord) != CharType.Punctuation)
          {
            words = stext.Substring(start, i - start + 1);
          }
        }

        // The run is dispatched on the type of the character before the boundary.
        CharType ct = Utility.GetCharType(prevWord);
        switch (ct)
        {
          case CharType.Digital: // digits: split the run on spaces

#region ==日期识别==
            // Date recognition: a digit run immediately followed by 年 / 月 / 日.
            if (currWord == '年' || currWord == '月' || currWord == '日')
            {
              string tempDateWord = words + currWord.ToString();
              // At the last index, words already contains the date character.
              if (i == stext.Length - 1)
                tempDateWord = words;

              // Merge date parts: append a month to a preceding token that contains
              // 年, or a day to a preceding token that contains 月 (IndexOf > 0
              // requires the marker not to be the token's first character).
              // NOTE(review): every branch does i++ in addition to the loop's own
              // increment, so the character at i + 1 is never visited as currWord;
              // when it is the last character it appears never to be emitted — confirm.
              if (wordList.Count > 0 && wordList[wordList.Count - 1].IndexOf("年") > 0 && currWord == '月')
              {
                wordList[wordList.Count - 1] += tempDateWord;

                start = i + 1;
                i++;
              }
              else if (wordList.Count > 0 && wordList[wordList.Count - 1].IndexOf("月") > 0 && currWord == '日')
              {
                wordList[wordList.Count - 1] += tempDateWord;
                start = i + 1;
                i++;
              }
              else
              {
                wordList.Add(tempDateWord);
                start = i + 1;
                i++;
              }
              //wordList.Add(words + currWord.ToString());
              //start = i + 1 ;
              break;
            }
#endregion

            foreach (string s in words.Split(new char[] { ' ' }))
            {
              wordList.Add(s);
            }
            start = i;
            break;

          case CharType.Letter: // letters: split the run on spaces
            foreach (string s in words.Split(new char[] { ' ' }))
            {
              wordList.Add(s);
            }
            start = i;
            break;

          case CharType.WhiteSpace: // whitespace: no further processing, added as one token
            wordList.Add(words);
            start = i;
            break;

          case CharType.Punctuation: // punctuation: no further processing, added as one token
            wordList.Add(words);
            start = i;
            break;

          case CharType.CNWord: // Chinese characters: delegate to the segmentation strategy
            foreach (string s in segmentStrategy.Segment(words))
            {
              wordList.Add(s);
            }
            start = i;
            break;

          case CharType.Other:      // other/unknown characters: currently split into single characters
            for (int j = 0; j < words.Length; j++)
            {
              wordList.Add(words[j].ToString());
            }
            start = i;
            break;
        }
      }
    }
#endregion

    // Last character: a trailing punctuation mark was excluded from the run above,
    // so it is added here as its own token.
    if (i == stext.Length - 1)
    {
      if (Utility.GetCharType(currWord) == CharType.Punctuation)
      {
        wordList.Add(currWord.ToString());
      }
    }
}

wordList.TrimExcess();

return wordList;
}
}
}

以上的代码只是整个程序的片段，详细的实现各位可以自己来完成，这里只是起一点抛砖引玉的作用。