简单的中文信息处理C#实现代码片段

首先定义我们的词典类,最长词的长度和词出现的频率都由此类处理:

namespace CNWordSegment
{
  [Serializable]
    public class Dictionary
    {
      private int maxWordLength = 0;
      private int maxWordFreq = 0;
      private Dictionary<string, int> dict = new Dictionary<string, int>();

      public Dictionary()
      {
      }

      public void AddWord(string word)
      {
        if (word.Length > maxWordLength)
          maxWordLength = word.Length;

        if (dict.ContainsKey(word))
        {
          int freq = dict[word];
          dict[word] = freq + 1;
        }
        else
        {
          dict.Add(word, 1);
        }

        if (dict[word] > maxWordFreq)
          maxWordFreq = dict[word];
      }
      public void AddWord(string word,int defaultFreq)
      {
        if(word.Length > maxWordLength)
          maxWordLength = word.Length;

        if(dict.ContainsKey(word))
        {
          int freq = dict[word];
          dict[word] = freq + 1;
        }
        else
        {
          dict.Add(word,defaultFreq);
        }

        if (dict[word] > maxWordFreq)
          maxWordFreq = dict[word];
      }

      public int GetFrequency(string word)
      {
        if (dict.ContainsKey(word))
          return dict[word];

        return 0;
      }

      ///<summary>
      ///加载词典
      ///</summary>
      public void Load(string path)
      {
        StreamReader reader = new StreamReader(path, System.Text.Encoding.UTF8);
        string strline = reader.ReadLine();
        while (strline != null)
        {
          AddWord(strline);
          strline = reader.ReadLine();
        }
        reader.Close();
        reader.Dispose();
      }

      public bool ContainsWord(string word)
      {
        return dict.ContainsKey(word);
      }

      public int GetMaxWordLength()
      {
        return maxWordLength;
      }

      public int GetMaxWordFreq()
      {
        return maxWordFreq;
      }
    }
}

接下来给出我们的最大匹配法:
namespace CNWordSegment
{
  /// <summary>
  /// 正向最大分词算法
  /// </summary>
  public class FMMSegment : public SegmentStrategy //SegmentStrategy 是一个分词策略基类,我们利用它来分发策略,这样我们可以加入各种分词法了
  {
    public override List<string> Segment(string sentence, Dictionary dict)
    {
      int maxLength = dict.GetMaxWordLength();
      int pos = 0;
      int targetLength = maxLength;
      int restLength = sentence.Length;

      List<string> seged = new List<string>();

      while (restLength > 0)
      {
        if (targetLength > restLength)
          targetLength = restLength;

        string word = sentence.Substring(pos, targetLength);
        if (dict.ContainsWord(word) || targetLength == 1)
        {
          seged.Add(word);
          pos += targetLength;
          targetLength = maxLength;
          restLength = sentence.Length - pos;
        }
        else
          targetLength--;
      }

      return seged;
    }
  }
}
    完成以上两步后,我们就要实现分词过程了。下面给出函数实现:
/// <summary>
/// Splits <paramref name="stext"/> into tokens. The text is cut into runs of
/// characters that share the same <c>CharType</c> (as reported by
/// <c>Utility.GetCharType</c>), and each run is handled according to the type
/// of its characters: digit and letter runs are split on spaces, whitespace
/// and punctuation runs are added as-is, Chinese runs are passed to
/// <c>segmentStrategy</c>, and other runs are split into single characters.
/// A digit run directly followed by 年/月/日 is emitted as a date token and
/// merged with a preceding year or month token.
/// </summary>
/// <param name="stext">Text to segment; null or empty yields an empty list.</param>
/// <returns>The tokens in the order they appear in the text.</returns>
public List<string> Segment(string stext)
{
  List<string> wordList = new List<string>();
  if (string.IsNullOrEmpty(stext))
  {
    return wordList;
  }

  // The run-boundary logic below needs a previous character, so a
  // one-character text would otherwise produce no token unless it was
  // punctuation. Emit it directly.
  if (stext.Length == 1)
  {
    wordList.Add(stext);
    return wordList;
  }

  wordList.Capacity = stext.Length;

  int start = 0; // index where the current, not yet emitted, run begins

  char currWord, prevWord;
  // Walk the text character by character.
  for (int i = 0; i < stext.Length; i++)
  {
    currWord = stext[i];

    if (i > 0)
    {
      prevWord = stext[i - 1];

      // A run ends when the character type changes or the text ends.
      if (Utility.GetCharType(currWord) != Utility.GetCharType(prevWord) || i == stext.Length - 1)
      {
        string words = stext.Substring(start, i - start);

        // At the end of the text the final character is folded into the run,
        // unless it is punctuation (which is emitted separately below).
        // NOTE(review): this happens even when the final character has a
        // different type than the run, so it is processed under the run's type.
        if (i == stext.Length - 1)
        {
          if (Utility.GetCharType(currWord) != CharType.Punctuation)
          {
            words = stext.Substring(start, i - start + 1);
          }
        }

        CharType ct = Utility.GetCharType(prevWord);
        switch (ct)
        {
          case CharType.Digital: // digits: split on spaces

            // Date recognition: digits directly followed by 年 / 月 / 日.
            if (currWord == '年' || currWord == '月' || currWord == '日')
            {
              // At the end of the text "words" already contains the suffix.
              string tempDateWord = (i == stext.Length - 1)
                ? words
                : words + currWord.ToString();

              // Merge "…年" + "…月" and "…月" + "…日" into a single date token.
              // The char overload of IndexOf is an ordinal search.
              string lastToken = wordList.Count > 0 ? wordList[wordList.Count - 1] : null;
              bool mergeWithPrevious = lastToken != null &&
                ((currWord == '月' && lastToken.IndexOf('年') > 0) ||
                 (currWord == '日' && lastToken.IndexOf('月') > 0));

              if (mergeWithPrevious)
              {
                wordList[wordList.Count - 1] += tempDateWord;
              }
              else
              {
                wordList.Add(tempDateWord);
              }

              // The suffix is consumed; the next run starts after it. Skip the
              // boundary check between the suffix and the following character.
              start = i + 1;
              i++;

              // If the skipped character is the last one, the loop would end
              // without ever visiting it and it would be lost, so emit it here
              // and step past it.
              if (i == stext.Length - 1)
              {
                wordList.Add(stext[i].ToString());
                start = i + 1;
                i++;
              }
              break;
            }

            wordList.AddRange(words.Split(new char[] { ' ' }));
            start = i;
            break;

          case CharType.Letter: // letters: split on spaces
            wordList.AddRange(words.Split(new char[] { ' ' }));
            start = i;
            break;

          case CharType.WhiteSpace:  // whitespace run: added unchanged
          case CharType.Punctuation: // punctuation run: added unchanged
            wordList.Add(words);
            start = i;
            break;

          case CharType.CNWord: // Chinese characters: delegate to the segmentation strategy
            foreach (string s in segmentStrategy.Segment(words))
            {
              wordList.Add(s);
            }
            start = i;
            break;

          case CharType.Other: // unknown characters: one token per character
            for (int j = 0; j < words.Length; j++)
            {
              wordList.Add(words[j].ToString());
            }
            start = i;
            break;
        }
      }
    }

    // A final punctuation character is never folded into the preceding run,
    // so it is emitted on its own here.
    if (i == stext.Length - 1)
    {
      if (Utility.GetCharType(currWord) == CharType.Punctuation)
      {
        wordList.Add(currWord.ToString());
      }
    }
  }

  wordList.TrimExcess();

  return wordList;
}
}
}

以上的代码只是整个程序的片段,详细的实现各位可以自己来完成,这里只是起一点抛砖引玉的作用。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值