首先定义我们的词典类,最长词的长度和词出现的频率都由此类处理:
namespace CNWordSegment
{
    /// <summary>
    /// Word dictionary used by the segmenter. Maps each word to its frequency
    /// and tracks the length of the longest word and the highest frequency
    /// recorded so far.
    /// </summary>
    [Serializable]
    public class Dictionary
    {
        private int maxWordLength = 0;
        private int maxWordFreq = 0;
        private Dictionary<string, int> dict = new Dictionary<string, int>();

        /// <summary>
        /// Creates an empty dictionary.
        /// </summary>
        public Dictionary()
        {
        }

        /// <summary>
        /// Adds one occurrence of <paramref name="word"/>: a new word starts at
        /// frequency 1, an existing word has its frequency incremented by 1.
        /// </summary>
        /// <param name="word">The word to register.</param>
        public void AddWord(string word)
        {
            AddWord(word, 1);
        }

        /// <summary>
        /// Adds <paramref name="word"/> to the dictionary. A new word starts at
        /// <paramref name="defaultFreq"/>; a word that is already present has
        /// its frequency incremented by 1 (the supplied default is ignored).
        /// </summary>
        /// <param name="word">The word to register.</param>
        /// <param name="defaultFreq">Initial frequency for a word not yet present.</param>
        public void AddWord(string word, int defaultFreq)
        {
            if (word.Length > maxWordLength)
            {
                maxWordLength = word.Length;
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            int freq;
            if (dict.TryGetValue(word, out freq))
            {
                freq = freq + 1;
            }
            else
            {
                freq = defaultFreq;
            }
            dict[word] = freq;

            if (freq > maxWordFreq)
            {
                maxWordFreq = freq;
            }
        }

        /// <summary>
        /// Returns the recorded frequency of <paramref name="word"/>, or 0 when
        /// the word is not in the dictionary.
        /// </summary>
        public int GetFrequency(string word)
        {
            int freq;
            return dict.TryGetValue(word, out freq) ? freq : 0;
        }

        /// <summary>
        /// Loads the dictionary from a UTF-8 text file, treating every
        /// non-empty line as one occurrence of a word.
        /// </summary>
        /// <param name="path">Path of the dictionary file.</param>
        public void Load(string path)
        {
            // The using statement releases the file handle even when reading
            // or AddWord throws part-way through the file.
            using (StreamReader reader = new StreamReader(path, System.Text.Encoding.UTF8))
            {
                string strline;
                while ((strline = reader.ReadLine()) != null)
                {
                    // A blank line is not a word; registering "" would only
                    // inflate the maximum frequency.
                    if (strline.Length == 0)
                    {
                        continue;
                    }
                    AddWord(strline);
                }
            }
        }

        /// <summary>
        /// Returns true when <paramref name="word"/> is in the dictionary.
        /// </summary>
        public bool ContainsWord(string word)
        {
            return dict.ContainsKey(word);
        }

        /// <summary>
        /// Returns the length, in characters, of the longest word added so far.
        /// </summary>
        public int GetMaxWordLength()
        {
            return maxWordLength;
        }

        /// <summary>
        /// Returns the highest frequency recorded for any word so far.
        /// </summary>
        public int GetMaxWordFreq()
        {
            return maxWordFreq;
        }
    }
}
接下来给出我们的最大匹配法:
namespace CNWordSegment
{
    /// <summary>
    /// Forward maximum matching (FMM) segmentation: scans the sentence from
    /// left to right and, at each position, takes the longest candidate that
    /// the dictionary contains, falling back to a single character.
    /// </summary>
    // SegmentStrategy is the base class for segmentation strategies; deriving
    // from it lets other segmentation algorithms be plugged in alongside this one.
    public class FMMSegment : SegmentStrategy
    {
        /// <summary>
        /// Splits <paramref name="sentence"/> into words using forward maximum matching.
        /// </summary>
        /// <param name="sentence">Text to segment; null or empty yields an empty list.</param>
        /// <param name="dict">Dictionary supplying the known words and the longest word length.</param>
        /// <returns>The segments in order; concatenating them reproduces the sentence.</returns>
        public override List<string> Segment(string sentence, Dictionary dict)
        {
            List<string> seged = new List<string>();
            if (string.IsNullOrEmpty(sentence))
            {
                return seged;
            }

            // An empty dictionary reports a maximum length of 0; clamp to 1 so
            // the window never shrinks below one character (a zero/negative
            // length would make Substring throw).
            int maxLength = dict.GetMaxWordLength();
            if (maxLength < 1)
            {
                maxLength = 1;
            }

            int pos = 0;
            while (pos < sentence.Length)
            {
                // Start with the widest window that still fits in the remaining text.
                int targetLength = sentence.Length - pos;
                if (targetLength > maxLength)
                {
                    targetLength = maxLength;
                }

                // Shrink the window until it is a known word or a single character.
                string word = sentence.Substring(pos, targetLength);
                while (targetLength > 1 && !dict.ContainsWord(word))
                {
                    targetLength--;
                    word = sentence.Substring(pos, targetLength);
                }

                seged.Add(word);
                pos += targetLength;
            }
            return seged;
        }
    }
}
完成以上两步后,我们就要实现分词过程了。下面给出函数实现:
/// <summary>
/// Splits <paramref name="stext"/> into tokens. The text is scanned character by
/// character; a run is cut whenever the character type reported by
/// Utility.GetCharType changes, or when the last character is reached, and the
/// run is then handled according to the type of the character before the cut.
/// </summary>
/// <param name="stext">The text to segment.</param>
/// <returns>The list of tokens, in the order they were produced.</returns>
// NOTE(review): segmentStrategy, Utility and CharType are declared outside this
// view; their exact behavior is assumed from how they are used here.
// NOTE(review): for a one-character input the i > 0 branch never runs, so the
// character is only returned when it is punctuation — confirm this is intended.
public List<string> Segment(string stext)
{
List<string> wordList = new List<string>();
wordList.Capacity = stext.Length;
// Index in stext where the run currently being collected begins.
int start = 0;
char currWord, prevWord;
// Walk over the text one character at a time.
for (int i = 0; i < stext.Length; i++)
{
currWord = stext[i];
#region ==分词过程==
if (i > 0)
{
prevWord = stext[i - 1];
// Cut a run when the current character's type differs from the previous one, or when the end of the text is reached.
if (Utility.GetCharType(currWord) != Utility.GetCharType(prevWord) || i == stext.Length - 1)
{
string words = stext.Substring(start, i - start);
// Last character: unless it is punctuation, include it in the current run.
// NOTE(review): this also happens when the last character's type differs from
// prevWord's, so it is then processed under the previous run's type.
if (i == stext.Length - 1)
{
if (Utility.GetCharType(currWord) != CharType.Punctuation)
{
words = stext.Substring(start, i - start + 1);
}
}
// The run is dispatched on the type of the character before the cut.
CharType ct = Utility.GetCharType(prevWord);
switch (ct)
{
case CharType.Digital: // digits: split the run on spaces
#region ==日期识别==
// Date recognition: a digit run immediately followed by a year/month/day suffix.
if (currWord == '年' || currWord == '月' || currWord == '日')
{
string tempDateWord = words + currWord.ToString();
// At the last index the run may already have been extended to include currWord above, so use it as is.
if (i == stext.Length - 1)
tempDateWord = words;
// Merge date parts: append a month to a previous token containing the year
// suffix, or a day to a previous token containing the month suffix.
// IndexOf(...) > 0 only matches when the suffix is not the token's first character.
// NOTE(review): in all three branches below, i++ here plus the loop's own i++
// means the character right after the date suffix is never examined as
// currWord; if it is the last character of stext it appears to be dropped,
// because the tail check after the region still sees the stale currWord.
if (wordList.Count > 0 && wordList[wordList.Count - 1].IndexOf("年") > 0 && currWord == '月')
{
wordList[wordList.Count - 1] += tempDateWord;
start = i + 1;
i++;
}
else if (wordList.Count > 0 && wordList[wordList.Count - 1].IndexOf("月") > 0 && currWord == '日')
{
wordList[wordList.Count - 1] += tempDateWord;
start = i + 1;
i++;
}
else
{
wordList.Add(tempDateWord);
start = i + 1;
i++;
}
//wordList.Add(words + currWord.ToString());
//start = i + 1 ;
break;
}
#endregion
// Not a date: add each space-separated piece of the digit run.
foreach (string s in words.Split(new char[] { ' ' }))
{
wordList.Add(s);
}
start = i;
break;
case CharType.Letter: // letters: split the run on spaces
foreach (string s in words.Split(new char[] { ' ' }))
{
wordList.Add(s);
}
start = i;
break;
case CharType.WhiteSpace: // whitespace: the run is added to the output unchanged
wordList.Add(words);
start = i;
break;
case CharType.Punctuation: // punctuation: the run is added to the output unchanged
wordList.Add(words);
start = i;
break;
case CharType.CNWord: // Chinese characters: delegate the run to the segmentation strategy
foreach (string s in segmentStrategy.Segment(words))
{
wordList.Add(s);
}
start = i;
break;
case CharType.Other: // other/unknown characters: currently emitted one character at a time
for (int j = 0; j < words.Length; j++)
{
wordList.Add(words[j].ToString());
}
start = i;
break;
}
}
}
#endregion
// Last character: a trailing punctuation mark was excluded from the run above, so add it as its own token.
if (i == stext.Length - 1)
{
if (Utility.GetCharType(currWord) == CharType.Punctuation)
{
wordList.Add(currWord.ToString());
}
}
}
// Capacity was preallocated to the text length; release the unused part.
wordList.TrimExcess();
return wordList;
}
}
}
以上的代码只是整个程序的片段,完整的实现各位可以自己来完成,这里只是起一点抛砖引玉的作用。