// 出于兴趣，在参考了网上的一些资料后，自己写了一个中文分词器（EasySeg）。
// 加入了权重的支持，可以解决一些歧义词的切分问题。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Xml;
using System.IO;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.Runtime.Serialization.Formatters.Binary;
namespace EasySeg
{
public class EasySeger
{
// Full path of the dictionary file: <startup directory>/data/dict2.txt.
// Built with Path.Combine so the separators are correct for the platform
// (the original concatenated the literal "//data//dict2.txt").
private static readonly string DictPath = Path.Combine(Path.Combine(Application.StartupPath, "data"), "dict2.txt");
// Root of the dictionary trie. Each key is a one-character string mapping to a
// child Hashtable; a node that ends a dictionary word also carries the key "WT"
// whose value is that word's weight (double).
private static readonly Hashtable chartable = new Hashtable();
// Whether the dictionary has already been loaded into chartable.
private static bool DictLoaded = false;
// Time taken by the last dictionary load, in milliseconds.
public static double DictLoad_Span = 0;
/// <summary>
/// Character-class matchers used by <see cref="GetCharType"/>. They are built once
/// and shared, instead of constructing four Regex objects on every call.
/// </summary>
// CJK ideographs U+4E00..U+9FA5. The escapes must be backslashes: with forward
// slashes the class contains the reversed range "0-/" and Regex construction throws.
private static readonly Regex ChineseRegex = new Regex("[\u4e00-\u9fa5]", RegexOptions.Compiled);
private static readonly Regex NumberRegex = new Regex("[0-9]", RegexOptions.Compiled);
private static readonly Regex EnglishRegex = new Regex("[a-zA-Z]", RegexOptions.Compiled);
// Only the dot counts as a symbol (lets "3.14" or "a.b" stay in one token).
private static readonly Regex SymbolRegex = new Regex("[\\.]", RegexOptions.Compiled);
// Time taken by the last SegText call on this instance, in milliseconds.
public double TextSeg_Span = 0;
/// <summary>
/// Classifies a (normally one-character) string.
/// </summary>
/// <param name="Char">
/// The text to classify. If it holds more than one character, the first class
/// below (in the listed order) that matches any of its characters wins.
/// </param>
/// <returns>
/// 0: Chinese, 1: English letter, 2: digit, 3: symbol ('.'),
/// -1: anything else, including null or empty input.
/// </returns>
private int GetCharType(string Char)
{
    if (string.IsNullOrEmpty(Char))
    {
        return -1;
    }
    if (ChineseRegex.IsMatch(Char))
    {
        return 0;
    }
    if (EnglishRegex.IsMatch(Char))
    {
        return 1;
    }
    if (NumberRegex.IsMatch(Char))
    {
        return 2;
    }
    if (SymbolRegex.IsMatch(Char))
    {
        return 3;
    }
    return -1;
}
/// <summary>
/// Makes sure the dictionary trie is available. The first call builds it via
/// BuidDictTree and sets DictLoaded; every later call is a no-op.
/// The dictionary is always rebuilt from the text file; no serialized cache is used.
/// </summary>
private void LoadDict()
{
    if (!DictLoaded)
    {
        BuidDictTree();
        DictLoaded = true;
    }
}
/// <summary>
/// Reads the dictionary file at DictPath (UTF-8, one entry per line) and inserts
/// every entry into the chartable trie. A line is either "word" or "word weight"
/// separated by one or more spaces; a missing weight defaults to 0. Blank lines
/// are skipped. The elapsed time in milliseconds is stored in DictLoad_Span.
/// </summary>
/// <exception cref="FormatException">A weight token is not a valid number.</exception>
private void BuidDictTree()
{
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
    // 'using' guarantees the reader is closed even if a line fails to parse.
    using (StreamReader reader = new StreamReader(DictPath, System.Text.Encoding.UTF8))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            if (line.Length == 0)
            {
                continue;
            }
            // RemoveEmptyEntries tolerates several spaces between word and weight;
            // the trimmed line is non-empty, so parts[0] always exists.
            string[] parts = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            string word = parts[0];
            double weight = 0;
            if (parts.Length > 1)
            {
                // The dictionary is a data file, so parse independently of the UI culture.
                weight = double.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
            }
            // Walk/extend the trie one character at a time.
            Hashtable node = chartable;
            for (int i = 0; i < word.Length; i++)
            {
                string key = word.Substring(i, 1);
                Hashtable child = (Hashtable)node[key];
                if (child == null)
                {
                    child = new Hashtable();
                    node.Add(key, child);
                }
                node = child;
            }
            // Mark the final node as a complete word; a later duplicate entry overwrites the weight.
            node["WT"] = weight;
        }
    }
    DictLoad_Span = timer.Elapsed.TotalMilliseconds;
}
/// <summary>
/// Segments <paramref name="text"/> into words and returns them joined by single spaces.
/// The dictionary is loaded on first use. The time spent segmenting (excluding the
/// dictionary load) is stored in TextSeg_Span, in milliseconds.
/// </summary>
/// <param name="text">Text to segment; it is trimmed first. Null is treated as empty.</param>
/// <returns>The space-separated words, or an empty string for empty input.</returns>
public string SegText(string text)
{
    text = (text ?? "").Trim();
    //load Dict only once
    LoadDict();
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
    // StringBuilder avoids re-copying the whole result for every appended word.
    StringBuilder result = new StringBuilder(text.Length * 2);
    int i = 0;
    while (i < text.Length)
    {
        double weight;
        string word;
        FindWord(text.Substring(i), out weight, out word);
        if (i > 0)
        {
            result.Append(' ');
        }
        result.Append(word);
        // Skip past the word just emitted; always advance at least one
        // character so an empty result can never stall the loop.
        i += Math.Max(word.Length, 1);
    }
    TextSeg_Span = timer.Elapsed.TotalMilliseconds;
    return result.ToString();
}
/// <summary>
/// Extracts the first word of <paramref name="text"/> by walking the dictionary trie.
/// </summary>
/// <remarks>
/// The trie is followed for as long as the next character is a child of the current node.
/// When the walk stops before the end of the text:
/// - nothing matched: the token is read by <see cref="ReadUnmatchedToken"/>;
/// - the matched prefix is a dictionary word: it is kept, unless it has at least two
///   characters and the word starting at its last character is a multi-character word
///   with a strictly greater weight, in which case the last character is given up;
/// - the matched prefix is not a dictionary word: it is shortened by one character
///   (a single character is returned unchanged).
/// When the last character is given up, the weight is not recomputed for the shorter word.
/// </remarks>
/// <param name="text">Text whose leading word is wanted.</param>
/// <param name="Weight">
/// Weight of the dictionary word recognised during the walk, or -1 if none was.
/// </param>
/// <param name="Word">The extracted word; empty only when text is empty.</param>
private void FindWord(string text, out double Weight, out string Word)
{
    Hashtable node = chartable;
    string matched = "";
    double weight = -1;
    int i;
    for (i = 0; i < text.Length; i++)
    {
        string ch = text.Substring(i, 1);
        if (node.Contains(ch))
        {
            // Still inside the trie: consume the character and descend.
            matched += ch;
            node = (Hashtable)node[ch];
            continue;
        }

        if (matched == "")
        {
            // The very first character is not in the dictionary.
            matched = ReadUnmatchedToken(text);
            break;
        }

        if (node.Contains("WT"))
        {
            // The matched prefix is a complete dictionary word.
            weight = Convert.ToDouble(node["WT"]);
            // A single-character word has nothing to give up.
            if (i < 2)
            {
                break;
            }
            // Overlap check: look at the word that starts on our last character.
            double weight2;
            string word2;
            FindWord(text.Substring(i - 1), out weight2, out word2);
            // Keep our word unless the overlapping one is a heavier multi-character word.
            if (weight2 <= weight || word2.Trim().Length < 2)
            {
                break;
            }
        }

        // Back off one character (never below a single character).
        if (matched.Length >= 2)
        {
            matched = matched.Substring(0, matched.Length - 1);
        }
        break;
    }

    // The whole text was consumed inside the trie: report the weight if it is a
    // complete word, so a word ending exactly at the end of the text is not
    // treated as weightless by the overlap check above.
    if (i == text.Length && matched.Length > 0 && node.Contains("WT"))
    {
        weight = Convert.ToDouble(node["WT"]);
    }

    Word = matched;
    Weight = weight;
}

// Reads the token at the start of text when its first character is not in the
// dictionary: a run of English letters (dots allowed after the first character),
// a run of digits (dots allowed after the first character), or otherwise just
// the single first character.
private string ReadUnmatchedToken(string text)
{
    int kind = GetCharType(text.Substring(0, 1));
    if (kind != 1 && kind != 2)
    {
        return text.Substring(0, 1);
    }
    int end = 1;
    while (end < text.Length)
    {
        int next = GetCharType(text.Substring(end, 1));
        if (next != kind && next != 3)
        {
            break;
        }
        end++;
    }
    return text.Substring(0, end);
}
}
}