using System;
using System;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
namespace HMM
{
public partial class Form1 : Form
{
string[] arrayData;
DirectoryInfo di;
FileInfo[] fis;
Hashtable htDict = new Hashtable();
double singleCutRate;
public Form1()
{
InitializeComponent();
label1.Text = "先预处理!";
progressBar1.Visible = false;
di = new DirectoryInfo("data");
fis = di.GetFiles("*.txt");
arrayData = new string[fis.Length];
}
private void Form1_Resize(object sender, EventArgs e)
{
this.Width = 800;
this.Height = 600;
}
private void button1_Click(object sender, EventArgs e)
{
if (!new FileInfo("dict.txt").Exists)
{
int count = 0;
progressBar1.Visible = true;
BackgroundWorker worker = new BackgroundWorker();
worker.WorkerReportsProgress = true; //报告进度
worker.DoWork += (s, o) =>
{
int progressCount = 1;
foreach (FileInfo i in fis)
{
StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);
arrayData[progressCount - 1] = sr.ReadToEnd();
sr.Close();
worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);
progressCount++;
}
for (int i = 0; i < arrayData.Length; i++)
{
arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");
for (int j = 0; j < arrayData[i].Length; j++)
{
string strWord = arrayData[i].Substring(j, 1);
if (IsChinese(strWord))
{
if (htDict.ContainsKey(strWord))
{
htDict[strWord] = ((int)htDict[strWord]) + 1;
}
else
{
htDict.Add(strWord, 1);
}
}
}
worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);
}
StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default);
foreach (DictionaryEntry i in htDict)
{
sw.WriteLine(i.Key + "|" + i.Value);
count++;
sw.Flush();
worker.ReportProgress((int)((double)count / (double)htDict.Count * 33) + 67, null);
}
sw.Close();
};
worker.RunWorkerCompleted += (s, o) =>
{
this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; }));
};
worker.ProgressChanged += (s, o) =>
{
progressBar1.Style = ProgressBarStyle.Continuous;
progressBar1.Value = o.ProgressPercentage;
};
worker.RunWorkerAsync();
}
else
{
int count = 0;
progressBar1.Visible = true;
BackgroundWorker worker = new BackgroundWorker();
worker.WorkerReportsProgress = true; //报告进度
worker.DoWork += (s, o) =>
{
int progressCount = 1;
foreach (FileInfo i in fis)
{
StreamReader sr = new StreamReader(i.FullName, System.Text.Encoding.Default);
arrayData[progressCount - 1] = sr.ReadToEnd();
sr.Close();
worker.ReportProgress((int)((double)progressCount / (double)fis.Length * 33), null);
progressCount++;
}
for (int i = 0; i < arrayData.Length; i++)
{
arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");
worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);
}
StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default);
string line = "";
while ((line = reader.ReadLine()) != null)
{
htDict[line.Substring(0, 1)] = line.Substring(2);
count++;
}
reader.Close();
worker.ReportProgress(100, null);
};
worker.RunWorkerCompleted += (s, o) =>
{
this.Invoke(new MethodInvoker(() => { progressBar1.Visible = false; progressBar1.Value = 0; label1.Text = "预处理完成!|" + count; }));
};
worker.ProgressChanged += (s, o) =>
{
progressBar1.Style = ProgressBarStyle.Continuous;
progressBar1.Value = o.ProgressPercentage;
};
worker.RunWorkerAsync();
}
}
private void button2_Click(object sender, EventArgs e)
{
if (label1.Text != "先预处理!" && textBox1.Text.Trim() != "")
{
textBox2.Text = "";
if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))
{
singleCutRate = 0.01;
}
List<string> list = new List<string>();
string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", "");
int startPos = 0;
int m = 1;
string strWord1 = "";
string strWord2 = "";
progressBar1.Visible = true;
BackgroundWorker worker = new BackgroundWorker();
worker.WorkerReportsProgress = true; //报告进度
worker.DoWork += (s, o) =>
{
while (strSplitWords.Length >= 2)
{
if (strWord1 == "")
{
strWord1 = strSplitWords.Substring(startPos, m);
}
strWord2 = strSplitWords.Substring(startPos, ++m);
double x1 = (double)ReturnCount(strWord1, arrayData);
double y1 = (double)ReturnTotalCount(strWord1);
if (y1 == 0)
y1++;
double a = x1 / y1;
double x2 = (double)ReturnCount(strWord2, arrayData);
double y2 = (double)ReturnTotalCount(strWord2);
if (y2 == 0)
y2++;
double b = x2 / y2;
if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))
{
list.Add(strWord1);
startPos += strWord1.Length;
worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);
m = 1;
strWord1 = "";
strWord2 = "";
if ((strSplitWords.Length - startPos) == 1)
{
list.Add(strSplitWords.Substring(startPos, 1));
break;
}
else if ((strSplitWords.Length - startPos) < 1)
{
break;
}
}
else
{
strWord1 = strWord2;
strWord2 = "";
if ((strSplitWords.Length - startPos - m) < 1)
{
list.Add(strWord1);
startPos += strWord1.Length;
worker.ReportProgress((int)((double)startPos / (double)strSplitWords.Length * 100), null);
break;
}
}
}
worker.ReportProgress(100, null);
};
worker.RunWorkerCompleted += (s, o) =>
{
this.Invoke(new MethodInvoker(() =>
{
progressBar1.Visible = false;
progressBar1.Value = 0;
foreach (string i in list)
{
textBox2.Text += i + "|";
}
label2.Text = "分词完成!";
}));
};
worker.ProgressChanged += (s, o) =>
{
progressBar1.Style = ProgressBarStyle.Continuous;
progressBar1.Value = o.ProgressPercentage;
};
worker.RunWorkerAsync();
}
}
public bool IsChinese(string str)
{
return Regex.IsMatch(str, @"^[\u4e00-\u9fa5]+$");
}
public int ReturnCount(string s, string[] d)
{
int count = 0;
for (int i = 0; i < d.Length; i++)
{
count += Regex.Matches(d[i], s).Count;
}
return count;
}
public int ReturnTotalCount(string s)
{
int total = 0;
for (int i = 0; i < s.Length; i++)
{
if (htDict.ContainsKey(s.Substring(i, 1)))
{
total += Convert.ToInt32(htDict[s.Substring(i, 1)]);
}
}
return total;
}
}
}
最近在看机器学习方面的书,看到隐马尔科夫,意淫了一下无字典中文分词的可能性,我设想了一种分词方式,并无聊了一个程序,因为执行效率相当差,所以添加了进度条,否则真的等到受不了,仅供参考
1、下载了3200本各类电子书,600多M
2、预先扫描每个字出现的概率P(w)
3、清除待分词内容c中非中文字符
4、从左向右扫描ci,(i为字数,每一位为w1、w2、w3.....wi),
开始时:计算第一个字c1的a=P(c1)/P(w1)和前两个字c2的b=P(c2)/(P(w1)+P(w2))
comp:if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))
则分割c1,指针向后移动i
否则继续比较 前两个字 a=P(c2)/(P(w1)+P(w2))
和前三个字 b=P(c3)/(P(w1)+P(w2)+P(w3)) 的大小 循环到comp
注意检测c的尾部并及时跳出循环,singleCutRate用于估算单字的切割概率,比如
“中” 和 “中国” ,当 P(中国)/(P(中)+P(国)) >=0.05 认为“中国”是固有词汇,否则直接分割“中”,这个切割概率需要调教一个合理的数值,似乎最好的区间在0.005上下。
yy完毕!