由隐马尔科夫意淫无字典中文分词 C#

最新推荐文章于 2021-05-23 03:54:28 发布

@CAPRICA@

最新推荐文章于 2021-05-23 03:54:28 发布

阅读量1.5k

点赞数

分类专栏：大数据、神经网络、深度学习、机器学习

本文链接：https://blog.csdn.net/joycesunny/article/details/44856007

版权

大数据、神经网络、深度学习、机器学习专栏收录该内容

31 篇文章 3 订阅

订阅专栏

using System;
using System;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;

namespace HMM
{
    public partial class Form1 : Form
    {
        string[] arrayData;
        DirectoryInfo di;
        FileInfo[] fis;
        Hashtable htDict = new Hashtable();
        double singleCutRate;

        /// <summary>
        /// Builds the form, shows the initial "preprocess first" hint, hides the progress bar
        /// and enumerates the *.txt corpus files found in the "data" folder.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            // button2_Click compares label1.Text against this exact string to decide
            // whether preprocessing has already run.
            label1.Text = "先预处理！";
            progressBar1.Visible = false;
            di = new DirectoryInfo("data");
            // GetFiles throws DirectoryNotFoundException when the folder is missing, which
            // would prevent the form from being constructed at all; use an empty corpus instead.
            fis = di.Exists ? di.GetFiles("*.txt") : new FileInfo[0];
            // One slot per corpus file; filled by the preprocessing worker.
            arrayData = new string[fis.Length];
        }

        /// <summary>
        /// Resize handler that forces the form back to a fixed 800 x 600 size.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Resize(object sender, EventArgs e)
        {
            const int fixedWidth = 800;
            const int fixedHeight = 600;

            // Width is assigned before Height, exactly as before.
            Width = fixedWidth;
            Height = fixedHeight;
        }

        /// <summary>
        /// Preprocessing button handler. On a background worker it reads every corpus file into
        /// <c>arrayData</c>, strips everything outside the \u4e00-\u9fa5 range, and then either
        /// builds the per-character frequency table and writes it to dict.txt (when dict.txt does
        /// not exist yet) or loads the table from the existing dict.txt.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Decide once, before the worker starts, whether dict.txt must be built or only loaded.
            bool buildDictionary = !new FileInfo("dict.txt").Exists;
            int count = 0;
            progressBar1.Visible = true;
            BackgroundWorker worker = new BackgroundWorker();
            worker.WorkerReportsProgress = true;  // report progress
            worker.DoWork += (s, o) =>
            {
                // Start from an empty table so a repeated run neither double-counts characters
                // nor mixes entries left over from an earlier load.
                htDict.Clear();
                ReadCorpusFiles(worker);
                CleanCorpus(worker, buildDictionary);
                if (buildDictionary)
                {
                    count = SaveDictionary(worker);
                }
                else
                {
                    count = LoadDictionary();
                    worker.ReportProgress(100, null);
                }
            };
            worker.RunWorkerCompleted += (s, o) =>
            {
                progressBar1.Visible = false;
                progressBar1.Value = 0;
                if (o.Error != null)
                {
                    // An exception inside DoWork is delivered here instead of being thrown.
                    // Keep the "preprocess first" state so segmentation stays blocked.
                    label1.Text = "先预处理！";
                    MessageBox.Show(o.Error.Message);
                    return;
                }
                label1.Text = "预处理完成！|" + count;
            };
            worker.ProgressChanged += (s, o) =>
            {
                progressBar1.Style = ProgressBarStyle.Continuous;
                progressBar1.Value = o.ProgressPercentage;
            };
            worker.RunWorkerAsync();
        }

        /// <summary>
        /// Reads the full text of every corpus file into the matching slot of <c>arrayData</c>,
        /// reporting progress in the 0-33 range.
        /// </summary>
        private void ReadCorpusFiles(BackgroundWorker worker)
        {
            for (int i = 0; i < fis.Length; i++)
            {
                // using guarantees the file handle is released even if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(fis[i].FullName, System.Text.Encoding.Default))
                {
                    arrayData[i] = sr.ReadToEnd();
                }
                worker.ReportProgress((int)((double)(i + 1) / (double)fis.Length * 33), null);
            }
        }

        /// <summary>
        /// Removes every character outside \u4e00-\u9fa5 from each corpus text and, when
        /// <paramref name="countCharacters"/> is true, tallies the remaining characters in
        /// <c>htDict</c>. Reports progress in the 33-66 range.
        /// </summary>
        private void CleanCorpus(BackgroundWorker worker, bool countCharacters)
        {
            for (int i = 0; i < arrayData.Length; i++)
            {
                arrayData[i] = Regex.Replace(arrayData[i], @"[^\u4e00-\u9fa5]", "");
                if (countCharacters)
                {
                    // After the replace above only characters in the accepted range remain,
                    // so no further per-character validation is needed.
                    foreach (char c in arrayData[i])
                    {
                        string key = c.ToString();
                        object seen = htDict[key];
                        htDict[key] = seen == null ? 1 : (int)seen + 1;
                    }
                }
                worker.ReportProgress((int)((double)i / (double)arrayData.Length * 33) + 33, null);
            }
        }

        /// <summary>
        /// Writes <c>htDict</c> to dict.txt as "character|count" lines, reporting progress in the
        /// 67-100 range.
        /// </summary>
        /// <returns>Number of entries written.</returns>
        private int SaveDictionary(BackgroundWorker worker)
        {
            int written = 0;
            using (StreamWriter sw = new StreamWriter("dict.txt", false, System.Text.Encoding.Default))
            {
                foreach (DictionaryEntry entry in htDict)
                {
                    sw.WriteLine(entry.Key + "|" + entry.Value);
                    written++;
                    worker.ReportProgress((int)((double)written / (double)htDict.Count * 33) + 67, null);
                }
            }
            return written;
        }

        /// <summary>
        /// Loads "character|count" lines from dict.txt into <c>htDict</c>. Lines that do not have
        /// that shape (too short, missing separator, non-numeric count) are skipped.
        /// </summary>
        /// <returns>Number of entries loaded.</returns>
        private int LoadDictionary()
        {
            int loaded = 0;
            using (StreamReader reader = new StreamReader("dict.txt", System.Text.Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int frequency;
                    if (line.Length < 3 || line[1] != '|' || !int.TryParse(line.Substring(2), out frequency))
                    {
                        continue;
                    }
                    // Stored as int so the table holds the same value type as a freshly built one.
                    htDict[line.Substring(0, 1)] = frequency;
                    loaded++;
                }
            }
            return loaded;
        }

        /// <summary>
        /// Segmentation button handler. Strips everything outside \u4e00-\u9fa5 from the text in
        /// textBox1, segments it on a background worker and writes the words, each followed by
        /// "|", into textBox2. Does nothing while label1 still shows the initial
        /// "preprocess first" text or when textBox1 is blank.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            if (label1.Text == "先预处理！" || textBox1.Text.Trim() == "")
            {
                return;
            }
            textBox2.Text = "";
            // Fall back to 0.01 when textBox3 does not contain a parsable number.
            if (!double.TryParse(textBox3.Text.Trim(), out singleCutRate))
            {
                singleCutRate = 0.01;
            }
            string strSplitWords = Regex.Replace(textBox1.Text.Trim(), @"[^\u4e00-\u9fa5]", "");
            progressBar1.Visible = true;
            BackgroundWorker worker = new BackgroundWorker();
            worker.WorkerReportsProgress = true;  // report progress
            worker.DoWork += (s, o) =>
            {
                o.Result = SegmentText(strSplitWords, worker);
                worker.ReportProgress(100, null);
            };
            worker.RunWorkerCompleted += (s, o) =>
            {
                progressBar1.Visible = false;
                progressBar1.Value = 0;
                if (o.Error != null)
                {
                    // An exception inside DoWork is delivered here; do not claim success.
                    MessageBox.Show(o.Error.Message);
                    return;
                }
                // Build the output once instead of re-assigning textBox2.Text per word.
                System.Text.StringBuilder output = new System.Text.StringBuilder();
                foreach (string word in (List<string>)o.Result)
                {
                    output.Append(word).Append("|");
                }
                textBox2.Text = output.ToString();
                label2.Text = "分词完成！";
            };
            worker.ProgressChanged += (s, o) =>
            {
                progressBar1.Style = ProgressBarStyle.Continuous;
                progressBar1.Value = o.ProgressPercentage;
            };
            worker.RunWorkerAsync();
        }

        /// <summary>
        /// Splits <paramref name="text"/> into words. Starting from a single character, the
        /// candidate is extended one character at a time. With a = ratio of the candidate and
        /// b = ratio of the candidate plus the next character (see <c>CutRatio</c>), the
        /// candidate is emitted as a word when
        /// (a &lt; 1 and a &gt; b), or (a == 1 and b &lt; singleCutRate), or (a == 0 and b == 0);
        /// otherwise it is extended. A candidate that reaches the end of the text is emitted
        /// as is, and a single character left over at the end becomes its own word.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <param name="worker">Worker used to report progress after each emitted word.</param>
        /// <returns>The words in order of appearance.</returns>
        private List<string> SegmentText(string text, BackgroundWorker worker)
        {
            List<string> words = new List<string>();
            int startPos = 0;
            while (text.Length - startPos >= 2)
            {
                int length = 1;
                string current = text.Substring(startPos, length);
                double a = CutRatio(current);
                while (true)
                {
                    // Here startPos + length < text.Length, so one more character is available.
                    string extended = text.Substring(startPos, length + 1);
                    double b = CutRatio(extended);
                    if ((a < 1 && a > b) || (a == 1 && b < singleCutRate) || (a == 0 && b == 0))
                    {
                        break;
                    }
                    current = extended;
                    // The extended candidate was just scored; reuse it rather than rescanning the corpus.
                    a = b;
                    length++;
                    if (startPos + length >= text.Length)
                    {
                        break;
                    }
                }
                words.Add(current);
                startPos += current.Length;
                worker.ReportProgress((int)((double)startPos / (double)text.Length * 100), null);
            }
            // Covers both a trailing character after the last cut and an input of exactly one character.
            if (text.Length - startPos == 1)
            {
                words.Add(text.Substring(startPos, 1));
            }
            return words;
        }

        /// <summary>
        /// Occurrences of <paramref name="word"/> in the corpus divided by the summed dictionary
        /// counts of its characters; the divisor is taken as 1 when that sum is 0.
        /// </summary>
        private double CutRatio(string word)
        {
            double occurrences = (double)ReturnCount(word, arrayData);
            double total = (double)ReturnTotalCount(word);
            if (total == 0)
            {
                total = 1;
            }
            return occurrences / total;
        }

        /// <summary>
        /// Tells whether <paramref name="str"/> is non-empty and consists solely of characters
        /// in the range \u4e00-\u9fa5.
        /// </summary>
        /// <param name="str">Text to test.</param>
        /// <returns>True when every character is in the range; false otherwise, including for the empty string.</returns>
        public bool IsChinese(string str)
        {
            // \z anchors at the very end of the input. "$" would also match just before a
            // trailing newline and therefore accept text such as "中文\n".
            return Regex.IsMatch(str, @"^[\u4e00-\u9fa5]+\z");
        }

          /// <summary>
          /// Counts the non-overlapping occurrences of the literal text <paramref name="s"/>
          /// across all strings in <paramref name="d"/>.
          /// </summary>
          /// <param name="s">Text to look for; matched literally, not as a regular expression.</param>
          /// <param name="d">Strings to search; null entries are skipped.</param>
          /// <returns>Total number of occurrences found.</returns>
          public int ReturnCount(string s, string[] d)
        {
            // Escape once so characters such as '.', '(' or '*' are matched as themselves
            // instead of being interpreted as regex syntax.
            string pattern = Regex.Escape(s);
            int count = 0;
            for (int i = 0; i < d.Length; i++)
            {
                if (d[i] == null)
                {
                    continue;
                }
                count += Regex.Matches(d[i], pattern).Count;
            }
            return count;
        }

        /// <summary>
        /// Adds up the <c>htDict</c> values stored for each character of <paramref name="s"/>.
        /// Characters without an entry contribute nothing; stored values are converted with
        /// <c>Convert.ToInt32</c>.
        /// </summary>
        /// <param name="s">Text whose characters are looked up one by one.</param>
        /// <returns>The sum of the values found.</returns>
        public int ReturnTotalCount(string s)
        {
            int sum = 0;
            foreach (char ch in s)
            {
                // The Hashtable indexer yields null for a missing key.
                object frequency = htDict[ch.ToString()];
                if (frequency != null)
                {
                    sum += Convert.ToInt32(frequency);
                }
            }
            return sum;
        }
    }
}