程序采用最大匹配法完成。
其中采用的语料库来自于詹卫东:
http://ccl.pku.edu.cn/doubtfire/Course/Chinese%20Information%20Processing/2002_2003_1.htm
程序的实现也参考了他的程序。
其中数据库表 words 的结构是: wid (int), word (文本), wfreq (int)
程序中还用到另外一个自定义的结构:
/// <summary>
/// Custom record describing a single word: its ID, text, occurrence count
/// and part of speech.
/// </summary>
public struct wordsStr
{
    /// <summary>ID of the entry.</summary>
    public int wid;

    /// <summary>
    /// Text of the entry.
    /// NOTE(review): presumably mirrors the "word" column of the words table - confirm.
    /// </summary>
    public string wordPrase;

    /// <summary>Number of times the word occurs.</summary>
    public int wFreq;

    /// <summary>Part of speech of the word.</summary>
    public PartofWord eNumPoW;
}
整体程序如下:
操作流程为: 打开数据库, 将数据库中的内容读入到 DataSet 中, 并形成一个由 ArrayList 构成的字典结构。
打开要分词的文件 ---> 分词 (A: 处理非中文字符 B: 处理标点符号 C: 处理纯中文字符串)
using
System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Data.OleDb;
using System.IO;
using System.Collections.Specialized;
using System.Text;
using wordLibrary_keqian070507;
namespace NLP_WordSeg
... {
/// <summary>
/// Summary description for Form1.
/// </summary>
public class Form1 : System.Windows.Forms.Form
...{
private System.Windows.Forms.Button button1;

/// <summary>
/// Variables used for connecting to the database (DataConnect, and so on).
/// </summary>
private System.Data.OleDb.OleDbConnection _connOle;
private System.Data.DataSet _dataSet;
private string _fileNameStr;
private bool _openDBb;

// Full-width sentence-ending punctuation marks.
private char[] _separatorsC = { '!','。','?'};
private const string _strExt = "_seg";
private const string _Separator = @"";
private const int _iNumberMax = 4;

// Used to record the data. Holds 0x9FA5-0x4E00+1 slots.
// NOTE(review): presumably one bucket per character in U+4E00..U+9FA5 - confirm against the indexing code.
private ArrayList[] LibArray = new ArrayList[0x9FA5-0x4E00+1];

private System.Windows.Forms.Button btOpenFile;
private System.Windows.Forms.GroupBox groupBox1;
private System.Windows.Forms.GroupBox groupBox2;
private System.Windows.Forms.Button btMMMethod;
private System.Windows.Forms.TextBox textBox1;
private System.Windows.Forms.Label label1;

/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form: runs the designer-generated initialization, resets the
/// connection, data set, file name and open flag, and stores a fresh empty
/// ArrayList in every slot of LibArray.
/// </summary>
public Form1()
{
    // Required for Windows Forms designer support.
    InitializeComponent();

    // Constructor code that must run after InitializeComponent.
    _connOle = null;
    _dataSet = new DataSet();
    _fileNameStr = "";
    _openDBb = false;

    // Use the array's own length rather than repeating its size expression,
    // so the loop cannot drift out of sync with the declaration.
    for (int i = 0; i < LibArray.Length; i++)
    {
        LibArray[i] = new ArrayList();
    }
}
/// <summary>
/// Cleans up all resources in use.
/// </summary>
/// <param name="disposing">
/// When true, the components container is disposed (if one exists) before
/// the base class Dispose is called.
/// </param>
protected override void Dispose( bool disposing )
{
    if (disposing && components != null)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}
#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
...{
this.button1 = new System.Windows.Forms.Button();
this.btOpenFile = new System.Windows.Forms.Button();
this.groupBox1 = new System.Windows.Forms.GroupBox();
this.groupBox2 = new System.Windows.Forms.GroupBox();
this.label1 = new System.Windows.Forms.Label();
this.textBox1 = new System.Windows.Forms.TextBox();
this.btMMMethod = new System.Windows.Forms.Button();
this.groupBox1.SuspendLayout();
this.groupBox2.SuspendLayout();
this.SuspendLayout();
//
// button1
//
this.button1.FlatStyle = System.Windows.Forms.FlatStyle.Flat;
this.button1.Location = new System.Drawing.Point(16, 40);
this.button1.Name = "button1";
this.button1.Size = new System.Drawing.Size(96, 32);
this.button1.TabIndex = 0;
this.button1.Text = "载入语料库";
this.button1.Click += new System.EventHandler(this.button1_Click);
//
// btOpenFile
//
this.btOpenFile.FlatStyle = System.Windows.Forms.FlatStyle.Flat;
this.btOpenFile.Location = new System.Drawing.Point(16, 80);
this.btOpenFile.Name = "btOpenFile";
this.btOpenFile.Size = new System.Drawing.Size(96, 32);
this.btOpenFile.TabIndex = 1;
this.btOpenFile.Text = "打开处理文件";
this.btOpenFile.Click += new System.EventHandler(this.btOpenFile_Click);
//
// groupBox1
//
this.groupBox1.Controls.Add(this.button1);
this.groupBox1.Controls.Add(this.btOpenFile);
this.groupBox1.Location = new System.Drawing.Point(16, 32)
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Data.OleDb;
using System.IO;
using System.Collections.Specialized;
using System.Text;
using wordLibrary_keqian070507;
namespace NLP_WordSeg
... {
/// <summary>
/// Summary description for Form1.
/// </summary>
public class Form1 : System.Windows.Forms.Form
...{
private System.Windows.Forms.Button button1;

/// <summary>
/// Variables used for connecting to the database (DataConnect, and so on).
/// </summary>
private System.Data.OleDb.OleDbConnection _connOle;
private System.Data.DataSet _dataSet;
private string _fileNameStr;
private bool _openDBb;

// Full-width sentence-ending punctuation marks.
private char[] _separatorsC = { '!','。','?'};
private const string _strExt = "_seg";
private const string _Separator = @"";
private const int _iNumberMax = 4;

// Used to record the data. Holds 0x9FA5-0x4E00+1 slots.
// NOTE(review): presumably one bucket per character in U+4E00..U+9FA5 - confirm against the indexing code.
private ArrayList[] LibArray = new ArrayList[0x9FA5-0x4E00+1];

private System.Windows.Forms.Button btOpenFile;
private System.Windows.Forms.GroupBox groupBox1;
private System.Windows.Forms.GroupBox groupBox2;
private System.Windows.Forms.Button btMMMethod;
private System.Windows.Forms.TextBox textBox1;
private System.Windows.Forms.Label label1;

/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form: runs the designer-generated initialization, resets the
/// connection, data set, file name and open flag, and stores a fresh empty
/// ArrayList in every slot of LibArray.
/// </summary>
public Form1()
{
    // Required for Windows Forms designer support.
    InitializeComponent();

    // Constructor code that must run after InitializeComponent.
    _connOle = null;
    _dataSet = new DataSet();
    _fileNameStr = "";
    _openDBb = false;

    // Use the array's own length rather than repeating its size expression,
    // so the loop cannot drift out of sync with the declaration.
    for (int i = 0; i < LibArray.Length; i++)
    {
        LibArray[i] = new ArrayList();
    }
}
/// <summary>
/// Cleans up all resources in use.
/// </summary>
/// <param name="disposing">
/// When true, the components container is disposed (if one exists) before
/// the base class Dispose is called.
/// </param>
protected override void Dispose( bool disposing )
{
    if (disposing && components != null)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}
#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
...{
this.button1 = new System.Windows.Forms.Button();
this.btOpenFile = new System.Windows.Forms.Button();
this.groupBox1 = new System.Windows.Forms.GroupBox();
this.groupBox2 = new System.Windows.Forms.GroupBox();
this.label1 = new System.Windows.Forms.Label();
this.textBox1 = new System.Windows.Forms.TextBox();
this.btMMMethod = new System.Windows.Forms.Button();
this.groupBox1.SuspendLayout();
this.groupBox2.SuspendLayout();
this.SuspendLayout();
//
// button1
//
this.button1.FlatStyle = System.Windows.Forms.FlatStyle.Flat;
this.button1.Location = new System.Drawing.Point(16, 40);
this.button1.Name = "button1";
this.button1.Size = new System.Drawing.Size(96, 32);
this.button1.TabIndex = 0;
this.button1.Text = "载入语料库";
this.button1.Click += new System.EventHandler(this.button1_Click);
//
// btOpenFile
//
this.btOpenFile.FlatStyle = System.Windows.Forms.FlatStyle.Flat;
this.btOpenFile.Location = new System.Drawing.Point(16, 80);
this.btOpenFile.Name = "btOpenFile";
this.btOpenFile.Size = new System.Drawing.Size(96, 32);
this.btOpenFile.TabIndex = 1;
this.btOpenFile.Text = "打开处理文件";
this.btOpenFile.Click += new System.EventHandler(this.btOpenFile_Click);
//
// groupBox1
//
this.groupBox1.Controls.Add(this.button1);
this.groupBox1.Controls.Add(this.btOpenFile);
this.groupBox1.Location = new System.Drawing.Point(16, 32)