调用海量智能分词研究版的dll获取分词的结果(C#)

中文分词是中文搜索引擎的基础,主要应用在信息检索、信息挖掘、中外文对译、中文校对、自动聚类、自动分类等很多方面。

这个是我参照VC的例子修改的C#版本。^  ^

using System;
using System.Drawing;
using System.Text;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Runtime.InteropServices;
namespace ChineseParse
{

/// <summary>
/// Managed mirror of one segmentation result returned by HLSSplit.dll.
/// Instances are produced by Marshal.PtrToStructure from the pointers returned by
/// HLGetWordAt / HLGetFileKeyAt, so field order and types must match the native layout.
/// </summary>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
public struct SHLSegWord
{
    /// <summary>The word text (marshaled from a native ANSI string pointer).</summary>
    public string s_szWord;

    /// <summary>Part-of-speech flags (see the NATURE_* constants on frmHLParse).</summary>
    public int s_dwPOS;

    /// <summary>Keyword weight; 0 when the word is not a keyword.</summary>
    public float s_fWeight;
}

/// <summary>
/// HLParse 的摘要说明。
/// </summary>
public class frmHLParse : System.Windows.Forms.Form
{
// Designer-managed controls: segmentation output, text input, captions and action buttons.
private System.Windows.Forms.RichTextBox txtOutput;
private System.Windows.Forms.RichTextBox txtInput;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.Button btnExit;
private System.Windows.Forms.Button btnParse;
// Result buffers filled by ParseWord and copied into txtKey / txtOutput / lblFinger
// by btnParse_Click.
private string m_strKey;
private string m_strWords;
private string m_strFinger;
/************************************************************/
// Constant definitions
/************************************************************/
const int HL_CAL_OPT_KEYWORD = 0x1;// extra calculation flag: compute keywords
const int HL_CAL_OPT_FINGER = 0x2;// extra calculation flag: compute the semantic fingerprint
const int HL_CAL_OPT_POS = 0x4;// extra calculation flag: compute part of speech
const int HL_CAL_OPT_SEARCH = 0x8;// output a segmentation result oriented towards retrieval
/************************************************************/
// Part-of-speech flag definitions
/************************************************************/
public const int NATURE_D_A=0x40000000;// adjective / adjectival morpheme
public const int NATURE_D_B=0x20000000;// distinguishing word / distinguishing morpheme
public const int NATURE_D_C=0x10000000;// conjunction / conjunctive morpheme
public const int NATURE_D_D=0x08000000;// adverb / adverbial morpheme
public const int NATURE_D_E=0x04000000;// interjection / interjection morpheme
public const int NATURE_D_F=0x02000000;// locality word / locality morpheme
public const int NATURE_D_I=0x01000000;// idiom (chengyu)
public const int NATURE_D_L=0x00800000;// idiomatic expression
public const int NATURE_A_M=0x00400000;// numeral / numeral morpheme
public const int NATURE_D_MQ=0x00200000;// numeral-classifier compound
public const int NATURE_D_N=0x00100000;// noun / nominal morpheme
public const int NATURE_D_O=0x00080000;// onomatopoeia
public const int NATURE_D_P=0x00040000;// preposition
public const int NATURE_A_Q=0x00020000;// classifier (measure word) / classifier morpheme
public const int NATURE_D_R=0x00010000;// pronoun / pronominal morpheme
public const int NATURE_D_S=0x00008000;// place word
public const int NATURE_D_T=0x00004000;// time word
public const int NATURE_D_U=0x00002000;// particle / particle morpheme
public const int NATURE_D_V=0x00001000;// verb / verbal morpheme
public const int NATURE_D_W=0x00000800;// punctuation
public const int NATURE_D_X=0x00000400;// non-morpheme character
public const int NATURE_D_Y=0x00000200;// modal particle / modal morpheme
public const int NATURE_D_Z=0x00000100;// stative word
public const int NATURE_A_NR=0x00000080;// person name
public const int NATURE_A_NS=0x00000040;// place name
public const int NATURE_A_NT=0x00000020;// organization
public const int NATURE_A_NX=0x00000010;// foreign characters
public const int NATURE_A_NZ=0x00000008;// other proper noun
public const int NATURE_D_H=0x00000004;// prefix component
// More designer-managed controls: keyword list, option check boxes, status labels
// and the button wired to btn_Click.
private System.Windows.Forms.RichTextBox txtKey;
private System.Windows.Forms.CheckBox chkPos;
private System.Windows.Forms.CheckBox chkSeach;
private System.Windows.Forms.Label txtMsg;
private System.Windows.Forms.CheckBox chkKeyword;
private System.Windows.Forms.CheckBox chkFinger;
private System.Windows.Forms.Label lblFinger;
private System.Windows.Forms.Button btn;
public const int NATURE_D_K=0x00000002;// suffix component

// P/Invoke declarations for the native segmentation library HLSSplit.dll.
// NOTE(review): if the native functions return a 1-byte C++ bool, the default 4-byte BOOL
// marshaling of the managed bool return values may misread the result — confirm against
// the vendor header.

// Initialize the segmentation dictionary from the given directory
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLSplitInit")]
private static extern bool HLSplitInit(string path);
// Create a segmentation handle
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLOpenSplit")]
private static extern IntPtr HLOpenSplit();
// Segment a string; flag is a bitwise OR of the HL_CAL_OPT_* constants
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLSplitWord")]
private static extern bool HLSplitWord(IntPtr pHandle,string text,int flag);

// Get the number of segmented words
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetWordCnt")]
private static extern int HLGetWordCnt(IntPtr pHandle);

// Get the segmentation result at the given index (pointer to a native SHLSegWord)
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetWordAt")]
private static extern IntPtr HLGetWordAt(IntPtr pHandle,int pos);

// Get the number of keywords
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFileKeyCnt")]
private static extern int HLGetFileKeyCnt(IntPtr pHandle);

// Get the keyword at the given index (pointer to a native SHLSegWord)
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFileKeyAt")]
private static extern IntPtr HLGetFileKeyAt(IntPtr pHandle,int pos);

// Load a user-defined dictionary
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLOpenUsrDict")]
private static extern bool HLOpenUsrDict(string lpUserDictName);

// Unload the user-defined dictionary
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLFreeUsrDict")]
private static extern bool HLFreeUsrDict();

// Get the semantic fingerprint; the caller reads rdwLen bytes starting at rpData
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFingerM")]
private static extern bool HLGetFingerM(IntPtr hHandle,ref IntPtr rpData, ref Int32 rdwLen);
 
// Close a segmentation handle
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLCloseSplit")]
private static extern void HLCloseSplit(IntPtr pHandle);
// Unload the segmentation system (dictionary)
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLFreeSplit")]
private static extern void HLFreeSplit();
 
/// <summary>
/// Container required by the Windows Forms designer. It starts out null, which is
/// why Dispose checks it before use.
/// </summary>
private System.ComponentModel.Container components = null;

/// <summary>
/// Creates the form and builds its controls through the designer-generated
/// InitializeComponent method.
/// </summary>
public frmHLParse()
{
    // Required for Windows Forms designer support.
    InitializeComponent();
}
/// <summary>
/// Releases the resources held by the form.
/// </summary>
/// <param name="disposing">
/// True when called from an explicit Dispose; only then is the designer
/// component container disposed.
/// </param>
protected override void Dispose( bool disposing )
{
    bool ownsComponents = disposing && components != null;
    if (ownsComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}

#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for designer support - do not modify the contents of this
/// method with the code editor. Creates every control, sets its position, size,
/// tab order and caption, wires the three button Click handlers, and adds the
/// controls to the form.
/// </summary>
private void InitializeComponent()
{
this.txtOutput = new System.Windows.Forms.RichTextBox();
this.txtInput = new System.Windows.Forms.RichTextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.btnExit = new System.Windows.Forms.Button();
this.btnParse = new System.Windows.Forms.Button();
this.txtKey = new System.Windows.Forms.RichTextBox();
this.chkPos = new System.Windows.Forms.CheckBox();
this.chkKeyword = new System.Windows.Forms.CheckBox();
this.chkFinger = new System.Windows.Forms.CheckBox();
this.chkSeach = new System.Windows.Forms.CheckBox();
this.txtMsg = new System.Windows.Forms.Label();
this.lblFinger = new System.Windows.Forms.Label();
this.btn = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// txtOutput (read-only box that receives the segmented text)
//
this.txtOutput.Location = new System.Drawing.Point(34, 208);
this.txtOutput.Name = "txtOutput";
this.txtOutput.ReadOnly = true;
this.txtOutput.Size = new System.Drawing.Size(488, 136);
this.txtOutput.TabIndex = 1;
this.txtOutput.Text = "";
//
// txtInput (editable box pre-filled with a sample sentence)
//
this.txtInput.Location = new System.Drawing.Point(34, 4);
this.txtInput.Name = "txtInput";
this.txtInput.Size = new System.Drawing.Size(488, 154);
this.txtInput.TabIndex = 0;
this.txtInput.Text = "海量中文智能分词基础件具有灵活定制的特点,支持多平台、 支持多码制、 针对不同应用可量身定做多种版本";
//
// label1
//
this.label1.Location = new System.Drawing.Point(6, 6);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(24, 36);
this.label1.TabIndex = 2;
this.label1.Text = "输入";
//
// label2
//
this.label2.Location = new System.Drawing.Point(4, 204);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(24, 36);
this.label2.TabIndex = 3;
this.label2.Text = "输出";
//
// btnExit
//
this.btnExit.Location = new System.Drawing.Point(604, 352);
this.btnExit.Name = "btnExit";
this.btnExit.TabIndex = 5;
this.btnExit.Text = "退出";
this.btnExit.Click += new System.EventHandler(this.btnExit_Click);
//
// btnParse
//
this.btnParse.Location = new System.Drawing.Point(272, 352);
this.btnParse.Name = "btnParse";
this.btnParse.TabIndex = 4;
this.btnParse.Text = "分词";
this.btnParse.Click += new System.EventHandler(this.btnParse_Click);
//
// txtKey (read-only box that receives the keyword list)
//
this.txtKey.Location = new System.Drawing.Point(528, 4);
this.txtKey.Name = "txtKey";
this.txtKey.ReadOnly = true;
this.txtKey.Size = new System.Drawing.Size(160, 340);
this.txtKey.TabIndex = 6;
this.txtKey.Text = "";
//
// chkPos
//
this.chkPos.Location = new System.Drawing.Point(42, 164);
this.chkPos.Name = "chkPos";
this.chkPos.Size = new System.Drawing.Size(74, 20);
this.chkPos.TabIndex = 7;
this.chkPos.Text = "词性";
//
// chkKeyword (checked by default)
//
this.chkKeyword.Checked = true;
this.chkKeyword.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkKeyword.Location = new System.Drawing.Point(120, 164);
this.chkKeyword.Name = "chkKeyword";
this.chkKeyword.Size = new System.Drawing.Size(74, 20);
this.chkKeyword.TabIndex = 8;
this.chkKeyword.Text = "关键词";
//
// chkFinger (checked by default)
//
this.chkFinger.Checked = true;
this.chkFinger.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkFinger.Location = new System.Drawing.Point(198, 164);
this.chkFinger.Name = "chkFinger";
this.chkFinger.Size = new System.Drawing.Size(74, 20);
this.chkFinger.TabIndex = 9;
this.chkFinger.Text = "语义指纹";
//
// chkSeach
//
this.chkSeach.Location = new System.Drawing.Point(276, 164);
this.chkSeach.Name = "chkSeach";
this.chkSeach.Size = new System.Drawing.Size(74, 20);
this.chkSeach.TabIndex = 10;
this.chkSeach.Text = "检索优化";
//
// txtMsg (label that shows the elapsed segmentation time)
//
this.txtMsg.Location = new System.Drawing.Point(356, 164);
this.txtMsg.Name = "txtMsg";
this.txtMsg.Size = new System.Drawing.Size(166, 20);
this.txtMsg.TabIndex = 11;
this.txtMsg.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// lblFinger (label that shows the semantic fingerprint)
//
this.lblFinger.Location = new System.Drawing.Point(36, 188);
this.lblFinger.Name = "lblFinger";
this.lblFinger.Size = new System.Drawing.Size(486, 18);
this.lblFinger.TabIndex = 12;
this.lblFinger.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// btn
//
this.btn.Location = new System.Drawing.Point(32, 352);
this.btn.Name = "btn";
this.btn.TabIndex = 13;
this.btn.Text = "其它";
this.btn.Click += new System.EventHandler(this.btn_Click);
//
// frmHLParse
//
this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
this.ClientSize = new System.Drawing.Size(696, 381);
this.Controls.Add(this.btn);
this.Controls.Add(this.lblFinger);
this.Controls.Add(this.txtMsg);
this.Controls.Add(this.chkSeach);
this.Controls.Add(this.chkFinger);
this.Controls.Add(this.chkKeyword);
this.Controls.Add(this.chkPos);
this.Controls.Add(this.txtKey);
this.Controls.Add(this.btnExit);
this.Controls.Add(this.btnParse);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.txtOutput);
this.Controls.Add(this.txtInput);
this.Name = "frmHLParse";
this.Text = "HLParse";
this.ResumeLayout(false);

}
#endregion

/// <summary>
/// Main entry point of the application: creates the segmentation form and runs
/// the message loop on it.
/// </summary>
[STAThread]
static void Main()
{
    frmHLParse mainForm = new frmHLParse();
    Application.Run(mainForm);
}
/// <summary>
/// Click handler of the segment button: clears the result buffers, segments the
/// input text, then shows the words, keywords and fingerprint in their controls.
/// </summary>
private void btnParse_Click(object sender, System.EventArgs e)
{
    // Start every run from empty buffers; ParseWord appends to them.
    m_strWords = m_strFinger = m_strKey = "";

    ParseWord(txtInput.Text);

    txtOutput.Text = m_strWords;
    txtKey.Text = m_strKey;
    lblFinger.Text = m_strFinger;
}

/// <summary>
/// Click handler of the exit button: closes the form.
/// </summary>
private void btnExit_Click(object sender, System.EventArgs e)
{
    Close();
}
/// <summary>
/// Segments <paramref name="text"/> with HLSSplit.dll and appends the results to the
/// form's buffers: m_strWords receives the words separated by '|' (with a
/// part-of-speech tag when chkPos is checked), m_strKey receives the numbered keyword
/// list, and m_strFinger receives the semantic fingerprint as hex bytes. The elapsed
/// time of the segmentation call is shown in txtMsg. Failures are reported with a
/// message box.
/// </summary>
/// <param name="text">The text to segment.</param>
private void ParseWord(string text)
{
    // Look for the dictionary next to the executable instead of a hard-coded
    // developer path, so the program also works on other machines.
    string dictPath = Application.StartupPath + System.IO.Path.DirectorySeparatorChar;
    if (!HLSplitInit(dictPath))
    {
        MessageBox.Show("初始化分词字典失败!","错误");
        return;
    }

    try
    {
        IntPtr hHandle = HLOpenSplit();
        if (hHandle == IntPtr.Zero)
        {
            MessageBox.Show("创建分词句柄失败!","错误");
            return;
        }

        try
        {
            // Combine the optional calculations selected in the UI.
            int iExtraCalcFlag = 0;
            if (this.chkPos.Checked)
            {
                iExtraCalcFlag |= HL_CAL_OPT_POS;
            }
            if (this.chkKeyword.Checked)
            {
                iExtraCalcFlag |= HL_CAL_OPT_KEYWORD;
            }
            if (this.chkSeach.Checked)
            {
                iExtraCalcFlag |= HL_CAL_OPT_SEARCH;
            }
            if (this.chkFinger.Checked)
            {
                iExtraCalcFlag |= HL_CAL_OPT_FINGER;
            }

            DateTime bgdt = DateTime.Now;
            bool bSuccess = HLSplitWord(hHandle, text, iExtraCalcFlag);
            System.TimeSpan ts = DateTime.Now - bgdt;
            this.txtMsg.Text = string.Format("用时{0}分{1}秒{2}毫秒",ts.Minutes,ts.Seconds,ts.Milliseconds);

            if (!bSuccess)
            {
                MessageBox.Show("分词失败!","错误");
                return;
            }

            m_strWords += ReadSegmentedWords(hHandle);
            if (this.chkKeyword.Checked)
            {
                m_strKey += ReadKeywords(hHandle);
            }
            if (this.chkFinger.Checked)
            {
                m_strFinger = ReadFinger(hHandle);
            }
        }
        finally
        {
            // Always close the handle, even if reading the results throws.
            HLCloseSplit(hHandle);
        }
    }
    finally
    {
        // Always unload the dictionary once it has been initialized.
        HLFreeSplit();
    }
}

// Builds the '|'-separated word list (with POS tags when chkPos is checked).
private string ReadSegmentedWords(IntPtr hHandle)
{
    StringBuilder words = new StringBuilder();
    int nResultCnt = HLGetWordCnt(hHandle);
    for (int i = 0; i < nResultCnt; i++)
    {
        IntPtr pNative = HLGetWordAt(hHandle, i);
        if (pNative == IntPtr.Zero)
        {
            // PtrToStructure would throw on a null pointer; skip the entry instead.
            continue;
        }

        SHLSegWord word = (SHLSegWord)Marshal.PtrToStructure(pNative, typeof(SHLSegWord));
        words.Append(word.s_szWord);
        if (this.chkPos.Checked)
        {
            words.Append(GetNatureString(word.s_dwPOS));
        }
        words.Append("|");
    }
    return words.ToString();
}

// Builds the keyword list, one "index.word weight" entry per line.
private string ReadKeywords(IntPtr hHandle)
{
    StringBuilder keys = new StringBuilder();
    int nKeyCnt = HLGetFileKeyCnt(hHandle);
    for (int j = 0; j < nKeyCnt; j++)
    {
        IntPtr pNative = HLGetFileKeyAt(hHandle, j);
        if (pNative == IntPtr.Zero)
        {
            continue;
        }

        SHLSegWord key = (SHLSegWord)Marshal.PtrToStructure(pNative, typeof(SHLSegWord));
        if (key.s_szWord == null || key.s_szWord == "")
        {
            continue;
        }

        // "\r\n" ends each entry (the pasted source had a mangled "/r/n" here).
        keys.AppendFormat("{0}.{1} {2}\r\n", j + 1, key.s_szWord, key.s_fWeight);
    }
    return keys.ToString();
}

// Builds the fingerprint text: the caption followed by space-separated hex bytes.
private string ReadFinger(IntPtr hHandle)
{
    StringBuilder finger = new StringBuilder("语义指纹:");

    // The scratch buffer is passed by reference as in the original code. It is
    // tracked separately so it can be freed even if the DLL redirects the pointer
    // to a buffer of its own.
    IntPtr scratch = Marshal.AllocHGlobal(64);
    try
    {
        IntPtr pData = scratch;
        Int32 dataLen = 0;
        if (HLGetFingerM(hHandle, ref pData, ref dataLen) && pData != IntPtr.Zero)
        {
            for (int j = 0; j < dataLen; j++)
            {
                // ReadByte with an offset avoids IntPtr.ToInt32 overflow on 64-bit;
                // "x2" keeps every byte two digits wide so the output is unambiguous.
                finger.Append(Marshal.ReadByte(pData, j).ToString("x2"));
                finger.Append(" ");
            }
        }
    }
    finally
    {
        Marshal.FreeHGlobal(scratch);
    }
    return finger.ToString();
}
// Struct wrapping a single byte array. It is not referenced anywhere in the
// visible code.
public struct aaaa
{
public byte[] data;
}
// Part-of-speech flags in matching priority order; NatureTags holds the tag for
// the flag at the same index.
private static readonly int[] NatureFlags = new int[]
{
    NATURE_D_A, NATURE_D_B, NATURE_D_C, NATURE_D_D, NATURE_D_E, NATURE_D_F,
    NATURE_D_I, NATURE_D_L, NATURE_A_M, NATURE_D_MQ, NATURE_D_N, NATURE_D_O,
    NATURE_D_P, NATURE_A_Q, NATURE_D_R, NATURE_D_S, NATURE_D_T, NATURE_D_U,
    NATURE_D_V, NATURE_D_W, NATURE_D_X, NATURE_D_Y, NATURE_D_Z, NATURE_A_NR,
    NATURE_A_NS, NATURE_A_NT, NATURE_A_NX, NATURE_A_NZ, NATURE_D_H, NATURE_D_K
};

private static readonly string[] NatureTags = new string[]
{
    "a", "b", "c", "d", "e", "f",
    "i", "l", "m", "mq", "n", "o",
    "p", "q", "r", "s", "t", "u",
    "v", "w", "x", "y", "z", "nr",
    "ns", "nt", "nx", "nz", "h", "k"
};

/// <summary>
/// Converts a part-of-speech flag word into a display suffix such as ".n" or ".v".
/// </summary>
/// <param name="dwPos">Bitwise combination of the NATURE_* flags.</param>
/// <returns>
/// "." followed by the tag of the first flag (in declaration priority order) set in
/// <paramref name="dwPos"/>, or ".?" when none of the known flags is set.
/// </returns>
private string GetNatureString(int dwPos)
{
    for (int i = 0; i < NatureFlags.Length; i++)
    {
        if ((dwPos & NatureFlags[i]) == NatureFlags[i])
        {
            // Every tag gets exactly one leading dot; the original emitted "..r" and
            // "..t" for pronouns and time words.
            return "." + NatureTags[i];
        }
    }

    return ".?"; // unknown part of speech
}

/// <summary>
/// Click handler of the "其它" button: shows Form1 as a modal dialog.
/// </summary>
private void btn_Click(object sender, System.EventArgs e)
{
    // A form shown with ShowDialog is not disposed when it closes, so release it
    // explicitly. NOTE(review): Form1 is declared outside this file; this assumes it
    // derives from System.Windows.Forms.Form.
    using (Form1 frm = new Form1())
    {
        frm.ShowDialog();
    }
}
}
}

 海量智能分词研究版下载:http://www.hylanda.com/cgi-bin/download/download.asp?id=8

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
1、 修改字典格式,提高字典加载速度 2、 增加对英文专业名词的支持 如C++,C#等只要加入字典就可以被分出来 3、 增加词频判断功能,在无法取舍时根据词频取舍 4、 增加优先词频选项,通过这个选项动态决定分词粒度 需打开 FreqFirst 5、 增加中文人名前后缀统计和根据该统计定位人名的功能 6、 增加中文人名和未登录词出现频率统计功能 7、 增加自动更新字典功能,对超过阈值的人名和未登录词自动插入字典 需打开 AutoInsertUnknownWords 开关 并设置 UnknownWordsThreshold,(不推荐自动插入,推荐手工插入) 8、 增加定期保存字典和统计结果功能 需设置 AutoSaveInterval 9、 增加KTDictSeg.xml配置文件来配置分词参数 10、增加对Lucene.net 的支持,提供 KTDictSegAnalyzer 分析器给Lucene.net 11、增加字典管理功能,可以添加删除修改字典 12、字典管理中提供从未登录词中批量插入字典功能,可帮助使用者手工选择合适的未登录词插入字典(推荐) 13、提供一个新闻搜索的简单例子,采用Lucene.net+KTDictSegAnalyzer+KTDictSeg,项目名为Demo.KTDictSegAnalyzer 14、将所有ArrayList 改为List 其中 src_V1.3.01是源码 rel_V1.3.01 包含所有的可执行文件,配置文件;Data目录下是词库,停用词表,以及我目前统计的人名前后缀词表;News 目录下是Lucene.net为 新闻搜索的例子建的索引。 News.zip 是上图中批量插入时要输入的XML文件,它包含3万条从新浪和中华网抓下来的过时的新闻,大约2000万字左右,可供各位朋友学习使用。 注意:如果要导入news.xml,这个文件必须要和Demo.KTDictSegAnalyzer.exe放在同一个目录下!
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值