高阶HMM中文分词

最新推荐文章于 2021-08-13 17:29:26 发布

orisun

最新推荐文章于 2021-08-13 17:29:26 发布

阅读量687

点赞数

分类专栏： NLP DM AI

本文链接：https://blog.csdn.net/zhangchaoyangsun/article/details/8470266

版权

本文介绍了使用HMM（隐马尔科夫模型）进行中文分词，包括一阶和二阶HMM的模型参数学习和分词效果比较，并探讨了CRF++分词系统，强调了CRF的特征函数灵活性及其在分词准确性上的优势。

摘要由CSDN通过智能技术生成

模型的建立

一句话中出现的汉字构成观察序列，如“希腊的经济结构较特殊”对应的观察序列O={希,腊,的,经,济,结,构,较,特,殊}。所有观察值的集合至少应该包含训练集和测试集中出现的所有汉字。

状态有4种：B表示词首的汉字；M表示词语中间的汉字；E表示词尾的汉字；S表示单独的汉字构成一个词。

举例：希/B腊/E　　的/S　　经/B济/M结/M构/E　　较/S　　特/B殊/E

文本的预处理

语料库使用的是msr_training.utf8和msr_test.utf8。
由于要做分词，我们的观察值是一个一个的汉字，从文本中提取汉字的方法自然是一次读取3个字节。如果文本中含有英文符号、英文字母、阿拉伯数字等，会对提取汉字的工作造成干扰。有一种去除单字节编码字符的方法是：先利用ICTCLAS进行分词和词性标注（wordseg.cpp），然后去除词性以下列字母开头的词（posfilter.cpp）：
m:数词，里面通常包含数字
x:字符串，包含英文字母
w:标点符号，可能包含英文标点符号
t:时间，可能包含数字
另外词性为nrf(音译人名，如“阿沛·阿旺晋美”)的词也应该去掉，因为包含一个点。

wordseg.cpp

#include <string>
#include <iostream>
#define OS_LINUX
#include "ICTCLAS50.h"
using namespace std;
 
/**
 * Runs ICTCLAS over the file named on the command line and writes the
 * result to "<filename>.ws".
 *
 * Returns 1 when no filename is supplied, -1 when ICTCLAS_Init reports
 * failure, and 0 otherwise. The result of ICTCLAS_FileProcess is not
 * inspected.
 */
int main(int argc, char *argv[])
{
    // The file to process must be given as the first argument.
    if (argc < 2) {
        std::cout << "Usage:command filename" << std::endl;
        return 1;
    }

    const std::string input_path(argv[1]);
    const std::string output_path(input_path + ".ws");

    // Directory handed to ICTCLAS_Init; presumably where the ICTCLAS
    // data/configuration files live -- TODO confirm against the API docs.
    const std::string ictclas_home("/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API");

    if (!ICTCLAS_Init(ictclas_home.c_str())) {
        std::cout << "Init fails" << std::endl;
        return -1;
    }

    // Input is treated as UTF-8; the trailing 1 presumably enables
    // part-of-speech tagging in the output -- TODO confirm.
    ICTCLAS_FileProcess(input_path.c_str(), output_path.c_str(), CODE_TYPE_UTF8, 1);
    ICTCLAS_Exit();
    return 0;
}

posfilter.cpp

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>
#include<set>

using namespace std;

/**
 * Filters a "word/POS" tagged file (one whitespace-separated token per word)
 * and writes the surviving word texts to "<inputfile>.posfilter".
 *
 * A token is dropped when its POS tag starts with one of 'm', 'x', 'w', 't'
 * or when the tag is exactly "nrf". A token whose tag starts with 'w' leaves
 * a single space in its place. Kept words are concatenated without a
 * separator.
 *
 * The tag is taken after the LAST '/' of the token, so a word that itself
 * contains a slash (e.g. "//w") is still classified by its real tag. Tokens
 * with no tag at all ("abc" or "abc/") cannot be classified and are kept as
 * plain text instead of aborting the program.
 *
 * Returns 1 on a usage error or when a file cannot be opened, 0 otherwise.
 */
int main(int argc,char *argv[]){
    if(argc<2){
        std::cout<<"usage: "<<argv[0]<<" inputfile"<<std::endl;
        return 1;
    }

    // First letters of the POS tags whose words are removed.
    std::set<char> filter_set;
    filter_set.insert('m');
    filter_set.insert('x');
    filter_set.insert('w');
    filter_set.insert('t');

    std::ifstream ifs(argv[1]);
    if(!ifs){
        std::cerr<<"open file "<<argv[1]<<" failed."<<std::endl;
        return 1;
    }
    const std::string outfile(argv[1]);
    std::ofstream ofs((outfile+".posfilter").c_str());
    if(!ofs){
        std::cerr<<"open outputfile failed."<<std::endl;
        return 1;
    }

    std::string line,line_out,word;
    while(std::getline(ifs,line)){
        line_out.clear();
        std::istringstream strstm(line);
        while(strstm>>word){
            // Split on the last slash: the word text may contain '/' itself.
            const std::string::size_type pos=word.rfind('/');
            if(pos==std::string::npos || pos+1==word.size()){
                // Missing or empty tag: nothing to filter on, keep the text.
                // substr(0,npos) yields the whole token when there is no slash.
                line_out+=word.substr(0,pos);
                continue;
            }
            const std::string post=word.substr(pos+1);
            const char c=post[0];       // safe: post is non-empty here
            if(c=='w')
                line_out+=" ";
            if(filter_set.find(c)==filter_set.end() && post!="nrf"){       // tag is not in the filtered set
                // Note from the original author: for the training set a space
                // has to be appended after each word, for the test set it must
                // not be. As written, no space is appended.
                line_out+=word.substr(0,pos);
            }
        }
        ofs<<line_out<<std::endl;
    }

    ifs.close();
    ofs.close();
    return 0;
}

另外由于ICTCLAS词性标注也不是100%准确，采用上述方法并不能将单字节编码的字符去除干净，在BMES.cpp中会进行最后的检查，找到单字节字符后再手动将其删除即可。
最后请在train文档中手动去除℃和／

BMES.cpp

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>

using namespace std;

/**
 * Rewrites a whitespace-segmented text file so that every character carries
 * a position tag: "/B" (first character of a word), "/M" (inner character),
 * "/E" (last character) or "/S" (single-character word).
 *
 * Usage: <program> inputfile outputfile
 *
 * Every character is taken to be exactly 3 bytes long. A word whose byte
 * length is not a multiple of 3 is reported on stdout as "<line>: <word>"
 * and then processed anyway: words shorter than 3 bytes are skipped, a word
 * of 3 to 5 bytes is emitted whole with "/S", and longer words lose their
 * trailing remainder bytes. Each tagged word is followed by two spaces.
 *
 * Returns 1 on a usage error or when either file cannot be opened,
 * 0 otherwise.
 */
int main(int argc,char *argv[]){
	if(argc<3){
		std::cout<<"Usage: "<<argv[0]<<" inputfile outputfile"<<std::endl;
		return 1;
	}

	std::ifstream input(argv[1]);
	std::ofstream output(argv[2]);
	if(!input || !output){
		std::cerr<<"Open file failed!"<<std::endl;
		return 1;
	}

	const std::string::size_type kCharBytes=3;	// bytes per character
	std::string text_line;
	for(int line_number=1;std::getline(input,text_line);++line_number){
		std::string tagged_line;
		std::istringstream tokens(text_line);
		std::string token;
		while(tokens>>token){
			// Report words that are not a whole number of 3-byte characters,
			// but keep going so that every offender gets listed.
			if(token.size()%kCharBytes!=0){
				std::cout<<line_number<<": "<<token<<std::endl;
			}

			const std::string::size_type char_count=token.size()/kCharBytes;
			if(char_count==0)
				continue;

			if(char_count==1){
				// Single-character word: emitted whole, stray bytes included.
				tagged_line+=token;
				tagged_line+="/S";
			}else{
				// First character gets /B, last gets /E, the rest /M.
				for(std::string::size_type i=0;i<char_count;++i){
					const char *tag="/M";
					if(i==0)
						tag="/B";
					else if(i+1==char_count)
						tag="/E";
					tagged_line.append(token,i*kCharBytes,kCharBytes);
					tagged_line+=tag;
				}
			}
			tagged_line+="  ";
		}
		output<<tagged_line<<std::endl;
	}

	input.close();
	output.close();
	return 0;
}

同样要把train文本和test文本中的所有汉字录入GDBM数据库中，然后对所有汉字标记序号。

train2dict.cpp

#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<sys/stat.h>
#include<gdbm.h>
#include<ctype.h>

#define DB_FILE_BLOCK "dict_db"

int main(int argc,char* argv[]){
	if(argc<2){
		printf("Usage: %s BMES_marked_file.\n",argv[0]);
		exit(1);
	}
	
	FILE *fp;
	if((fp=fopen(argv[1],"r"))==NULL){
		perror("fopen");
		exit(1);
	}
	
	GDBM_FILE dbm_ptr;
    dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);
	
	char *v="w";
	datum key,value;
	value.dptr=v;
	value.dsize=1;
	
	char word[3]={0};
	char *line=NULL;			//循环从输入文件中读取一行，放在line中
	ssize_t read=0;
	size_t needlen=0;
	char slash='/';
	int line_no=0;
	while((read=getline(&line,&needlen,fp))!=-1){
		line_no++;
		char *begin=line;
		char *end=NULL;
		while((end=strchr(begin,slash))!=NULL){
			if(end-begin<3){
				printf("%d:%s\n",line_no,begin);
				break;
			}
			strncpy(word,end-3,3);
			key.dptr=word;
			key.dsize=3;
			//char tmp[4]={0};
			//strncpy(tmp,key.dptr,3);
			//printf("%s\t",tmp);
			gdbm_store(dbm_ptr,key,value,GDBM_REPLACE);
			begin=end+2;
		}
	}	
	free(line);