使用C++实现mmseg,对中文句子进行分词

想要了解更多,可以看我在OSCHINA分享的代码,

http://www.oschina.net/action/code/download?code=7969&id=12697


这里我的这段代码为了实现的简单化,假设传进来的参数都是中文字符串,也就是不含有字母数字类似的东西。 


例如 string test="研究生命起源"; 
分词效果为 : 研究 生命 起源,还算正确。 
初始化过程中,有很多文件读取的操作,其中有一个文件存储了我们需要的词库,另外还有一个词频的文件。 
主要的算法采用递归来实现,主要是考虑到实现的方便性;如果实在不喜欢递归,可以自行改写出一个非递归的版本。但递归实现导致当字符串很长时需要很长时间才能完成分词,慢到几乎不能忍受:时间复杂度接近 2^n,其中 n 为字符串中的中文字个数。

PS:在Ubuntu下的eclipse编译通过的代码,这里一个中文字是三个字节,如果在windows下Visual Studio,是两个字节,这个我在开头通过测试一个字符串的长度来获取,这样用稍微变通的方法让这个可以在不同平台直接使用。 

代码见附件。里面有词库加上代码。

/*
 * buildTerms.cpp
 *
 *  Created on: Dec 21, 2011
 *      Author: lai_conglin
 */

# include <iostream>
# include <map>
# include <string>
# include <fstream>
# include <set>
# include <vector>
# include <algorithm>
# include <time.h>
#include <stdio.h>
using namespace std;
map<string, set<string> > content;
map<string, int> freq;
size_t ChineseLength;
// Detect how many bytes one Chinese character occupies in this build
// (3 under Linux/UTF-8, 2 under Windows/GBK) by measuring a known
// two-character probe string, and store it in the global ChineseLength.
void initial() {
	const string probe = "测试"; // exactly two Chinese characters
	ChineseLength = probe.length() / 2;
}
/**
 * A word (single character or multi-character lexicon term) paired with
 * its frequency. Single characters get their frequency from the global
 * `freq` table; multi-character terms default to 1.
 */
struct Word {
private:
	string value;   // the word text (ChineseLength bytes per character)
	int frequence;  // looked-up frequency, >= 1
public:
	Word() : frequence(1) {
		// fix: frequence was previously left uninitialized until
		// setValue() ran; default it to the neutral value 1.
	}
	// Set the word text and resolve its frequency.
	void setValue(string src) {
		value = src;
		/**
		 * a single character will need to find its frequence , and a term will be 1 default
		 */
		if (value.length() == ChineseLength) {
			map<string, int>::iterator it = freq.find(value);
			// fix: reuse the iterator instead of a second map lookup
			frequence = (it == freq.end()) ? 1 : it->second;
		} else {
			frequence = 1;
		}
	}
	string getValue() {
		return value;
	}
	// Length of the word in BYTES (not characters).
	int getLength() {
		return value.length();
	}
	int getFrequency() {
		return frequence;
	}
};
/**
 * One candidate segmentation of (a suffix of) the input: an ordered
 * list of Words plus the statistics the mmseg tie-break rules need.
 */
struct Chunk {
public:
	vector<Word> list;

	Chunk() {
	}
	/**
	 * Append one word to this candidate.
	 */
	void addWord(Word w) {
		list.push_back(w);
	}
	// Word count of the candidate (mmseg rule 1).
	int getWordNumber() {
		return list.size();
	}
	// Sum of squared deviations of word lengths from their mean,
	// offset by 1.0 (the original implementation's constant offset is
	// kept; it shifts all candidates equally so orderings are stable).
	double getVariance() {
		const int count = list.size();
		double mean = 0.0;
		for (int idx = 0; idx < count; idx++) {
			mean += list.at(idx).getLength();
		}
		mean = 1.0 * mean / count;
		double accum = 1.0;
		for (int idx = 0; idx < count; idx++) {
			const double diff = mean - list.at(idx).getLength();
			accum += diff * diff;
		}
		return accum;
	}
	// Total frequency of all words in the candidate (mmseg rule 3).
	long getFreq() {
		long total = 0;
		const int count = list.size();
		for (int idx = 0; idx < count; idx++) {
			total += list.at(idx).getFrequency();
		}
		return total;
	}
	// The candidate as a plain list of word strings.
	vector<string> getVectorString() {
		vector<string> words;
		const int count = list.size();
		for (int idx = 0; idx < count; idx++) {
			words.push_back(list.at(idx).getValue());
		}
		return words;
	}
};
/**
 * Read the raw lexicon file and index every term in the global
 * `content` map under the term's first character.
 */
void read_terms_from_Lexicon() {
	ifstream fin("Lexicon_full_words.txt");
	string s;
	//ignore the first one string, I don't know why
	fin >> s;
	cout << "start read from Lexicon_full_words.txt" << endl;
	cout << "reading terms: -> " << endl;

	while (fin >> s) {
		// One character occupies ChineseLength bytes (3 on Linux/UTF-8,
		// 2 on Windows); key each term by its first character.
		string head = s.substr(0, ChineseLength);
		// fix: insert directly into the mapped set instead of copying
		// the entire set out of the map and back for every term.
		content[head].insert(s);
	}
	cout << "finish read the lexicon." << endl;
	cout << "finish read the lexicon .lexicon size:" << content.size() << endl;

	fin.close();
}
void show_Lexicon() {
	map<string, set<string> >::iterator lexiconIterator;
	int count = 0;
	for (lexiconIterator = content.begin(); lexiconIterator != content.end();
			lexiconIterator++) {
		string first = lexiconIterator->first;
		set<string> second = lexiconIterator->second;
		cout << first << ":";
		set<string>::iterator setIt;
		for (setIt = second.begin(); setIt != second.end(); setIt++) {
			cout << *setIt << " ";
		}
		cout << endl;
		count++;
		if (count == 10) {
			break;
		}
	}
	cout << "lexicon size:" << content.size() << endl;
}
/**
 * Persist the in-memory lexicon index to "lexicon.index" in the format
 * build_index() reads back: a "#:<key>" line followed by one line of
 * space-separated terms filed under that key.
 */
void write_index() {
	fstream out("lexicon.index", fstream::out);
	map<string, set<string> >::iterator entry;
	for (entry = content.begin(); entry != content.end(); ++entry) {
		out << "#:" << entry->first << endl;
		set<string>::iterator term;
		for (term = entry->second.begin(); term != entry->second.end(); ++term) {
			out << *term << " ";
		}
		out << endl;
	}
	cout << "write index: lexicon size:" << content.size() << endl;
	out.close();
}
/**
 * Persist the frequency table to "freq.index" as one "<word> <count>"
 * pair per line, the format build_freq() reads back.
 */
void write_freq() {
	fstream out("freq.index", fstream::out);
	map<string, int>::iterator entry;
	for (entry = freq.begin(); entry != freq.end(); ++entry) {
		out << entry->first << " " << entry->second;

		out << endl;
	}
	cout << "write index: freq size:" << freq.size() << endl;
	out.close();

}
/**
 * Load the word-frequency table from "freq.index" into the global
 * `freq` map. Only counts above 1, or single-character entries, are
 * stored (anything else adds nothing over the default frequency of 1).
 */
void build_freq() {
	ifstream fin("freq.index");
	string word;
	int count;
	// fix: read word and count in one loop condition; previously a
	// trailing word without a count would pair with a stale (or, on the
	// first iteration, uninitialized) value of the count variable.
	while (fin >> word >> count) {
		if (count > 1 || word.length() == ChineseLength) {
			freq[word] = count;
		}
	}
	fin.close();
	cout << "index: freq size:" << freq.size() << endl;
}

/**
 * Load the lexicon index from "lexicon.index" into the global
 * `content` map, building the file first from the raw lexicon when it
 * does not exist, then load the frequency table.
 * File format: a "#:<key>" token starts a new entry; every following
 * token until the next "#:" token is a term filed under that key.
 */
void build_index() {
	ifstream fin("lexicon.index");
	// fix: idiomatic stream-state test instead of `fin.good() == 0`
	if (!fin.good()) {
		cout << "build index , need some time , please wait for a moment! \n";
		read_terms_from_Lexicon();
		write_index();
		fin.open("lexicon.index", ifstream::in);
	}
	cout << "hello ,begin load index \n";
	string token;
	set<string> terms;
	string key = "";
	while (fin >> token) {
		if (token.find("#:", 0) == 0) {
			// a new key begins: flush the previous entry first
			if (key != "")
				content[key] = terms;
			terms.clear();
			key = token.substr(2);
		} else {
			terms.insert(token);
		}
	}
	// fix: guard the final flush — an empty index file previously
	// inserted a bogus entry under the empty-string key
	if (key != "")
		content[key] = terms;
	cout << "lexicon size:" << content.size() << endl;
	fin.close();
	build_freq();
}

// All complete segmentation candidates collected by mmseg_recursion().
vector<Chunk> chunklist;
// Word count of the shortest complete candidate found so far; used by
// mmseg_recursion() to prune paths that are already longer.
int minChunkWordNumber;
/**
 * Depth-first enumeration of every possible segmentation of `src`;
 * each complete candidate is appended to the global `chunklist`.
 *
 * `src` is the not-yet-segmented suffix of the sentence (assumed to be
 * pure Chinese text, ChineseLength bytes per character). `tempChunk`
 * holds the words chosen so far on the current path; it is received BY
 * VALUE, so each call mutates only its own copy, and the pop_back()
 * after each recursive call removes the tentative word before the next
 * alternative is tried. The global `minChunkWordNumber` tracks the
 * smallest word count of any complete candidate and prunes paths that
 * are already longer. Worst-case running time is exponential in the
 * number of characters (see the blog note above: ~2^n).
 */
void mmseg_recursion(string src, Chunk tempChunk) {
	set<string> termslist;
	int i = 0;
	int len = src.length();
	//get the single character.
	string singleWordStr;
	Word tempWord;
	singleWordStr = src.substr(0, ChineseLength);
	tempWord.setValue(singleWordStr);

	//if the character is the end character , return
	if ((i + ChineseLength) >= len) {
		// only one character remains: close this candidate and record
		// its word count if it is a new minimum
		tempChunk.addWord(tempWord);
		chunklist.push_back(tempChunk);
		if (minChunkWordNumber > tempChunk.getWordNumber()) {
			minChunkWordNumber = tempChunk.getWordNumber();
		}
		return;
	} else {
		string tempStr;
		map<string, set<string> >::iterator lexIt;
		lexIt = content.find(singleWordStr);
		//if the lexicon has no term starting with this character,
		//take it as a single-character word and recurse on the rest
		if (lexIt == content.end()) {
			tempChunk.addWord(tempWord);
			string remain = src.substr(ChineseLength);
			mmseg_recursion(remain, tempChunk);
			tempChunk.list.pop_back();
		} else {
			// try every lexicon term that starts with this character
			termslist = content[singleWordStr];
			set<string>::iterator setIt;
			vector<string> termsVector;
			for (setIt = termslist.begin(); setIt != termslist.end(); setIt++) {
				tempStr = *setIt;
				size_t foundit = src.find(tempStr, 0);
				// the term must match at position 0 of the remaining
				// text; the single-character case is handled after the
				// loop, so it is skipped here
				if (foundit == 0 && tempStr != singleWordStr) {
					tempWord.setValue(tempStr);
					tempChunk.addWord(tempWord);
					// prune: already worse than the best complete candidate
					if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
						tempChunk.list.pop_back();
						return;
					}

					//if the term has all remain character of string
					//return
					if (tempStr.length() == src.length()) {
						chunklist.push_back(tempChunk);
						if (minChunkWordNumber > tempChunk.getWordNumber()) {
							minChunkWordNumber = tempChunk.getWordNumber();
						}
						return;
					}
					string remain = src.substr(tempStr.length());
					mmseg_recursion(remain, tempChunk);
					tempChunk.list.pop_back();
				}
			}
			//process the single character situation
			tempStr = singleWordStr;
			tempWord.setValue(tempStr);
			tempChunk.addWord(tempWord);
			if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
				tempChunk.list.pop_back();
				return;
			}
			string remain = src.substr(tempStr.length());
			mmseg_recursion(remain, tempChunk);
			tempChunk.list.pop_back();

		}
	}
}

/**
 * Segment `src` with the mmseg algorithm: enumerate every candidate
 * segmentation via mmseg_recursion(), then pick one by three
 * successive tie-break rules:
 *   1. fewest words,
 *   2. smallest variance of word lengths,
 *   3. highest total word frequency.
 * Returns the chosen segmentation as a list of words.
 */
vector<string> mmseg(string src) {
	chunklist.clear();
	minChunkWordNumber = 0x7ffffff0;
	Chunk tempChunk;
	vector<int> indexInChunkList;
	int min = 0x7fffffff;
	mmseg_recursion(src, tempChunk);
	int chunkListSize = chunklist.size();
	if (chunkListSize == 1) {
		return chunklist.at(0).getVectorString();
	}
	//rule 1: keep the candidates with the fewest words
	for (int i = 0; i < chunkListSize; i++) {
		int words = chunklist.at(i).getWordNumber();
		if (words < min) {
			min = words;
			indexInChunkList.clear();
			indexInChunkList.push_back(i);
		} else if (words == min) {
			indexInChunkList.push_back(i);
		}
	}
	if (indexInChunkList.size() == 1) {
		return chunklist.at(indexInChunkList.at(0)).getVectorString();
	}
	//rule 2: keep the candidates with the least variance of word length
	double minVariance = min * src.length() * src.length();
	vector<int> rule1Survivors = indexInChunkList;
	indexInChunkList.clear();
	for (size_t index = 0; index < rule1Survivors.size(); index++) {
		int i = rule1Survivors.at(index);
		// fix: compute the variance once per candidate instead of
		// up to three times
		double variance = chunklist.at(i).getVariance();
		if (variance < minVariance) {
			minVariance = variance;
			indexInChunkList.clear();
			indexInChunkList.push_back(i);
		} else if (variance == minVariance) {
			indexInChunkList.push_back(i);
		}
	}
	// fix: if the initial variance bound undercut every candidate,
	// nothing survived rule 2; fall back to the rule-1 survivors
	// instead of crashing on an empty list below.
	if (indexInChunkList.empty()) {
		indexInChunkList = rule1Survivors;
	}
	if (indexInChunkList.size() == 1) {
		return chunklist.at(indexInChunkList.at(0)).getVectorString();
	}
	//rule 3: keep the candidate with the most frequent terms
	vector<int> rule2Survivors = indexInChunkList;
	indexInChunkList.clear();
	long max = 0;
	int survivorCount = rule2Survivors.size();
	for (int index = 0; index < survivorCount; index++) {
		int i = rule2Survivors.at(index);
		// fix: compute the total frequency once per candidate
		long freqValue = chunklist.at(i).getFreq();
		if (freqValue > max) {
			max = freqValue;
			indexInChunkList.clear();
			indexInChunkList.push_back(i);
		} else if (freqValue == max) {
			indexInChunkList.push_back(i);
		}
	}
	return chunklist.at(indexInChunkList.at(0)).getVectorString();
}
// Print a segmentation result as space-separated words on one line.
void showTermsSegment(vector<string> src) {
	cout << "segment like this:";
	for (size_t idx = 0; idx < src.size(); idx++) {
		cout << src.at(idx) << " ";
	}
	cout << endl;
}
/**
 * Demo driver: build the lexicon/frequency indexes, then segment two
 * sample sentences and print the results.
 * (fix: removed the long chain of dead `test = ...` reassignments and
 * commented-out debug experiments — only the final values were used.)
 */
int main() {
	initial();
	build_index();

	// First sample: a long pure-Chinese sentence (worst case for the
	// exponential recursion).
	string test = "从此开始了新中国的伟大篇章中华人民共和国在一九五五年建立主要是因为研究生命起源北京天安门";
	vector<string> seg = mmseg(test);
	cout << endl;
	cout << "test string :" << test << endl;
	showTermsSegment(seg);

	// Second sample: exercises the "研究生 / 生命" style ambiguity.
	test = "主要是因为研究生死";
	seg = mmseg(test);
	cout << endl;
	cout << "test string :" << test << endl;
	showTermsSegment(seg);

	return 0;
}




  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值