// N-gram word segmentation (unigram / bigram / trigram dictionaries)

#include "iostream"
#include "string.h"
#include "string"
#include "map"
#include "fstream"
#include "set"
#include "vector"
#include "stdio.h"
using namespace std;

// File-level state shared by every function below.
vector<string> words;          //Corpus tokens in reading order, stop words excluded (filled by makeWords)
set<string> stopWords;         //Stop words (filled by readStopWords)
map<string, float> dictMakeUni;  //Unigram occurrence counts, used while building the unigram dictionary
map<string, float> dictMakeBi;   //Bigram occurrence counts, used while building the bigram dictionary
map<string, float> dictMakeTri;  //Trigram occurrence counts, used while building the trigram dictionary
map<string, float> dictReadUni;  //Unigram dictionary as read back from disk (key -> relative frequency)
map<string, float> dictReadBi;   //Bigram dictionary as read back from disk (key -> relative frequency)
map<string, float> dictReadTri;  //Trigram dictionary as read back from disk (key -> relative frequency)
vector<string> wordList;       //Words of the segmentation currently being explored for the test sentence
const int maxLength = 20;      //Per the original author's note: longest word is 10 characters, one Chinese character taking 2 bytes; used as a loop bound in biGram/triGram
const int laplace = 1;  //Smoothing value (not referenced by the code visible in this file)
string stringBi;     //Best segmentation found by the bigram model
string stringTri;    //Best segmentation found by the trigram model
double bestRate = 0;  //Probability of the best segmentation found so far
float smallP;        //Small fallback probability, set by makeWords to 1/(number of corpus tokens)
int allNum = 0;      //Total number of corpus tokens, set by makeWords

//Loads the stop-word list into the global `stopWords` set, replacing whatever
//it held before. Per the original author these are just punctuation marks.
//
//  file : path of a text file with one stop word per line.
//
//If the file cannot be opened a message is printed and the set is left empty.
//Lines are read into a std::string, so neither a missing file nor an
//over-long line can leave the stream stuck in a failed state (the previous
//`while(!fin.eof())` loop over a 20-byte buffer never terminated in those
//cases). The parameter is `const char*` so string literals can be passed.
void readStopWords(const char *file)
{
	stopWords.clear();
	ifstream fin(file);
	if(!fin)
	{
		cout << "停用词文件不存在";
		return;
	}
	string line;
	while(getline(fin, line))  //stops on EOF or on any read error
		stopWords.insert(line);
}

//Builds the token list and the unigram dictionary from a corpus file.
//
//  file : path of the corpus; each line holds tokens separated by spaces.
//
//Effects:
//  * `words` receives every token that is not in `stopWords`, in file order.
//  * `dictMakeUni` receives the occurrence count of each such token.
//  * `allNum` is set to the number of tokens kept, `smallP` to 1/allNum
//    (0 when no token was kept, instead of dividing by zero).
//  * dictKeyUni.txt gets one key per line, dictValUni.txt the matching
//    relative frequency (count / allNum) on the same line number.
//
//If the corpus cannot be opened a message is printed and the function returns
//without touching the dictionary files. Lines are read into a std::string, so
//lines of any length are handled (the former 500-byte buffer combined with
//`while(!fin.eof())` looped forever on a long line or a missing file).
void makeWords(const char *file)
{
	words.clear();
	dictMakeUni.clear();
	allNum = 0;
	smallP = 0;

	ifstream fin(file);
	if(!fin)
	{
		cout << "语料库不存在";
		return;
	}
	ofstream foutKey("dictKeyUni.txt");
	if(!foutKey)
		cout << "dictKeyUni.txt创建失败";
	ofstream foutVal("dictValUni.txt");
	if(!foutVal)
		cout << "dictValUni.txt创建失败";

	string line;
	while(getline(fin, line))
	{
		//Split on single spaces only, skipping empty fields (same rule strtok(buf, " ") applied).
		string::size_type pos = 0;
		while((pos = line.find_first_not_of(' ', pos)) != string::npos)
		{
			string::size_type stop = line.find(' ', pos);
			if(stop == string::npos)
				stop = line.size();
			const string word = line.substr(pos, stop - pos);
			if(stopWords.find(word) == stopWords.end())  //not a stop word
			{
				dictMakeUni[word] += 1;  //operator[] starts a new key at 0
				words.push_back(word);
			}
			pos = stop;
		}
	}

	//Every kept token was pushed to `words`, so its size is the exact total
	//(a float counter silently stops incrementing at 2^24).
	const size_t total = words.size();
	allNum = static_cast<int>(total);
	if(total > 0)
		smallP = 1.0f / static_cast<float>(total);

	map<string, float>::iterator it;
	for(it = dictMakeUni.begin(); it != dictMakeUni.end(); ++it)
	{
		foutKey << it->first << "\n";
		foutVal << it->second / static_cast<float>(total) << "\n";
	}
}

//Builds the bigram dictionary from the global `words` list.
//
//Each pair of adjacent tokens is concatenated (no separator) to form the key.
//`dictMakeBi` receives the occurrence count of every key; dictKeyBi.txt gets
//one key per line and dictValBi.txt the matching relative frequency
//(count / number of pairs) on the same line number.
//
//With fewer than two tokens there are no pairs and both files end up empty.
//The loop bound is written as `i + 1 < size()` because `size() - 1` wraps
//around to a huge unsigned value when `words` is empty.
void makeDictBi()
{
	dictMakeBi.clear();
	ofstream foutKey("dictKeyBi.txt");
	if(!foutKey)
		cout << "dictKeyBi.txt创建失败";
	ofstream foutVal("dictValBi.txt");
	if(!foutVal)
		cout << "dictValBi.txt创建失败";

	size_t total = 0;  //number of bigrams seen; integer so it cannot saturate like a float
	for(size_t i = 0; i + 1 < words.size(); ++i)
	{
		dictMakeBi[words[i] + words[i+1]] += 1;  //operator[] starts a new key at 0
		++total;
	}

	map<string, float>::iterator it;
	for(it = dictMakeBi.begin(); it != dictMakeBi.end(); ++it)
	{
		foutKey << it->first << "\n";
		foutVal << it->second / static_cast<float>(total) << "\n";
	}
}

//Builds the trigram dictionary from the global `words` list.
//
//Each run of three adjacent tokens is concatenated (no separator) to form the
//key. `dictMakeTri` receives the occurrence count of every key;
//dictKeyTri.txt gets one key per line and dictValTri.txt the matching relative
//frequency (count / number of triples) on the same line number.
//
//With fewer than three tokens there are no triples and both files end up
//empty. The loop bound is written as `i + 2 < size()` because `size() - 2`
//wraps around to a huge unsigned value when `words` has fewer than 2 entries.
void makeDictTri()
{
	dictMakeTri.clear();
	ofstream foutKey("dictKeyTri.txt");
	if(!foutKey)
		cout << "dictKeyTri.txt创建失败";
	ofstream foutVal("dictValTri.txt");
	if(!foutVal)
		cout << "dictValTri.txt创建失败";

	size_t total = 0;  //number of trigrams seen; integer so it cannot saturate like a float
	for(size_t i = 0; i + 2 < words.size(); ++i)
	{
		dictMakeTri[words[i] + words[i+1] + words[i+2]] += 1;  //operator[] starts a new key at 0
		++total;
	}

	map<string, float>::iterator it;
	for(it = dictMakeTri.begin(); it != dictMakeTri.end(); ++it)
	{
		foutKey << it->first << "\n";
		foutVal << it->second / static_cast<float>(total) << "\n";
	}
}

//Loads the unigram dictionary written by makeWords() into `dictReadUni`.
//
//dictKeyUni.txt and dictValUni.txt are read in lockstep: line k of the key
//file is paired with line k of the value file, the value being parsed with
//atof(). Reading stops as soon as either file runs out of lines.
//
//Fixes over the previous version:
//  * it cleared `dictReadBi` instead of `dictReadUni`;
//  * `while(!eof())` over 50-byte buffers never terminated when a file was
//    missing or a line was 50 bytes or longer, and inserted a spurious ""
//    entry at end of file.
void readDictUni()
{
	dictReadUni.clear();
	ifstream finKey("dictKeyUni.txt");
	if(!finKey)
		cout << "dictKeyUni.txt不存在";
	ifstream finVal("dictValUni.txt");
	if(!finVal)
		cout << "dictValUni.txt不存在";

	string key, value;
	while(getline(finKey, key) && getline(finVal, value))  //a failed open simply yields no lines
		dictReadUni[key] = atof(value.c_str());
}

//Loads the bigram dictionary written by makeDictBi() into `dictReadBi`.
//
//dictKeyBi.txt and dictValBi.txt are read in lockstep: line k of the key file
//is paired with line k of the value file, the value being parsed with atof().
//Reading stops as soon as either file runs out of lines.
//
//Lines are read into std::string: the previous `while(!eof())` loop over
//50-byte buffers never terminated when a file was missing or a key was
//50 bytes or longer, and inserted a spurious "" entry at end of file.
void readDictBi()
{
	dictReadBi.clear();
	ifstream finKey("dictKeyBi.txt");
	if(!finKey)
		cout << "dictKeyBi.txt不存在";
	ifstream finVal("dictValBi.txt");
	if(!finVal)
		cout << "dictValBi.txt不存在";

	string key, value;
	while(getline(finKey, key) && getline(finVal, value))  //a failed open simply yields no lines
		dictReadBi[key] = atof(value.c_str());
}

//Loads the trigram dictionary written by makeDictTri() into `dictReadTri`.
//
//dictKeyTri.txt and dictValTri.txt are read in lockstep: line k of the key
//file is paired with line k of the value file, the value being parsed with
//atof(). Reading stops as soon as either file runs out of lines.
//
//Lines are read into std::string: trigram keys are three concatenated words
//and can easily reach 50 bytes, which made the previous `while(!eof())` loop
//over 50-byte buffers spin forever (it also did so on a missing file, and
//inserted a spurious "" entry at end of file).
void readDictTri()
{
	dictReadTri.clear();
	ifstream finKey("dictKeyTri.txt");
	if(!finKey)
		cout << "dictKeyTri.txt不存在";
	ifstream finVal("dictValTri.txt");
	if(!finVal)
		cout << "dictValTri.txt不存在";

	string key, value;
	while(getline(finKey, key) && getline(finVal, value))  //a failed open simply yields no lines
		dictReadTri[key] = atof(value.c_str());
}

//Segments `sentence` with the bigram model by exhaustive recursive search.
//The best segmentation (words separated and followed by a space) is left in
//the global `stringBi`, its probability in `bestRate`; every complete
//candidate is also printed to stdout.
//
//  sentence : text to segment; consumed in 2-byte steps, i.e. the code assumes
//             every character occupies two bytes (NOTE(review): this matches a
//             GBK-style encoding, not UTF-8 - confirm against the input).
//  len      : sentence length in bytes.
//  end      : byte offset of the last byte already consumed; pass -1 on the
//             initial call, which also resets wordList/stringBi/bestRate.
//  rate     : probability accumulated for the words chosen so far.
//  index    : number of words already chosen (position of the next word).
//  wordList : words chosen so far; used as the backtracking stack.
//
//A candidate word starting at end+1 is accepted when it is a single character
//or is present in the unigram dictionary. The first word leaves `rate`
//unchanged; later words multiply it by
//  P(prev word)/... i.e. dictReadBi[prev+word] / dictReadUni[prev]  when both are known,
//  dictReadBi[prev+word]                                          when prev is not a unigram,
//  smallP                                                         when the bigram is unknown.
//
//`maxLength` now bounds the length of the candidate word (i - start), as its
//declaration comment describes. It used to bound the absolute offset `i`, so
//any sentence longer than maxLength bytes never reached the end and produced
//an empty result.
void biGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
	if(len <= 2)   //the sentence is a single character
	{
		stringBi = sentence;
		return;
	}
	if(end >= len-2)  //the whole sentence has been consumed
	{
		string str;
		for(size_t i = 0; i < wordList.size(); ++i)
			str += wordList[i] + " ";
		cout << str << "\t" << "概率:" << rate << endl;
		if(rate >= bestRate)   //this segmentation is at least as good as the best so far
		{
			stringBi = str;
			bestRate = rate;
		}
		return;
	}
	if(end == -1)  //initial call: reset the search state
	{
		wordList.clear();
		stringBi = "";
		bestRate = 0;
	}

	const int first = end + 1;  //byte offset where the next word starts
	string word;
	for(int i = first; i < len && i - first < maxLength; i += 2)
	{
		word += sentence.substr(i, 2);
		//Only single characters and known unigrams are candidate words.
		if(i != first && dictReadUni.find(word) == dictReadUni.end())
			continue;

		double nextRate = rate;  //the first word does not change the probability
		if(index != 0)
		{
			const string &preWord = wordList[index-1];
			map<string, float>::iterator biIt = dictReadBi.find(preWord + word);
			if(biIt == dictReadBi.end())
				nextRate = rate * smallP;  //unseen bigram
			else
			{
				//Conditional probability P(w_i | w_i-1) = P(w_i-1 w_i) / P(w_i-1)
				map<string, float>::iterator uniIt = dictReadUni.find(preWord);
				if(uniIt != dictReadUni.end())
					nextRate = rate * (biIt->second / uniIt->second);
				else
					nextRate = rate * biIt->second;  //previous word has no unigram entry
			}
		}

		wordList.push_back(word);
		biGram(sentence, len, i + 1, nextRate, index + 1, wordList);
		wordList.pop_back();
	}
}

//Segments `sentence` with the trigram model by exhaustive recursive search.
//The best segmentation (words separated and followed by a space) is left in
//the global `stringTri`, its probability in `bestRate`; every complete
//candidate is also printed to stdout.
//
//  sentence : text to segment; consumed in 2-byte steps, i.e. the code assumes
//             every character occupies two bytes (NOTE(review): this matches a
//             GBK-style encoding, not UTF-8 - confirm against the input).
//  len      : sentence length in bytes.
//  end      : byte offset of the last byte already consumed; pass -1 on the
//             initial call, which also resets wordList/stringTri/bestRate.
//  rate     : probability accumulated for the words chosen so far.
//  index    : number of words already chosen (position of the next word).
//  wordList : words chosen so far; used as the backtracking stack.
//
//A candidate word starting at end+1 is accepted when it is a single character
//or is present in the unigram dictionary. The first two words each multiply
//`rate` by a fixed 0.9; later words multiply it by
//  dictReadTri[w2 w1 w] / dictReadBi[w2 w1]  when both are known,
//  dictReadTri[w2 w1 w]                      when the preceding pair is not a bigram,
//  smallP                                    when the trigram is unknown.
//
//`maxLength` now bounds the length of the candidate word (i - start), as its
//declaration comment describes. It used to bound the absolute offset `i`, so
//any sentence longer than maxLength bytes never reached the end and produced
//an empty result.
void triGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
	if(len <= 2)  //the sentence is a single character
	{
		stringTri = sentence;
		return;
	}
	if(end >= len-2)  //the whole sentence has been consumed
	{
		string str;
		for(size_t i = 0; i < wordList.size(); ++i)
			str += wordList[i] + " ";
		cout << str << "\t" << "概率:" << rate << endl;
		if(rate >= bestRate)  //this segmentation is at least as good as the best so far
		{
			stringTri = str;
			bestRate = rate;
		}
		return;
	}
	if(end == -1)  //initial call: reset the search state
	{
		wordList.clear();
		stringTri = "";
		bestRate = 0;
	}

	const int first = end + 1;  //byte offset where the next word starts
	string word;
	for(int i = first; i < len && i - first < maxLength; i += 2)
	{
		word += sentence.substr(i, 2);
		//Only single characters and known unigrams are candidate words.
		if(i != first && dictReadUni.find(word) == dictReadUni.end())
			continue;

		double nextRate;
		if(index == 0 || index == 1)  //first or second word: no trigram context yet
			nextRate = rate * 0.9;
		else
		{
			//P(w_i | w_i-2 w_i-1) = P(w_i-2 w_i-1 w_i) / P(w_i-2 w_i-1)
			const string preWords = wordList[index-2] + wordList[index-1];
			map<string, float>::iterator triIt = dictReadTri.find(preWords + word);
			if(triIt == dictReadTri.end())
				nextRate = rate * smallP;  //unseen trigram
			else
			{
				map<string, float>::iterator biIt = dictReadBi.find(preWords);
				if(biIt != dictReadBi.end())
					nextRate = rate * (triIt->second / biIt->second);
				else
					nextRate = rate * triIt->second;  //preceding pair has no bigram entry
			}
		}

		wordList.push_back(word);
		triGram(sentence, len, i + 1, nextRate, index + 1, wordList);
		wordList.pop_back();
	}
}

//Entry point: (re)builds the three dictionaries from the corpus, loads them
//back, then segments every whitespace-delimited token read from stdin with
//both the bigram and the trigram model until input ends.
int main()
{
	//File names are kept in writable arrays: passing a string literal to a
	//`char*` parameter is not allowed since C++11, while an array works for
	//both `char*` and `const char*` parameters.
	char stopWordsFile[] = "stopwords.txt";
	char corpusFile[] = "train.txt";

	//Dictionary construction - per the original author only needed on the first run
	readStopWords(stopWordsFile);   //load the stop words
	makeWords(corpusFile);  //read the corpus, build the unigram dictionary
	makeDictBi();      //build the bigram dictionary
	makeDictTri();     //build the trigram dictionary

	readDictUni();     //load the unigram dictionary
	readDictBi();      //load the bigram dictionary
	readDictTri();     //load the trigram dictionary

	string str;
	cout << "请输入句子:\t";
	while(cin >> str)
	{
		const int len = static_cast<int>(str.size());

		//end = -1 / index = 0 / rate = 1 start a fresh search in each model
		cout << "\n二元模型分词过程:\n";
		biGram(str, len, -1, 1, 0, wordList);  //segment with the bigram model
		cout << "二元模型切分结果:\t" << stringBi << endl << endl;

		cout << "三元模型分词过程:\n";
		triGram(str, len, -1, 1, 0, wordList);  //segment with the trigram model
		cout << "三元模型切分结果:\t" << stringTri << endl << endl;
		cout << endl << endl;
		cout << "请输入句子:\t";
	}
	return 0;
}


 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值