基于c++的n元文法的一种实现方式，从一个txt中形成模型，计算另一个txt的困惑度

最新推荐文章于 2023-09-26 11:15:07 发布

刀斩

最新推荐文章于 2023-09-26 11:15:07 发布

阅读量289

点赞数

文章标签： c++ 自然语言处理

本文链接：https://blog.csdn.net/qq_39380236/article/details/124759946

版权

训练语料和测试语料为pku和msr的txt文本。在每个txt中，一行为一句话，句中相邻的单词之间由两个空格隔开；文本需为ANSI编码，每个汉字或全角符号占两个char。

模型为一个map：map的键是前n-1个单词连在一起形成的一个string，map的值是另一个map2；map2的键为第n个单词，map2的值为该单词在这n-1个单词之后出现的次数。例如三句话（分词后）：我 喜欢 夏天、我 喜欢 冬天、我 不喜欢 春天，按二元文法（n=2，此处略去句首的bb和句尾的ee）得到的模型为：

我→喜欢→2

我→不喜欢→1

喜欢→夏天→1

喜欢→冬天→1

不喜欢→春天→1

// 读文件.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//

#include <cmath>
#include <cstdio>
#include<fstream>
#include <iostream>
#include<map>
#include<string>
#include <vector>
using namespace std;
// Follower statistics stored in the model table for one key
// (the key being the first n-1 words of an n-gram).
struct value
{
    // Maps each word observed as the n-th word after the key to the number
    // of times it was observed there.
    std::map<std::string, int> times;
    // Total number of n-grams counted for the key; train() bumps it once for
    // every occurrence of the key.
    int AllWordsTimesAfterKey{0};
};
// Writes the begin-of-sentence prefix into ch: (n - 1) copies of "  bb"
// followed by two spaces, i.e. (n - 1) * 4 + 2 characters in total.
// No terminating '\0' is written; addtxt() continues right after the prefix.
// ch must have room for the prefix.
void addbos(char ch[], int n)
{
    int cursor = 0;
    for (int marker = 0; marker < n - 1; ++marker)
    {
        ch[cursor++] = ' ';
        ch[cursor++] = ' ';
        ch[cursor++] = 'b';
        ch[cursor++] = 'b';
    }
    // Separator between the last marker and the first real word.
    ch[cursor] = ' ';
    ch[cursor + 1] = ' ';
}
// Copies the NUL-terminated sentence chtxt into ch directly behind the
// begin-of-sentence prefix, i.e. starting at offset (n - 1) * 4 + 2, and
// terminates the result with '\0'. ch must be large enough for prefix + text.
void addtxt(char ch[], char chtxt[], int n)
{
    char* out = ch + ((n - 1) * 4 + 2);
    for (const char* in = chtxt; *in != '\0'; ++in)
    {
        *out = *in;
        ++out;
    }
    *out = '\0';
}
// Appends the end-of-sentence marker "ee" to the NUL-terminated sentence in ch.
//
// A single trailing '\n' (left by fgets) is dropped first. If the remaining
// text already ends with a space (pku lines end with two spaces) only "ee" is
// appended; otherwise (msr lines have no trailing space) "  ee" is appended.
//
// Unlike the previous version, the marker is also added when the line has no
// trailing newline (typically the last line of a file), and an empty string
// no longer reads in front of the buffer.
//
// ch must have room for up to four additional characters.
void addeos(char ch[])
{
    int length = 0;
    while (ch[length] != '\0')
        length++;

    if (length > 0 && ch[length - 1] == '\n')
        length--; // overwrite the newline instead of keeping it inside the sentence

    if (length == 0 || ch[length - 1] != ' ')
    {
        // No separator yet: insert the usual two-space word gap.
        ch[length++] = ' ';
        ch[length++] = ' ';
    }
    ch[length++] = 'e';
    ch[length++] = 'e';
    ch[length] = '\0';
}
// Builds the model-table key for one n-gram: the characters of ch from the
// start of the first word (wordbegin[0]) to the end of the (n-1)-th word
// (wordend[n - 2]), inclusive, so the separators between the words are part
// of the key.
//
// For n < 2 there are no context words; the empty string is returned instead
// of reading wordend[-1]. An empty string is also returned when the end
// position lies before the start position.
std::string makekey(char ch[], int n, int wordbegin[], int wordend[])
{
    if (n < 2)
        return std::string();

    const int first = wordbegin[0];
    const int last = wordend[n - 2];
    if (last < first)
        return std::string();

    // One range construction instead of appending character by character.
    return std::string(ch + first, ch + last + 1);
}
// Extracts the n-th word of the current n-gram from ch: the characters from
// wordbegin[n - 1] to wordend[n - 1], inclusive. Returns an empty string when
// the end position lies before the start position.
std::string makevalue(char ch[], int n, int wordbegin[], int wordend[])
{
    const int slot = n - 1;
    std::string word;
    int pos = wordbegin[slot];
    while (pos <= wordend[slot])
    {
        word.push_back(ch[pos]);
        ++pos;
    }
    return word;
}
void train(FILE*fp,map<string,value> & table,int n)
{
    char ch[4096],chtrain[4096];//chtxt为读取的原文，ch为添加了bos和eos后的格式
    addbos(ch,n);//n元文法则在句首添加n-1个bos
    while (fgets(chtrain, 4096, fp)!=NULL)
    {
        int i = 0;
        addtxt(ch, chtrain, n);
        addeos(ch);//在句尾添加1个eos
        string nword[10];//假设最多有10元文法
        int wordbegin[10], wordend[10];//最多10个单词的首尾位置
        for (i = 0; i < 10; i++)
        {
            wordbegin[i] = 0;
            wordend[i] = 0;
        }
        i = 0;
        int wordnum=0; //记录在一句话中，读到了几个词，每读到n个词，则计算一次    
        while (ch[i] != '\0')
        {
            if (ch[i] == ' ' && ch[i + 1] != ' ')//一个词的开头
                wordbegin[wordnum] = i + 1;
            if ((ch[i] != ' ' && ch[i + 1] == ' ') || (ch[i - 1] == 'e')&&ch[i] == 'e')//一个词的末尾，eos的后面没有空格
            {
                wordend[wordnum] = i;
                wordnum++;
            }
            i++;
            if (wordnum == n)//一次读入了n个词
            {
                //table：前n-1个词为键，struct value为值
                //value：第n个词为键，读取次数times为值
                string key = makekey(ch, n, wordbegin, wordend);//map的key，n元文法则键为前n-1个字连起来
                string valuestring=makevalue(ch,n,wordbegin,wordend);//map的value的string，n元文法中的第n个词，即前面文法所得的结果
                if (table.count(key) == 1)//前n - 1个字的组合出现过
                {
                    table[key].AllWordsTimesAfterKey++;
                    if (table[key].times.count(valuestring) == 1)//最后一个词也出现过
                    {
                        int time = table[key].times[valuestring] + 1;//出现次数加一
                        table[key].times.erase(valuestring);//前n-1个词为key所对应的，第n词为valuestring项删除
                        table[key].times.insert(pair<string, int>(valuestring, time));//前n-1个词为key所对应的，第n词为valuestring项，次数为time                      
                        i = wordend[0] + 1;//从第二个词开始重新计算
                    }
                    if (table[key].times.count(valuestring) == 0)//最后一个词没出现过
                    {
                        table[key].times.insert(pair<string, int>(valuestring, 1));
                        i = wordend[0] + 1;//从第二个词开始重新计算
                    }
                }
                if (table.count(key) == 0)//前n-1个字的组合没出现过
                {
                    value word;                      
                    word.times.insert(pair<string,int>(valuestring,1));//最后一个词出现1次
                    table.insert(pair<string, value>(key, word));//前n-1个词的组合，后边接最后一个词
                    table[key].AllWordsTimesAfterKey = 1;
                    i = wordend[0] + 1;//从第二个词开始重新计算
                    if (ch[i] == '\0')
                        break;
                }
                wordnum = 0;               
            }
        }           
    }    
    for (int cc = 0; cc < 2048; cc++)
    {
        ch[cc] = '\0';
    }
    /*for (auto it = table.begin(); it != table.end(); it++)
    {
        cout << "开头词为:   " << it->first << "\n";
        for (auto it2 = it->second.times.begin(); it2 != it->second.times.end(); it2++)
        {
            cout << "     后接词为:   " << it2->first << "   时  " << "出现次数为：" << it2->second << "\n";
        }     
    }*/
}
void test(map<string, value>table, FILE* fp, int n, char txtname[])
{
    char ch[4096], chtest[4096];
    addbos(ch,n);
    ofstream File;
    string test_txt = "";
    int i = 0;
    for (i = 0; txtname[i] != '.'; i++)
    {
    }
    txtname[i] = '\0';
    for (i = 0; txtname[i] != '\0'; i++)
    {
        test_txt = test_txt + txtname[i];
    }
    char nn[2];
    itoa(n,nn,10);
    test_txt = test_txt + "的" + nn + "元困惑度计算文件.txt";//输出结果的文件名
    File.open(test_txt);
    while (fgets(chtest, 4096, fp) != NULL)
    {
        addtxt(ch, chtest, n);
        addeos(ch);
        string nword[10];//假设最多有10元文法
        int wordbegin[10], wordend[10];//最多10个单词的首尾位置
        int N = 0;//一句话中的单词数量
        for (i = 0; i < 10; i++)
        {
            wordbegin[i] = 0;
            wordend[i] = 0;
        }
        i = 0;
        int wordnum = 0;//每从一句话中读入n个词，则计算一次
        double ppl = 0;//一句话的困惑度
        while (ch[i] != '\0')
        {
            if (ch[i] == ' ' && ch[i + 1] != ' ')//一个词的开头
            {
                wordbegin[wordnum] = i + 1;
                N++;
            }
                
            if ((ch[i] != ' ' && ch[i + 1] == ' ') || (ch[i - 1] == 'e') && ch[i] == 'e')//一个词的末尾
            {
                wordend[wordnum] = i;
                wordnum++;
            }
            i++;
            if (wordnum == n)//一次读入了n个词
            {
                //table：前n-1个词为键，struct value为值
                //value：第n个词为键，读取次数times为值
                string key = makekey(ch, n, wordbegin, wordend);//map的key，n元文法则键为前n-1个字连起来
                string valuestring = makevalue(ch, n, wordbegin, wordend);//map的value的string，n元文法中的第n个词，即前面文法所得的结果
                int onewordtimes = 0;
                if (table.count(key) == 1)
                {
                    if (table[key].times.count(valuestring) == 1)
                    {                      
                        double valuestring_times = table[key].times[valuestring];
                        double key_times= table[key].AllWordsTimesAfterKey;
                        double gailv = valuestring_times / key_times;
                        double gailv_log = log(gailv) / log(2);
                        ppl = ppl + gailv_log;//计算一句话中，每两个词的困惑度
                    }
                }
                wordnum = 0;
                i = wordend[0] + 1;
            }
        }
        ppl = ppl / N;
        ppl = ppl * (-1);
        ppl = pow(2, ppl);//一句话中每个词的困惑度加起来
        i = (n - 1) * 4 + 2;//不输出“bb”
        while (ch[i] != 'e')
        {
            File << ch[i];
            i++;
        }//结果文件
        File << " 句子困惑度为: " << ppl << endl;
    }
    File.close();
}
// Interactive driver: repeatedly asks for a training corpus and an n-gram
// order and feeds the corpus to train(); once the user declines to add more
// training files, asks for a test corpus and runs test() on it with the order
// entered last.
//
// Parameters:
//   table - model that train() fills and test() reads.
//
// Changes relative to the previous version:
//   * File names are read into a std::string instead of a char[40], so long
//     paths cannot overflow the buffer.
//   * The order must be within 1..10 (10 is the limit stated by the original
//     code); failed reads from cin end the dialogue instead of looping.
//   * Files that cannot be opened are reported instead of passing a null
//     FILE* on; the test file is closed after use.
//   * The final message names the file test() actually writes.
//
// NOTE(review): the order is asked again for every training file; entering
// different orders mixes n-grams of different lengths in one table.
void ui(std::map<std::string, value>& table)
{
    int n = 2;
    int traintxt_number = 1;
    std::string txtname;
    FILE* fp = NULL;
    while (traintxt_number) // loop while the user wants to add another training corpus
    {
        std::cout << "请输入训练文档的文件名，如“D:\\training_msr.txt”,注意txt文档应为ANSI编码方式";
        if (!(std::cin >> txtname))
            return;
        std::cout << "输入文法元数";
        if (!(std::cin >> n))
            return;
        if (n < 1 || n > 10)
        {
            std::cout << "\n" << "文法元数应在1到10之间，请重新输入" << "\n";
            continue;
        }
        std::cout << "句子的开头添加了n-1个“bb”，句子的结尾添加了1个“ee”" << "\n";
        if (fopen_s(&fp, txtname.c_str(), "r") != 0 || fp == NULL)
        {
            std::cout << "\n" << "无法打开训练文档：" << txtname << "\n";
            continue;
        }
        train(fp, table, n); // read the corpus and update the model
        fclose(fp);
        fp = NULL;
        std::cout << "\n" << "训练语料为：" << txtname << " 的n元文法模型已完成";
        std::cout << "\n" << "是否添加新的训练语料文件，是请输入1，不是请输入0" << "\n";
        if (!(std::cin >> traintxt_number))
            return;
    }
    std::cout << "请输入测试文档的文件名，如“D:\\test_msr.txt”,注意txt文档应为ANSI编码方式";
    if (!(std::cin >> txtname))
        return;
    if (fopen_s(&fp, txtname.c_str(), "r") != 0 || fp == NULL)
    {
        std::cout << "\n" << "无法打开测试文档：" << txtname << "\n";
        return;
    }
    // test() truncates the name at its extension in place; c_str() below
    // therefore yields the base name the result file is derived from.
    test(table, fp, n, &txtname[0]);
    fclose(fp);
    std::cout << "计算结果输出在“" << txtname.c_str() << "的" << n << "元困惑度计算文件.txt”";
}
// Program entry point: creates the empty model and hands it to the
// interactive driver ui().
int main()
{
    // Key: the first n-1 words of an n-gram; mapped value: follower counts.
    std::map<std::string, value> model;
    ui(model);
    return 0;
}

刀斩

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
基于c++的n元文法的一种实现方式，从一个txt中形成模型，计算另一个txt的困惑度

训练语料和测试语料为pku和msr的txt文本，在每个txt中，每句话自身的单词之间由两个空格隔开，文本需为ANSI编码，每个字或符号都由两个char型组成。模型为一个map，map的键为n-1个单词连在一起，形成的一个string，map的值为另一个map2，map2的键为第n个单词，map2的值为此单词出现次数，例如三句话：我喜欢夏天、我喜欢冬天、我不喜欢春天。得到的模型为：我→喜欢→2 →不喜欢→1喜欢→夏天→1 →冬天→1不喜欢→春天...
复制链接

扫一扫