C++基于TF-IDF实现文档匹配小demo

Lose&recall

已于 2022-09-26 20:50:46 修改

阅读量630

点赞数 2

文章标签： c++ 算法 开发语言

于 2022-09-10 17:20:16 首次发布

本文链接：https://blog.csdn.net/qq_46820097/article/details/126797054

版权

好久没怎么写过博客了，今天闲来无事就分享一下我目前学习内容吧

文章目录

前言
一、TF-IDF
二、相关公式
三、主要代码解析
四、所有代码
效果展示及总结

前言

这段时间主要开始学习关于一些人工智能的相关内容
本篇内容就是分享一下前段时间学习的一个算法，再次回顾一下，这个算法是TF-IDF算法，我用此写了一个小demo，接下来就详细说说

一、TF-IDF

TF： Term Frequency，也就是指词频。
对于它的理解可以是：在一篇文章中，越重要的内容，强调的次数也越多，所以频率（TF）会大，我们可以用词频高的词代表这篇文章。
IDF：Inverse Document Frequency，也就是逆文本频率指数
对于它的理解可以是：像语气词或“你我他”这种词，同样也会出现很多次，如果只用TF来统计词频，这些词显然会干扰对文章的判断，而仅靠TF无法消除它们的影响。IDF衡量的是一个词在整个文档集合中的区分力：如果每篇文章里都有“我”这个字，那么它在任意一篇文章当中的区分力都不强，IDF就很小；只有那种只在少数文章中出现的独特的词，IDF才会很大，区分力也才够强。

二、相关公式

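（原图缺失，以下按下文代码实际使用的公式给出）

TF（词频）：$\mathrm{tf}(t,d)=\dfrac{n_{t,d}}{\sum_{k} n_{k,d}}$，即词 $t$ 在文档 $d$ 中出现的次数除以文档 $d$ 的总词数。

IDF（逆文档频率）：$\mathrm{idf}(t)=\log\dfrac{N}{1+\mathrm{df}(t)}$，其中 $N$ 为文档总数，$\mathrm{df}(t)$ 为包含词 $t$ 的文档数。

TF-IDF：$\text{tf-idf}(t,d)=\mathrm{tf}(t,d)\times\mathrm{idf}(t)$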

夹角公式：
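$\cos\theta=\dfrac{\mathbf{A}\cdot\mathbf{B}}{\lVert\mathbf{A}\rVert\,\lVert\mathbf{B}\rVert}=\dfrac{\sum_{i}A_iB_i}{\sqrt{\sum_{i}A_i^{2}}\,\sqrt{\sum_{i}B_i^{2}}}$，其中 $\mathbf{A}$ 为查询语句的 TF-IDF 向量，$\mathbf{B}$ 为某一篇文档的 TF-IDF 向量。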

三、主要代码解析

// Splits an English sentence into words.
//
// The sentence is cut at every single space character. Trailing commas are
// removed from every word, including the last one. Commas inside a word are
// left untouched.
//
// The result always contains at least one element: an empty sentence yields
// one empty word, and consecutive/leading/trailing spaces yield empty words.
// Callers that divide by the word count rely on the result being non-empty.
vector<string> sentence_to_words(string content) {
    vector<string> To_word;
    size_t start = 0;
    while (true) {
        // Locate the end of the current word (next space, or end of sentence).
        const size_t space = content.find(' ', start);
        string word = (space == string::npos)
            ? content.substr(start)
            : content.substr(start, space - start);
        // Strip punctuation only when it really is at the end of the word.
        while (!word.empty() && word.back() == ',') {
            word.pop_back();
        }
        To_word.push_back(word);
        if (space == string::npos) {
            break;  // that was the last word
        }
        start = space + 1;
    }
    return To_word;
}

// Computes the similarity score between a query sentence and every document.
//
// content : query sentence; it is tokenized with sentence_to_words().
// idf     : IDF value of each vocabulary word (parallel to `words`).
// words   : vocabulary collected from all documents.
// tf_idf  : TF-IDF matrix indexed as tf_idf[word][document].
// col     : number of documents (columns of tf_idf).
// Returns whatever cosine_similarity() produces for the query's TF-IDF vector:
// one score per document column.
vector<float> get_score(string content, vector<float> idf, vector<string> words, vector<vector<float>>tf_idf, int col) {
    vector<string> q_words = sentence_to_words(content);    // query words, duplicates kept
    const vector<string> q_words_reduce = reduce(q_words);  // distinct query words

    // Start from a copy of the document vocabulary and its IDF values. The
    // resize keeps both vectors the same length even if the caller passed
    // mismatched sizes (missing IDF entries count as 0).
    vector<string> q_all_words = words;
    vector<float> _idf = idf;
    _idf.resize(words.size(), 0.0f);

    // Query words that never occur in any document are appended to the
    // vocabulary with an IDF of 0, so they take part in the vectors without
    // contributing any weight.
    for (const string& word : q_words_reduce) {
        if (find(words.begin(), words.end(), word) == words.end()) {
            q_all_words.push_back(word);
            _idf.push_back(0.0f);
        }
    }

    // Every vocabulary entry needs a row in the matrix. New rows are zero-filled
    // and sized from `col` instead of a hard-coded document count.
    const size_t doc_columns = col > 0 ? static_cast<size_t>(col) : 0;
    tf_idf.resize(q_all_words.size(), vector<float>(doc_columns));

    // Query TF-IDF vector. The term frequency is the raw occurrence count;
    // dividing it by q_words.size() would only scale the whole vector by a
    // constant, which does not change a cosine.
    vector<float> _tf_idf;
    _tf_idf.reserve(q_all_words.size());
    for (size_t i = 0; i < q_all_words.size(); i++) {
        const float tf = static_cast<float>(count(q_words.begin(), q_words.end(), q_all_words[i]));
        _tf_idf.push_back(tf * _idf[i]);
    }

    return cosine_similarity(_tf_idf, tf_idf, static_cast<int>(_tf_idf.size()), col);
}

// Cosine of the angle between `vec` and every column of `tf_idf`.
//
// vec    : query vector; its first `row` components are used.
// tf_idf : matrix indexed as tf_idf[j][i] with j < row and i < col.
// row    : number of components per vector.
// col    : number of columns to compare against.
// Returns `col` values, one per column. If either vector has zero length the
// angle is undefined; 0 is reported instead of letting 0/0 produce NaN, which
// would break sorting and searching of the scores.
vector<float> cosine_similarity(vector<float> vec, vector<vector<float>>tf_idf,int row,int col) {
    vector<float> similarities;
    const float M1 = Mo(vec);  // length of the query vector
    vector<float> column;      // reused buffer for the current column
    for (int i = 0; i < col; i++) {
        // Extract column i as a vector.
        column.clear();
        for (int j = 0; j < row; j++) {
            column.push_back(tf_idf[j][i]);
        }
        const float M2 = Mo(column);  // length of this column
        if (M1 == 0.0f || M2 == 0.0f) {
            similarities.push_back(0.0f);
            continue;
        }
        // Dot product of the two unit vectors.
        float sum = 0.0f;
        for (int k = 0; k < row; k++) {
            sum = sum + ((vec[k] / M1) * (column[k] / M2));
        }
        similarities.push_back(sum);
    }
    return similarities;
}

四、所有代码

本次代码使用easyx图形库来实现界面化
easyx下载地址

#include<graphics.h>
#include<conio.h>
#include<iostream>
#include<string>
#include<vector>
#include<set>
#include<cmath>
#include<algorithm>
using namespace std;

// Forward declaration: main() calls MainIndex() before its definition further down in this file.
vector<int> MainIndex(string docs[], int len, string content);

// Program entry point: opens an EasyX window, lists the sample documents, then
// repeatedly asks for a query sentence and shows the three documents that
// MainIndex() ranks first. The loop continues while the user presses Enter.
int main() {
    HWND hwnd = initgraph(1000, 700);
    setbkcolor(BLACK);
    cleardevice();
    settextcolor(WHITE);
    settextstyle(30, 0, _T("楷体"));

    string docs[] = {
        "it is a good day, I like to stay here",
        "I am happy to be here",
        "I am bob",
        "it is sunny today",
        "I have a party today",
        "it is a dog and that is a cat",
        "there are dog and cat on the tree",
        "I study hard this morning",
        "today is a good day",
        "tomorrow will be a good day",
        "I like coffee, I like book and I like apple",
        "I do not like it",
        "I am kitty, I like bob",
        "I do not care who like bob, but I like kitty",
        "It is coffee time, bring your cup"
    };
    const int doc_count = static_cast<int>(size(docs));
    // Vertical pixel position of a text row; rows are 31 pixels apart.
    const auto row_y = [](int row) { return 31 * row; };

    // Re-apply the current font with anti-aliasing enabled.
    LOGFONT f;
    gettextstyle(&f);
    f.lfQuality = ANTIALIASED_QUALITY;
    settextstyle(&f);

    // Row 0 is the heading, rows 1..doc_count list the documents.
    outtextxy(0, row_y(0), "所有文档：");
    for (int row = 1; row <= doc_count; row++) {
        outtextxy(0, row_y(row), docs[row - 1].c_str());
    }

    bool again = true;
    while (again) {
        // Wipe everything below the document list before drawing a new query.
        clearrectangle(0, row_y(doc_count + 1), 1000, 700);
        outtextxy(0, row_y(doc_count + 1), "查询语句是：");
        Sleep(200);

        char sentence[500];
        InputBox(sentence, 500, "请输入语句：", "提示", "I get a coffee cup", 0, 100, true);
        settextcolor(RED);
        outtextxy(textwidth("查询语句是："), row_y(doc_count + 1), sentence);

        const string question_sentence = sentence;
        const vector<int> ranking = MainIndex(docs, doc_count, question_sentence);

        settextcolor(WHITE);
        outtextxy(0, row_y(doc_count + 2), "最匹配的三句是：");
        settextcolor(RED);
        for (int rank = 1; rank <= 3; rank++) {
            outtextxy(0, row_y(doc_count + 2 + rank), docs[ranking[rank - 1]].c_str());
        }

        settextcolor(WHITE);
        outtextxy(0, row_y(doc_count + 6), "是否需要换一句？需要按回车键，不需要按其它键");
        // Enter starts another round; any other key leaves the loop.
        again = (static_cast<char>(_getch()) == '\r');
    }
    return 0;
}



// Splits an English sentence into words.
//
// The sentence is cut at every single space character. Trailing commas are
// removed from every word, including the last one. Commas inside a word are
// left untouched.
//
// The result always contains at least one element: an empty sentence yields
// one empty word, and consecutive/leading/trailing spaces yield empty words.
// Callers that divide by the word count rely on the result being non-empty.
vector<string> sentence_to_words(string content) {
    vector<string> To_word;
    size_t start = 0;
    while (true) {
        // Locate the end of the current word (next space, or end of sentence).
        const size_t space = content.find(' ', start);
        string word = (space == string::npos)
            ? content.substr(start)
            : content.substr(start, space - start);
        // Strip punctuation only when it really is at the end of the word.
        while (!word.empty() && word.back() == ',') {
            word.pop_back();
        }
        To_word.push_back(word);
        if (space == string::npos) {
            break;  // that was the last word
        }
        start = space + 1;
    }
    return To_word;
}
// Debug helper: writes every word to standard output followed by a tab, then
// ends the line (and flushes the stream).
void print_words(vector<string> words) {
    for (const string& word : words) {
        cout << word << "\t";
    }
    cout << endl;
}
// Builds the TF-IDF matrix: starts from a copy of `tf` and replaces each cell
// [r][c] with r < row and c < col by tf[r][c] * idf[r]. Cells outside that
// range keep the value copied from `tf`.
vector<vector<float>> multiply(vector<vector<float>> tf, vector<float> idf,int row,int col) {
    vector<vector<float>> weighted(tf);
    for (int r = 0; r < row; ++r) {
        for (int c = 0; c < col; ++c) {
            weighted[r][c] = tf[r][c] * idf[r];
        }
    }
    return weighted;
}
// Removes duplicates: returns the distinct strings of `source` in ascending
// order. `source` itself is not modified.
vector<string> reduce(vector<string>& source) {
    set<string> distinct;
    for (const string& item : source) {
        distinct.insert(item);
    }
    return vector<string>(distinct.begin(), distinct.end());
}
// Length (Euclidean norm) of a vector: the square root of the sum of the
// squared components. An empty vector has length 0.
float Mo(vector<float> vec) {
    float squares = 0.0;
    for (float component : vec) {
        squares += pow(component, 2);
    }
    return sqrt(squares);
}
// Cosine of the angle between `vec` and every column of `tf_idf`.
//
// vec    : query vector; its first `row` components are used.
// tf_idf : matrix indexed as tf_idf[j][i] with j < row and i < col.
// row    : number of components per vector.
// col    : number of columns to compare against.
// Returns `col` values, one per column. If either vector has zero length the
// angle is undefined; 0 is reported instead of letting 0/0 produce NaN, which
// would break sorting and searching of the scores.
vector<float> cosine_similarity(vector<float> vec, vector<vector<float>>tf_idf,int row,int col) {
    vector<float> similarities;
    const float M1 = Mo(vec);  // length of the query vector
    vector<float> column;      // reused buffer for the current column
    for (int i = 0; i < col; i++) {
        // Extract column i as a vector.
        column.clear();
        for (int j = 0; j < row; j++) {
            column.push_back(tf_idf[j][i]);
        }
        const float M2 = Mo(column);  // length of this column
        if (M1 == 0.0f || M2 == 0.0f) {
            similarities.push_back(0.0f);
            continue;
        }
        // Dot product of the two unit vectors.
        float sum = 0.0f;
        for (int k = 0; k < row; k++) {
            sum = sum + ((vec[k] / M1) * (column[k] / M2));
        }
        similarities.push_back(sum);
    }
    return similarities;
}
// Computes the similarity score between a query sentence and every document.
//
// content : query sentence; it is tokenized with sentence_to_words().
// idf     : IDF value of each vocabulary word (parallel to `words`).
// words   : vocabulary collected from all documents.
// tf_idf  : TF-IDF matrix indexed as tf_idf[word][document].
// col     : number of documents (columns of tf_idf).
// Returns whatever cosine_similarity() produces for the query's TF-IDF vector:
// one score per document column.
vector<float> get_score(string content, vector<float> idf, vector<string> words, vector<vector<float>>tf_idf, int col) {
    vector<string> q_words = sentence_to_words(content);    // query words, duplicates kept
    const vector<string> q_words_reduce = reduce(q_words);  // distinct query words

    // Start from a copy of the document vocabulary and its IDF values. The
    // resize keeps both vectors the same length even if the caller passed
    // mismatched sizes (missing IDF entries count as 0).
    vector<string> q_all_words = words;
    vector<float> _idf = idf;
    _idf.resize(words.size(), 0.0f);

    // Query words that never occur in any document are appended to the
    // vocabulary with an IDF of 0, so they take part in the vectors without
    // contributing any weight.
    for (const string& word : q_words_reduce) {
        if (find(words.begin(), words.end(), word) == words.end()) {
            q_all_words.push_back(word);
            _idf.push_back(0.0f);
        }
    }

    // Every vocabulary entry needs a row in the matrix. New rows are zero-filled
    // and sized from `col` instead of a hard-coded document count.
    const size_t doc_columns = col > 0 ? static_cast<size_t>(col) : 0;
    tf_idf.resize(q_all_words.size(), vector<float>(doc_columns));

    // Query TF-IDF vector. The term frequency is the raw occurrence count;
    // dividing it by q_words.size() would only scale the whole vector by a
    // constant, which does not change a cosine.
    vector<float> _tf_idf;
    _tf_idf.reserve(q_all_words.size());
    for (size_t i = 0; i < q_all_words.size(); i++) {
        const float tf = static_cast<float>(count(q_words.begin(), q_words.end(), q_all_words[i]));
        _tf_idf.push_back(tf * _idf[i]);
    }

    return cosine_similarity(_tf_idf, tf_idf, static_cast<int>(_tf_idf.size()), col);
}
// Ranks the documents by TF-IDF cosine similarity to a query sentence.
//
// docs    : array of `len` documents, each a sentence of space-separated words.
// len     : number of documents in `docs`.
// content : query sentence.
// Returns the 0-based document indices ordered from best to worst match. Every
// index appears exactly once; documents with equal scores keep ascending index
// order. The result is empty when `docs` is null or `len` is not positive.
vector<int> MainIndex(string docs[],int len,string content) {
    if (docs == nullptr || len <= 0) {
        return {};
    }

    // Tokenize each document once and collect the vocabulary. Using the same
    // tokenizer for both keeps the vocabulary and the per-document words in
    // agreement. The set makes the vocabulary sorted and free of duplicates.
    vector<vector<string>> doc_tokens;
    doc_tokens.reserve(len);
    set<string> vocabulary;
    for (int i = 0; i < len; i++) {
        doc_tokens.push_back(sentence_to_words(docs[i]));
        vocabulary.insert(doc_tokens.back().begin(), doc_tokens.back().end());
    }
    const vector<string> words(vocabulary.begin(), vocabulary.end());

    // tf[j][i]: occurrences of word j in document i divided by the number of
    //           words in document i (an alternative would be log(1 + tf)).
    // idf[j]  : log(len / (1 + number of documents containing word j))
    //           (an alternative would add 1 to the logarithm).
    vector<vector<float>> tf(words.size(), vector<float>(len));
    vector<float> idf(words.size());
    for (size_t j = 0; j < words.size(); j++) {
        int doc_freq = 0;
        for (int i = 0; i < len; i++) {
            const vector<string>& tokens = doc_tokens[i];
            const auto occurrences = count(tokens.begin(), tokens.end(), words[j]);
            if (occurrences > 0) {
                doc_freq++;
            }
            tf[j][i] = tokens.empty() ? 0.0f : static_cast<float>(occurrences) / tokens.size();
        }
        idf[j] = log(static_cast<float>(len) / (1 + doc_freq));
    }

    const vector<vector<float>> tf_idf = multiply(tf, idf, static_cast<int>(words.size()), len);

    // One similarity score per document.
    const vector<float> score = get_score(content, idf, words, tf_idf, len);

    // Sort document indices by descending score. Sorting indices (rather than
    // sorting the scores and searching for each value afterwards) guarantees
    // that tied scores do not map to the same document more than once. NaN
    // scores are ordered last so the comparison stays a valid ordering.
    vector<int> res(score.size());
    for (size_t i = 0; i < res.size(); i++) {
        res[i] = static_cast<int>(i);
    }
    stable_sort(res.begin(), res.end(), [&score](int a, int b) {
        const bool a_nan = std::isnan(score[a]);
        const bool b_nan = std::isnan(score[b]);
        if (a_nan || b_nan) {
            return !a_nan && b_nan;
        }
        return score[a] > score[b];
    });
    return res;
}