好久没怎么写过博客了,今天闲来无事就分享一下我目前学习内容吧
前言
这段时间主要开始学习关于一些人工智能的相关内容
本篇内容就是分享一下前段时间学习的一个算法,再次回顾一下,这个算法是TF-IDF算法,我用此写了一个小demo,接下来就详细说说
一、TF-IDF
- TF: Term Frequency,也就是指词频。
对于它的理解可以是:在一篇文章中,越重要的内容,强调的次数也越多,所以词频(TF)会大,我们可以用词频高的词来代表这篇文章。
- IDF:Inverse Document Frequency,也就是逆文本频率指数
对于它的理解可以是:像语气词或“你我他”这种词,同样也会出现很多次,光用TF来统计词频,这样显然会影响对文章的判断,我们没办法除去这些词的影响。IDF是所有词在这个系统中的区分力的大小,如果每篇文章里都有“我”这个字,那么它在任意一篇文章当中的区分力都不强,只有那种独特的词,IDF就会很大,区分力也够强。
二、相关公式
夹角公式:$\cos\theta = \dfrac{\vec{A}\cdot\vec{B}}{|\vec{A}|\,|\vec{B}|}$,即把两句话的TF-IDF向量做单位化后点乘,结果越接近1表示两句话越相似。
三、主要代码解析
//Split an English sentence into words on spaces.
//Fixes: uses string::npos instead of comparing find result to -1; strips
//trailing commas properly (the old resize() chopped the LAST character
//whenever a comma appeared ANYWHERE in the token, and never cleaned the
//final word); skips empty tokens produced by repeated spaces.
vector<string> sentence_to_words(string content) {
    vector<string> To_word;
    size_t pos;
    while ((pos = content.find(' ')) != string::npos) {//split on each space
        string temp = content.substr(0, pos);//take the next token
        content.erase(0, pos + 1);//drop token and separator from the sentence
        while (!temp.empty() && temp.back() == ',')
            temp.pop_back();//strip trailing punctuation
        if (!temp.empty())
            To_word.push_back(temp);//ignore empty tokens from double spaces
    }
    while (!content.empty() && content.back() == ',')
        content.pop_back();//also clean the last word
    if (!content.empty())
        To_word.push_back(content);//append the final word
    return To_word;
}
//Score the query sentence against every document (one cosine score per column).
//content: query sentence; idf: per-word idf for the corpus vocabulary;
//words: corpus vocabulary; tf_idf: |words| x col document matrix; col: document count.
//Fixes: replaces the MSVC-internal _Equal with portable operator==, and uses
//the col parameter instead of a hard-coded 15 when extending the matrix.
vector<float> get_score(string content, vector<float> idf, vector<string> words, vector<vector<float>> tf_idf, int col) {
    vector<string> q_words = sentence_to_words(content);//tokenize the query
    vector<string> q_words_reduce = reduce(q_words);//unique query words
    //start from the corpus vocabulary and its idf values
    vector<string> q_all_words(words.begin(), words.end());
    vector<float> _idf(idf.begin(), idf.end());
    //append query words that never appear in the corpus ("new" words)
    for (size_t i = 0; i < q_words_reduce.size(); i++) {
        bool known = false;
        for (size_t j = 0; j < words.size(); j++) {
            if (q_words_reduce[i] == words[j]) {//portable comparison (was _Equal)
                known = true;
                break;
            }
        }
        if (!known) {
            q_all_words.push_back(q_words_reduce[i]);
            _idf.push_back(0);//unseen word: idf 0 so it cannot influence any score
            tf_idf.push_back(vector<float>(col));//one zero entry per document (was hard-coded 15)
        }
    }
    //raw term counts of the query over the extended vocabulary
    vector<float> _tf(_idf.size(), 0.0f);
    for (size_t i = 0; i < q_all_words.size(); i++) {
        int num = 0;
        for (size_t j = 0; j < q_words.size(); j++) {
            if (q_words[j] == q_all_words[i]) {
                num++;
            }
        }
        _tf[i] = (float)num;//raw count; dividing by q_words.size() would scale all scores equally
    }
    //query tf-idf vector
    vector<float> _tf_idf;
    for (size_t k = 0; k < _tf.size(); k++) {
        _tf_idf.push_back(_tf[k] * _idf[k]);
    }
    //cosine similarity against every document column
    return cosine_similarity(_tf_idf, tf_idf, (int)_tf.size(), col);
}
//Cosine similarity between vec and each of the col document columns of tf_idf.
//Fixes: the old version kept push_back-ing unit-vector entries into one vector
//across ALL columns (unbounded growth; correct only by accident of indexing),
//and produced NaN when either vector had zero norm.
vector<float> cosine_similarity(vector<float> vec, vector<vector<float>>tf_idf,int row,int col) {
    vector<float> cos;
    float M1 = 0.0f;//norm of the query vector, computed once
    for (int k = 0; k < row; k++) {
        M1 += vec[k] * vec[k];
    }
    M1 = sqrt(M1);
    for (int i = 0; i < col; i++) {
        float dot = 0.0f, M2 = 0.0f;
        for (int j = 0; j < row; j++) {
            dot += vec[j] * tf_idf[j][i];//dot product with document column i
            M2 += tf_idf[j][i] * tf_idf[j][i];//squared norm of column i
        }
        M2 = sqrt(M2);
        //a zero-norm vector has no direction: define its similarity as 0 (avoids NaN)
        cos.push_back((M1 > 0.0f && M2 > 0.0f) ? dot / (M1 * M2) : 0.0f);
    }
    return cos;
}
四、所有代码
本次代码使用easyx图形库来实现界面化
easyx下载地址
#include<graphics.h>
#include<conio.h>
#include<iostream>
#include<string>
#include<vector>
#include<set>
#include<cmath>
#include<algorithm>
using namespace std;
//Forward declaration: ranks the documents by TF-IDF similarity (defined below).
vector<int> MainIndex(string docs[], int len, string content);
//Entry point: draws the document list in an EasyX window, then loops asking
//the user for a query sentence and highlights the three best-matching docs.
int main() {
//open a 1000x700 EasyX graphics window
HWND hwnd = initgraph(1000, 700);
setbkcolor(BLACK);
cleardevice();
settextcolor(WHITE);
//30px KaiTi font for all text output
settextstyle(30, 0, _T("楷体"));
//the fixed 15-sentence document corpus to search
string docs[] = {
"it is a good day, I like to stay here",
"I am happy to be here",
"I am bob",
"it is sunny today",
"I have a party today",
"it is a dog and that is a cat",
"there are dog and cat on the tree",
"I study hard this morning",
"today is a good day",
"tomorrow will be a good day",
"I like coffee, I like book and I like apple",
"I do not like it",
"I am kitty, I like bob",
"I do not care who like bob, but I like kitty",
"It is coffee time, bring your cup"
};
//enable anti-aliased text rendering
LOGFONT f;
gettextstyle(&f);
f.lfQuality = ANTIALIASED_QUALITY;
settextstyle(&f);
//list every document, one per 31px row
outtextxy(0, 0, "所有文档:");
for (int i = 0; i < size(docs); i++) {
outtextxy(0, 31*(i+1), docs[i].c_str());
}
//query loop: ask, rank, display, repeat until a non-Enter key is pressed
while (true) {
//erase the output area from the previous round
clearrectangle(0, 31 * (size(docs) + 1), 1000, 700);
outtextxy(0, 31 * (size(docs) + 1), "查询语句是:");
Sleep(200);
//read the query sentence via an EasyX modal input box
char sentence[500];
InputBox(sentence, 500, "请输入语句:", "提示", "I get a coffee cup", 0, 100, true);
settextcolor(RED);
outtextxy(textwidth("查询语句是:"), 31 * (size(docs) + 1), sentence);
string question_sentence = sentence;
//a holds document indices sorted best-match first
vector<int> a = MainIndex(docs, size(docs), question_sentence);
settextcolor(WHITE);
outtextxy(0, 31 * (size(docs) + 2), "最匹配的三句是:");
settextcolor(RED);
//show the top three matches
for (int i = 0; i < 3; i++) {
outtextxy(0, 31 * (size(docs) + 2 + i + 1), docs[a[i]].c_str());
}
settextcolor(WHITE);
outtextxy(0, 31 * (size(docs) + 6), "是否需要换一句?需要按回车键,不需要按其它键");
//Enter ('\r') repeats with a new query; anything else exits
char m = _getch();
if (m != '\r')break;
}
return 0;
}
//Split an English sentence into words on spaces.
//Fixes: uses string::npos instead of comparing find result to -1; strips
//trailing commas properly (the old resize() chopped the LAST character
//whenever a comma appeared ANYWHERE in the token, and never cleaned the
//final word); skips empty tokens produced by repeated spaces.
vector<string> sentence_to_words(string content) {
    vector<string> To_word;
    size_t pos;
    while ((pos = content.find(' ')) != string::npos) {//split on each space
        string temp = content.substr(0, pos);//take the next token
        content.erase(0, pos + 1);//drop token and separator from the sentence
        while (!temp.empty() && temp.back() == ',')
            temp.pop_back();//strip trailing punctuation
        if (!temp.empty())
            To_word.push_back(temp);//ignore empty tokens from double spaces
    }
    while (!content.empty() && content.back() == ',')
        content.pop_back();//also clean the last word
    if (!content.empty())
        To_word.push_back(content);//append the final word
    return To_word;
}
//Print every word separated by a tab, ending with a flushed newline.
void print_words(vector<string> words) {
    for (const string& word : words) {
        cout << word << "\t";
    }
    cout << endl;
}
//Build the tf-idf matrix: every entry of row j in tf is scaled by idf[j].
//tf is row x col (word x document); returns the scaled copy.
vector<vector<float>> multiply(vector<vector<float>> tf, vector<float> idf,int row,int col) {
    vector<vector<float>> result = tf;
    for (int r = 0; r < row; r++) {
        const float weight = idf[r];//row weight is constant across documents
        for (int c = 0; c < col; c++) {
            result[r][c] = tf[r][c] * weight;
        }
    }
    return result;
}
//Return the unique words of source in sorted order (set-based dedup);
//source itself is left untouched.
vector<string> reduce(vector<string>& source) {
    set<string> unique_words(source.begin(), source.end());
    return vector<string>(unique_words.begin(), unique_words.end());
}
//Euclidean (L2) norm of vec; returns 0 for an empty vector.
float Mo(vector<float> vec) {
    float sum = 0.0f;
    for (size_t i = 0; i < vec.size(); i++) {
        sum += vec[i] * vec[i];//x*x instead of pow(x, 2): no transcendental call
    }
    return sqrt(sum);
}
//Cosine similarity between vec and each of the col document columns of tf_idf.
//Fixes: the old version kept push_back-ing unit-vector entries into one vector
//across ALL columns (unbounded growth; correct only by accident of indexing),
//and produced NaN when either vector had zero norm.
vector<float> cosine_similarity(vector<float> vec, vector<vector<float>>tf_idf,int row,int col) {
    vector<float> cos;
    float M1 = 0.0f;//norm of the query vector, computed once
    for (int k = 0; k < row; k++) {
        M1 += vec[k] * vec[k];
    }
    M1 = sqrt(M1);
    for (int i = 0; i < col; i++) {
        float dot = 0.0f, M2 = 0.0f;
        for (int j = 0; j < row; j++) {
            dot += vec[j] * tf_idf[j][i];//dot product with document column i
            M2 += tf_idf[j][i] * tf_idf[j][i];//squared norm of column i
        }
        M2 = sqrt(M2);
        //a zero-norm vector has no direction: define its similarity as 0 (avoids NaN)
        cos.push_back((M1 > 0.0f && M2 > 0.0f) ? dot / (M1 * M2) : 0.0f);
    }
    return cos;
}
//Score the query sentence against every document (one cosine score per column).
//content: query sentence; idf: per-word idf for the corpus vocabulary;
//words: corpus vocabulary; tf_idf: |words| x col document matrix; col: document count.
//Fixes: replaces the MSVC-internal _Equal with portable operator==, and uses
//the col parameter instead of a hard-coded 15 when extending the matrix.
vector<float> get_score(string content, vector<float> idf, vector<string> words, vector<vector<float>> tf_idf, int col) {
    vector<string> q_words = sentence_to_words(content);//tokenize the query
    vector<string> q_words_reduce = reduce(q_words);//unique query words
    //start from the corpus vocabulary and its idf values
    vector<string> q_all_words(words.begin(), words.end());
    vector<float> _idf(idf.begin(), idf.end());
    //append query words that never appear in the corpus ("new" words)
    for (size_t i = 0; i < q_words_reduce.size(); i++) {
        bool known = false;
        for (size_t j = 0; j < words.size(); j++) {
            if (q_words_reduce[i] == words[j]) {//portable comparison (was _Equal)
                known = true;
                break;
            }
        }
        if (!known) {
            q_all_words.push_back(q_words_reduce[i]);
            _idf.push_back(0);//unseen word: idf 0 so it cannot influence any score
            tf_idf.push_back(vector<float>(col));//one zero entry per document (was hard-coded 15)
        }
    }
    //raw term counts of the query over the extended vocabulary
    vector<float> _tf(_idf.size(), 0.0f);
    for (size_t i = 0; i < q_all_words.size(); i++) {
        int num = 0;
        for (size_t j = 0; j < q_words.size(); j++) {
            if (q_words[j] == q_all_words[i]) {
                num++;
            }
        }
        _tf[i] = (float)num;//raw count; dividing by q_words.size() would scale all scores equally
    }
    //query tf-idf vector
    vector<float> _tf_idf;
    for (size_t k = 0; k < _tf.size(); k++) {
        _tf_idf.push_back(_tf[k] * _idf[k]);
    }
    //cosine similarity against every document column
    return cosine_similarity(_tf_idf, tf_idf, (int)_tf.size(), col);
}
//Rank all documents by TF-IDF cosine similarity to `content`.
//docs: document array; len: number of documents; content: query sentence.
//Returns document indices ordered from most to least similar.
//Fixes: tokenizes each document ONCE instead of once per (word, document)
//pair inside the tf/idf loops; replaces MSVC-only _Equal with operator==;
//guards against division by zero on an empty document.
vector<int> MainIndex(string docs[],int len,string content) {
    //tokenize every document once and collect all tokens
    vector<vector<string>> doc_words(len);
    vector<string> words;
    for (int i = 0; i < len; i++) {
        doc_words[i] = sentence_to_words(docs[i]);
        words.insert(words.end(), doc_words[i].begin(), doc_words[i].end());
    }
    //deduplicate into the corpus vocabulary (std::set also sorts it)
    set<string>s(words.begin(), words.end());
    words.assign(s.begin(), s.end());
    //tf[j][i]: relative frequency of word j within document i
    vector<vector<float>>tf(words.size(), vector<float>(len));
    for (int i = 0; i < len; i++) {
        for (size_t j = 0; j < words.size(); j++) {
            int num = 0;
            for (size_t k = 0; k < doc_words[i].size(); k++) {
                if (words[j] == doc_words[i][k]) {//portable comparison (was _Equal)
                    num++;
                }
            }
            //empty document: no terms, tf stays 0 (avoids divide-by-zero)
            tf[j][i] = doc_words[i].empty() ? 0.0f : (float)num / doc_words[i].size();
        }
    }
    //idf[j] = log(len / (1 + number of documents containing word j))
    vector<float> idf(words.size());
    for (size_t j = 0; j < words.size(); j++) {
        int num = 0;
        for (int i = 0; i < len; i++) {
            for (size_t k = 0; k < doc_words[i].size(); k++) {
                if (words[j] == doc_words[i][k]) {
                    num++;
                    break;//count each document at most once
                }
            }
        }
        idf[j] = log(static_cast<float>(len) / (1 + num));
    }
    //per-document tf-idf matrix
    vector<vector<float>> tf_idf = multiply(tf, idf, (int)words.size(), len);
    //score the query and convert sorted scores back to document indices
    vector<float> score = get_score(content, idf, words, tf_idf, len);
    vector<float> scores = score;//unsorted copy for index lookup
    sort(score.begin(), score.end(), greater<float>());
    vector<int> res;
    for (size_t i = 0; i < score.size(); i++) {
        //NOTE(review): equal scores all map to the FIRST matching index, so
        //ties can repeat a document — kept as-is to preserve original behavior.
        res.push_back((int)(find(scores.begin(), scores.end(), score[i]) - scores.begin()));
    }
    return res;
}
效果展示及总结
欢迎大家留言讨论,有什么不对的地方欢迎大家指正。
效果展示