首先,需要用到 MovieLens 数据集(这是网址),我选用的是 ml-latest-small 数据。由于我对 vector 的使用还不是很熟练,程序总是莫名其妙地出现下标越界提示,最终没有调试成功(用完整数据计算一次大概需要近 3 个小时,orz,求大佬帮忙优化)。
下面是代码:
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
// One record of movies.csv after parsing.
struct movie
{
    int id = 0;                     // movie id (0 until a record has been parsed into this object)
    std::string name;               // movie title
    std::vector<std::string> tags;  // tags of the movie (main fills this from the '|'-separated field)
    std::vector<int> sames;         // ids of the movies found to be similar to this one, kept for later use
};
// Per-user data derived from ratings.csv plus the evaluation results for that user.
struct users
{
    int id = 0;                      // user id (0 until a rating row has been attributed to this object)
    std::vector<int> movies;         // ids of the movies the user has watched
    std::vector<int> predictmovies;  // ids recommended to the user via movie-to-movie similarity
    double prediction = 0;           // precision of the recommendations for this user
    double recall = 0;               // recall of the recommendations for this user
};
// Splits `s` at every occurrence of `token` and returns the pieces in order.
// Empty pieces between two adjacent delimiters are kept. No empty piece is
// produced after a trailing delimiter, and an empty input yields an empty vector.
std::vector<std::string> split(std::string s, char token)
{
    std::vector<std::string> pieces;
    std::string::size_type begin = 0;
    while (begin < s.size()) {
        const std::string::size_type end = s.find(token, begin);
        if (end == std::string::npos) {
            // No further delimiter: the remainder is the last piece.
            pieces.push_back(s.substr(begin));
            break;
        }
        pieces.push_back(s.substr(begin, end - begin));
        begin = end + 1;  // continue right after the delimiter
    }
    return pieces;
}
// Cosine similarity of two tag lists treated as binary vectors:
//     (number of common tags) / sqrt(size(v1) * size(v2))
// Both lists are taken by value because they are sorted in place.
// Returns 0 when either list is empty; the plain formula would evaluate
// 0/0 and yield NaN in that case.
double yuxuan(std::vector<std::string> v1, std::vector<std::string> v2)
{
    if (v1.empty() || v2.empty()) {
        return 0.0;
    }
    std::sort(v1.begin(), v1.end());
    std::sort(v2.begin(), v2.end());

    // Count the intersection with a two-pointer merge over the sorted lists.
    // This yields the same count std::set_intersection would produce, without
    // copying the matching strings into a temporary vector.
    std::size_t common = 0;
    std::vector<std::string>::const_iterator a = v1.begin();
    std::vector<std::string>::const_iterator b = v2.begin();
    while (a != v1.end() && b != v2.end()) {
        if (*a < *b) {
            ++a;
        } else if (*b < *a) {
            ++b;
        } else {
            ++common;
            ++a;
            ++b;
        }
    }

    const double size1 = static_cast<double>(v1.size());
    const double size2 = static_cast<double>(v2.size());
    return static_cast<double>(common) / std::sqrt(size1 * size2);
}
// Precision of a recommendation list.
//   v1: recommended movie ids
//   v2: movie ids the user actually watched
// Returns (distinct ids present in both) / (distinct ids in v1).
// Both lists are de-duplicated first so repeated ids cannot distort the ratio,
// and 0 is returned when v1 is empty instead of dividing by zero (NaN).
double predict_ratio(std::vector<int> v1, std::vector<int> v2)
{
    std::sort(v1.begin(), v1.end());
    v1.erase(std::unique(v1.begin(), v1.end()), v1.end());
    std::sort(v2.begin(), v2.end());
    v2.erase(std::unique(v2.begin(), v2.end()), v2.end());

    if (v1.empty()) {
        return 0.0;
    }

    std::vector<int> hits;
    std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(hits));
    return static_cast<double>(hits.size()) / static_cast<double>(v1.size());
}
// Recall of a recommendation list.
//   v1: recommended movie ids
//   v2: movie ids the user actually watched
// Returns (distinct ids present in both) / (distinct ids in v2).
// Both lists are de-duplicated first so repeated ids cannot distort the ratio,
// and 0 is returned when v2 is empty instead of dividing by zero (NaN).
double recall_ratio(std::vector<int> v1, std::vector<int> v2)
{
    std::sort(v1.begin(), v1.end());
    v1.erase(std::unique(v1.begin(), v1.end()), v1.end());
    std::sort(v2.begin(), v2.end());
    v2.erase(std::unique(v2.begin(), v2.end()), v2.end());

    if (v2.empty()) {
        return 0.0;
    }

    std::vector<int> hits;
    std::set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(hits));
    return static_cast<double>(hits.size()) / static_cast<double>(v2.size());
}
int main()
{
string line;
unsigned hang = 0;
ifstream movies("movies.csv");//读取文件前,我已经将文件的表头删掉了,下面用到的ratings.csv同样如此。
if (!movies.is_open())
{
cout << "文件打开失败" << endl;
exit(1);
}
vector<string> mymovies;//先通过mymovies向量将csv的数据每一行取出来
while (movies.good())
{
getline(movies, line);
hang++;
mymovies.push_back(line);
}
cout << hang << endl;//打印行数,无特殊目的
movies.close();
//for (unsigned i = 0; i < mymovies.size(); i++)
//{
// cout << mymovies[i] << endl;
//}
int len;
len = mymovies.size() + 1000;
vector<movie> onemovie(len);
for (unsigned i = 0; i < hang - 1; i++)//将文件信息转存到数组中
{
vector<string> p(3);
p = split(mymovies[i], ',');
onemovie[i].id = atoi(p[0].c_str());
onemovie[i].name = p[1];
onemovie[i].tags = split(p[2], '|');
}
//for (unsigned i = 0; i < len; i++)//此处用于打印存储效果,用于调试
//{
// if(onemovie[i].id != NULL)
// {
// int j = 0;
// cout << onemovie[i].id << "\t" << onemovie[i].name << "\t";
// for ( j = 0; j < onemovie[i].tags.size(); j++)
// {
// cout << onemovie[i].tags[j]<<" ";
// }
// cout << endl;
// }
// else
// {
// break;
// }
//}
for (int j = 0; j < onemovie.size() - 5000; j++)//计算电影与电影的相似度,由于数据比较多,这里耗时很长,-5000的目的是防止下标越界,同时减少此次模拟推荐引擎的计算时间,大佬可能有别的办法
{
for (int i = 0; i < onemovie.size() - 5000; i++)
{
if (i == j)
{
}
else
{
double same = yuxuan(onemovie[j].tags, onemovie[i].tags);
if (same > 0.8)
{
/*cout << "movie id=" << j << "的电影与movie id=" << i << "的电影相似度为" << same << endl;*/
onemovie[j].sames.push_back(onemovie[i].id);
}
}
}
/*cout << "movie id=" << j << "的相似movie id 为";
for (int k = 0; k < onemovie[j].sames.size(); k++)
{
cout<< onemovie[j].sames[k] << " ";
}
cout << endl;*/
}
ifstream user("ratings.csv");
if (!user.is_open())
{
cout << "文件打开失败" << endl;
exit(1);
}
vector<string> myuser;
while (user.good())
{
getline(user, line);
myuser.push_back(line);
}
int lens;
lens = myuser.size() + 10;
vector<users> oneuser(lens);
vector<string> p1;
for (int i = 0; i < myuser.size() - 20; i++)
{
vector<string> p(3);
p = split(myuser[i], ',');
oneuser[i].id = atoi(p[0].c_str());
oneuser[i].movies.push_back(atoi(p[1].c_str()));//只用到了ratings文件的前两列数据。
}
for (int s = 0; s < oneuser.size() - 100; s++)
{
for (int j = 0; j < onemovie.size() - 100; j++)
{
if (oneuser[s].movies[0] == onemovie[j].id)
{
cout << oneuser[s].movies[0] << endl;
cout << onemovie[j].id << endl;
oneuser[s].predictmovies.assign(onemovie[j].sames.begin(), onemovie[j].sames.end());//根据用户看过的电影,把符合相似度要求的电影合并到一起,但此处未去重,可能影响推荐效果
for (int k = 0; k < onemovie[s].sames.size()-1; k++)
{
cout << onemovie[j].sames[k] << endl;
}
for (int k = 0; k < oneuser[s].predictmovies.size()-1; k++)
{
cout << oneuser[s].predictmovies[k] << endl;
}
oneuser.erase(oneuser.begin() + j);
}
else
{
break;
}
}
}
for (int s = 0; s < 5; s++)//由于时间问题这里只输出前五个用户的计算结果
{
for (int j = 0; j < oneuser.size() - 10000; j++)
{
if (oneuser[s].id == oneuser[j].id && j != s)
{
oneuser[s].movies.assign(oneuser[j].movies.begin(), oneuser[j].movies.end());
oneuser[s].predictmovies.assign(oneuser[j].predictmovies.begin(), oneuser[j].predictmovies.end());
for (int k = 0; k < oneuser[s].predictmovies.size() - 1; k++)
{
cout << oneuser[s].predictmovies[k] << endl;
}
oneuser.erase(oneuser.begin() + j);
}
}
}
for (int s = 0; s < 5; s++)
{
oneuser[s].prediction = predict_ratio(oneuser[s].predictmovies, oneuser[s].movies);
oneuser[s].recall = recall_ratio(oneuser[s].predictmovies, oneuser[s].movies);
cout << "对用户预测的准确率为:" << oneuser[s].prediction << "\t" << "召回率为:" << oneuser[s].recall << endl;
}
system("pause");
return 0;
}