Hands-On Recommendation with Surprise
- This post works through a recommendation example on the MovieLens dataset. Its format is user item rating timestamp; only the first three columns are used here. When working with your own data, the timestamp column can be replaced by another feature, which is not covered in detail here.
- The post is built on the open-source recommendation framework Surprise.
- The official example loads the dataset directly with Dataset.load_builtin('ml-100k'), but that never worked for me, so I downloaded the dataset myself and read it from local disk.
- Item-based collaborative filtering is used here, where the items are movies. Similarity is computed with the Pearson correlation coefficient.
- The slightly tricky part of the code is the conversion chain name <-> rid <-> inner_id. Here rid (the raw id) acts as the bridge: it is the original id of each movie in the data file. When the Pearson similarity matrix is computed during training, every movie is mapped to another id; to_inner_iid() in the code converts a raw id into the inner_id used by the similarity matrix. After the nearest neighbors are computed, the returned inner_ids have to be converted back into movie names, again via the raw id. It sounds convoluted; the code below makes it concrete.
import io
from surprise import KNNBaseline
from surprise import Dataset, Reader

def read_item_names():
    # build the raw-id <-> movie-name mappings from u.item
    item_file = 'your path' + '/ml-100k/u.item'
    rid_2_name = {}
    name_2_rid = {}
    with io.open(item_file, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_2_name[line[0]] = line[1]
            name_2_rid[line[1]] = line[0]
    return rid_2_name, name_2_rid

reader = Reader(line_format='user item rating timestamp', sep='\t')
file_path = 'your path' + '/ml-100k'
data = Dataset.load_from_file(file_path=file_path + '/u.data', reader=reader)
train_set = data.build_full_trainset()

# item-based CF with the (shrunk) Pearson-baseline similarity
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(train_set)

rid_2_name, name_2_rid = read_item_names()

# movie name -> raw id -> inner id used by the similarity matrix
toy_story_raw_id = name_2_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# 10 nearest neighbors, then map inner ids back to raw ids and names
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_2_name[rid] for rid in toy_story_neighbors)

print('The 10 movies most similar to Toy Story by Pearson similarity:\n')
for movie in toy_story_neighbors:
    print(movie)
- Result screenshot:
---
- Surprise
- Training a model on your own dataset
- Building and saving models
Surprise
For the modeling part of the recommender system we use the Python library Surprise (Simple Python RecommendatIon System Engine), one of the scikit family of projects (many of you will have used scikit-learn, scikit-image, and so on). Surprise's User Guide gives detailed explanations and documentation.
It is easy to use and supports a variety of recommendation algorithms.
The neighborhood-based methods (collaborative filtering) can be configured with different similarity measures; a configuration sketch follows the table below.
| Similarity measure | Description |
|---|---|
| cosine | Compute the cosine similarity between all pairs of users (or items). |
| msd | Compute the Mean Squared Difference similarity between all pairs of users (or items). |
| pearson | Compute the Pearson correlation coefficient between all pairs of users (or items). |
| pearson_baseline | Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items) using baselines for centering instead of means. |
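These measures are selected through the sim_options dictionary passed to a neighborhood algorithm; a minimal sketch (the choice of KNNWithMeans and the parameter values are only illustrative):
from surprise import KNNWithMeans

# pick the similarity measure and whether it is user-user or item-item
sim_options = {
    'name': 'pearson',      # one of: cosine, msd, pearson, pearson_baseline
    'user_based': False,    # False -> compute similarities between items
}
algo = KNNWithMeans(k=40, sim_options=sim_options)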
Different evaluation metrics are also supported; a usage sketch follows the table.
| Metric | Description |
|---|---|
| rmse | Compute RMSE (Root Mean Squared Error). |
| mae | Compute MAE (Mean Absolute Error). |
| fcp | Compute FCP (Fraction of Concordant Pairs). |
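Besides evaluate(), these metrics can be computed directly with the accuracy module on a list of predictions; a sketch using the same old-style folds API as the rest of this post:
from surprise import KNNBasic, Dataset, accuracy

data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

algo = KNNBasic()
for trainset, testset in data.folds():
    algo.train(trainset)                 # fit on the training fold
    predictions = algo.test(testset)     # predict the held-out ratings
    accuracy.rmse(predictions)           # Root Mean Squared Error
    accuracy.mae(predictions)            # Mean Absolute Error
    accuracy.fcp(predictions)            # Fraction of Concordant Pairs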
Usage example
Basic usage is as follows:
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
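After evaluation, the same kind of model can also score a single user-item pair with predict(); a minimal self-contained sketch (the specific ids are only illustrative, and raw ids are strings in ml-100k):
from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')
algo = SVD()

# train on the full dataset, then estimate one rating
trainset = data.build_full_trainset()
algo.train(trainset)

# r_ui is the (optional) true rating, used only for display
pred = algo.predict(uid='196', iid='302', r_ui=4, verbose=True)
print(pred.est)   # the estimated rating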
Loading your own dataset:
import os
from surprise import Dataset, Reader

file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)
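For reference, each line of u.data is tab-separated as user id, item id, rating, timestamp, which is exactly what line_format and sep above describe; a quick sanity check (same path as above):
import os

path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
with open(path) as f:
    # a line looks like "196\t242\t3\t881250949": user 196 gave item 242 a 3
    print(f.readline().strip())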
Tuning the algorithm (for a better recommender)
Under the hood these algorithms are trained with SGD and the like, so a few hyperparameters affect the final result. As with sklearn's grid search cross-validation (GridSearchCV), we can search for the best parameters. A simple example:
from surprise import SVD, Dataset
from surprise import GridSearch

# grid of hyperparameters to try for SVD
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
grid_search.evaluate(data)

# best RMSE score and the parameters that achieve it
print(grid_search.best_score['RMSE'])
print(grid_search.best_params['RMSE'])
# best FCP score and the parameters that achieve it
print(grid_search.best_score['FCP'])
print(grid_search.best_params['FCP'])
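The winning parameters can then be fed back into the algorithm and trained on the full data; a minimal sketch that reuses the grid_search and data objects above:
# rebuild SVD with the parameter combination that minimised RMSE
best_params = grid_search.best_params['RMSE']
algo = SVD(n_epochs=best_params['n_epochs'],
           lr_all=best_params['lr_all'],
           reg_all=best_params['reg_all'])
algo.train(data.build_full_trainset())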
Training a model on your own dataset
First, load the data:
import os
from surprise import Reader, Dataset
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
reader = Reader(line_format='user item rating timestamp', sep=',')
music_data = Dataset.load_from_file(file_path, reader=reader)
music_data.split(n_folds=5)
Modeling with different recommendation algorithms and comparing them
# NormalPredictor: random predictions drawn from the estimated rating distribution
from surprise import NormalPredictor, evaluate
algo = NormalPredictor()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# BaselineOnly: baseline estimates for users and items
from surprise import BaselineOnly, evaluate
algo = BaselineOnly()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# KNNBasic: basic neighborhood-based collaborative filtering
from surprise import KNNBasic, evaluate
algo = KNNBasic()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# KNNWithMeans: neighborhood CF that takes mean ratings into account
from surprise import KNNWithMeans, evaluate
algo = KNNWithMeans()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# KNNBaseline: neighborhood CF using baseline ratings
from surprise import KNNBaseline, evaluate
algo = KNNBaseline()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# SVD: matrix factorization
from surprise import SVD, evaluate
algo = SVD()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# SVD++: matrix factorization with implicit feedback
from surprise import SVDpp, evaluate
algo = SVDpp()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# NMF: non-negative matrix factorization
from surprise import NMF, evaluate, print_perf
algo = NMF()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
print_perf(perf)
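The eight blocks above follow exactly the same pattern, so they can also be collapsed into a single loop over the algorithm classes; a sketch that assumes the same music_data object loaded earlier:
from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, SVDpp, NMF)
from surprise import evaluate, print_perf

# run every algorithm with the same evaluation protocol
for algo_class in (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                   KNNBaseline, SVD, SVDpp, NMF):
    print(algo_class.__name__)
    perf = evaluate(algo_class(), music_data, measures=['RMSE', 'MAE'])
    print_perf(perf)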
Building and saving models
1. Building a model with collaborative filtering and making predictions
1.1 The MovieLens example
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
"""
以下的程序段告诉大家如何在协同过滤算法建模以后,根据一个item取回相似度最高的item,主要是用到algo.get_neighbors()这个函数
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset


def read_item_names():
    """Build the raw-id <-> movie-name mappings from the u.item file."""
    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid


# train an item-based KNN model with the Pearson-baseline similarity
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)

# raw id <-> name mappings
rid_to_name, name_to_rid = read_item_names()

# movie name -> raw id -> inner id
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# retrieve the 10 nearest neighbors and map them back to movie names
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
1.2 The music playlist example
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io
import cPickle as pickle  # on Python 3, use "import pickle" instead

from surprise import KNNBaseline
from surprise import Dataset, Reader

# playlist id -> playlist name mapping, pickled beforehand
id_name_dic = pickle.load(open("popular_playlist.pkl", "rb"))
print("Loaded the playlist-id -> playlist-name mapping...")
# invert it to get playlist name -> playlist id
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Built the playlist-name -> playlist-id mapping...")

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
reader = Reader(line_format='user item rating timestamp', sep=',')
music_data = Dataset.load_from_file(file_path, reader=reader)
print("Building the dataset...")
trainset = music_data.build_full_trainset()
- current_playlist => the playlist name
- playlist_id => the playlist id (the raw id assigned by NetEase)
- playlist_inner_id => the inner id (all playlist ids re-encoded as consecutive internal ids)
print("开始训练模型...")
algo = KNNBaseline()
algo.train(trainset)
current_playlist = name_id_dic.keys()[39]
print(current_playlist)
playlist_id = name_id_dic[current_playlist]
print(playlist_id)
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print(playlist_inner_id)
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
for playlist_id in playlist_neighbors)
print()
print("和歌单 《", current_playlist, "》 最接近的10个歌单为:\n")
for playlist in playlist_neighbors:
print(playlist)
2. Prediction with SVD matrix factorization
import os

from surprise import SVDpp
from surprise import Dataset, Reader

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
reader = Reader(line_format='user item rating timestamp', sep=',')
music_data = Dataset.load_from_file(file_path, reader=reader)

algo = SVDpp()
trainset = music_data.build_full_trainset()
algo.train(trainset)
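Since this section is about building and saving models, to close the loop: a trained algorithm can score individual playlist-song pairs with predict() and be serialized with surprise.dump. A sketch that reuses the algo object above; the raw ids and the file name are only illustrative:
from surprise import dump

# score one (playlist, song) pair; raw ids are the strings from the data file
pred = algo.predict('playlist_raw_id', 'song_raw_id')
print(pred.est)

# persist the trained model, then reload it later without retraining
dump.dump('./svdpp_music.model', algo=algo)
_, loaded_algo = dump.load('./svdpp_music.model')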