surprise推荐系统实践

最新推荐文章于 2024-01-10 09:32:32 发布

Madname

最新推荐文章于 2024-01-10 09:32:32 发布

阅读量737

点赞数

分类专栏：推荐系统　文章标签：surprise、推荐系统

本文链接：https://blog.csdn.net/weixin_43282288/article/details/101549400

版权

推荐系统专栏收录该内容

1 篇文章 0 订阅

订阅专栏

协同过滤

# http://surprise.readthedocs.io/en/stable/index.html
# http://files.grouplens.org/datasets/movielens/ml-100k-README.txt
from surprise import KNNBasic,SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Fetch the built-in MovieLens-100k rating data.
data = Dataset.load_builtin('ml-100k')

# Collaborative filtering: a basic k-NN model with its default settings.
algo = KNNBasic()

# Evaluate with 3-fold cross-validation, reporting RMSE and MAE.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

在这里插入图片描述
交叉验证

from surprise.model_selection import GridSearchCV

# Hyper-parameter grid for SVD:
#   n_epochs - number of training epochs
#   lr_all   - learning rate
#   reg_all  - regularization (penalty) term
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)

# Search the grid with 3-fold cross-validation, scoring each combination
# on RMSE (root mean squared error) and FCP (fraction of concordant pairs).
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'fcp'], cv=3)
grid_search.fit(data)

# For each measure, print the best score followed by the parameter
# combination that achieved it (RMSE first, then FCP).
for measure in ('rmse', 'fcp'):
    print(grid_search.best_score[measure])
    print(grid_search.best_params[measure])

在这里插入图片描述

import pandas as pd  
# Convert the grid-search results dictionary into a table.
results_df = pd.DataFrame(grid_search.cv_results)
# Bare expression: the table is shown when this is the last expression of
# an interactive (notebook) cell.
results_df

在这里插入图片描述
矩阵转换并打印推荐

import io  # needed because of weird encoding of u.item file
import os

from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir


def read_item_names():
    """Build lookup tables between MovieLens-100k raw item ids and titles.

    Reads the pipe-delimited ``u.item`` file from the Surprise dataset
    directory. The first field of every record is used as the raw item id
    and the second field as the movie name.

    Returns:
        tuple: ``(rid_to_name, name_to_rid)`` where ``rid_to_name`` maps a
        raw item id (str) to its movie name and ``name_to_rid`` maps a movie
        name to a raw item id (str). When several records share the same
        name, ``name_to_rid`` keeps the id of the last such record read.

    Raises:
        OSError: if the ``u.item`` file cannot be opened.
    """
    # Join path components instead of concatenating strings so separators
    # are handled consistently whatever get_dataset_dir() returns.
    file_name = os.path.join(get_dataset_dir(), 'ml-100k', 'ml-100k', 'u.item')
    rid_to_name = {}
    name_to_rid = {}
    # The file is decoded as ISO-8859-1, as in the original code.
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.split('|')
            if len(fields) < 2:
                # Blank or truncated record: no id/name pair to store.
                continue
            raw_id, name = fields[0], fields[1]
            rid_to_name[raw_id] = name
            name_to_rid[name] = raw_id

    return rid_to_name, name_to_rid


# Step 1: train the algorithm so that item-item similarities are computed.
data = Dataset.load_builtin('ml-100k')
# Build a single training set out of the whole dataset (no held-out split).
trainset = data.build_full_trainset()
# Pearson-baseline similarity; user_based=False selects item-based
# similarities.
sim_options = dict(name='pearson_baseline', user_based=False)
# k-NN collaborative filtering with the similarity options above.
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Step 2: item-based lookup starting from a movie name.
rid_to_name, name_to_rid = read_item_names()

# The model does not understand movie names, so first translate
# 'Toy Story (1995)' into its raw id from the data file ...
toy_story_raw_id = name_to_rid['Toy Story (1995)']

# ... and then into the inner id used by the fitted training set.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Fetch the inner ids of the 10 nearest neighbors.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Map each neighbor back: inner id -> raw id -> movie name (lazily).
toy_story_neighbors = (
    rid_to_name[algo.trainset.to_raw_iid(inner_id)]
    for inner_id in toy_story_neighbors
)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

在这里插入图片描述