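# User-based collaborative filtering.
# Reference article: https://blog.csdn.net/likeyou1314918273/article/details/89607596
# Reference code and data: https://blog.csdn.net/qq_25948717/article/details/81839463
# Most of the code has been modified from those references.
# Implemented in Python.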
import csv
import math as mh

import pandas as pd
# Load the movies and ratings tables, join them on movieId, and dump a flat,
# header-less CSV (userId, movieId, rating, title, genres) sorted by userId.
movies = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small/movies.csv")
ratings = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small//ratings.csv")
data = movies.merge(ratings, on='movieId')
csv_file = "D:/Python/Python36/ML/datasets/ml-latest-small/data.csv"
export_columns = ['userId', 'movieId', 'rating', 'title', 'genres']
export_frame = data[export_columns].sort_values('userId')
export_frame.to_csv(csv_file, index=False, header=None)
# Represent the movies each user rated as a nested dict:
#     {userId: {title: rating}}
# Ids and ratings are kept as the strings read from the file; consumers call
# float() on a rating when they need a number.
data = {}
# csv.reader is used instead of str.split(','): to_csv quotes titles that
# contain commas, and a naive split truncates such titles at the first comma,
# so different movies can collapse onto the same key and overwrite each other.
with open(csv_file, 'r', encoding='UTF-8', newline='') as file:
    for row in csv.reader(file):
        if len(row) < 4:
            continue  # skip blank or malformed lines instead of crashing
        user_id, _movie_id, rating, title = row[:4]
        data.setdefault(user_id, {})[title] = rating
# Similarity between two users as the inverse of their Euclidean distance
def euclidean(userId1, userId2):
    """Return the Euclidean-distance similarity between two users.

    Users do not rate exactly the same movies, so only the movies rated by
    both users are compared. The Euclidean distance ``d`` over those shared
    ratings is mapped to the similarity ``1 / (1 + d)``.

    Args:
        userId1: Key of the first user in the module-level ``data`` dict.
        userId2: Key of the second user in ``data``.

    Returns:
        A float in [0, 1]: 1.0 when all shared ratings are identical,
        smaller as they diverge, and 0.0 when the users share no movie.

    Raises:
        KeyError: If either user id is missing from ``data``.
    """
    ratings1 = data[userId1]
    ratings2 = data[userId2]
    # ``data`` is keyed by movie title, not by movieId.
    shared = [title for title in ratings1 if title in ratings2]
    if not shared:
        # Without this guard the distance stays 0 and two users with nothing
        # in common would receive the maximum similarity of 1.0.
        return 0.0
    squared = sum(
        (float(ratings1[title]) - float(ratings2[title])) ** 2 for title in shared
    )
    return 1.0 / (1.0 + mh.sqrt(squared))
# Pearson correlation coefficient between two users
def pearson(userId1, userId2):
    """Return the Pearson correlation of two users' ratings on shared movies.

    Args:
        userId1: Key of the first user in the module-level ``data`` dict.
        userId2: Key of the second user in ``data``.

    Returns:
        The correlation coefficient, or 0 when the users share no movie or
        when the denominator of the formula is zero.
    """
    ratings_a = data[userId1]
    ratings_b = data[userId2]

    # Only movies rated by both users take part in the computation.
    shared = [title for title in ratings_a if title in ratings_b]
    count = len(shared)
    if not count:
        return 0

    xs = [float(ratings_a[title]) for title in shared]
    ys = [float(ratings_b[title]) for title in shared]

    # Sum of pairwise products, plain sums, and sums of squares.
    sum_xy = sum([x * y for x, y in zip(xs, ys)])
    sum_x = sum(xs)
    sum_y = sum(ys)
    sum_xx = sum([mh.pow(x, 2) for x in xs])
    sum_yy = sum([mh.pow(y, 2) for y in ys])

    numerator = count * sum_xy - sum_x * sum_y
    denominator = (
        mh.sqrt(count * sum_xx - sum_x * sum_x)
        * mh.sqrt(count * sum_yy - sum_y * sum_y)
    )
    if denominator == 0:
        return 0
    return numerator / denominator
# Smoke test: compare users '1' and '8' with both similarity measures
e1 = euclidean('1', '8')
p1 = pearson('1', '8')
print("e1 = ", e1)
print("p1 = ", p1)
# Find the K users most similar to the given user
def similar_users(userId, K):
    """Return up to K ``(user_id, similarity)`` pairs, most similar first.

    Args:
        userId: Key of the target user in the module-level ``data`` dict.
        K: Number of neighbours to keep; ``None`` keeps every other user.

    Returns:
        A list of ``(user_id, similarity)`` tuples sorted by descending
        ``euclidean`` similarity. The target user itself is excluded.
    """
    limit = len(data) if K is None else K
    scored = [
        (other, euclidean(userId, other))
        for other in data
        if other != userId
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]
# Build the rating matrix of the user's neighbours over movies the user has not rated
def unrated_movies_matrix(userId, k):
    """Build the neighbours-by-candidate-movies rating matrix for a user.

    Candidate movies are those rated by at least one of the ``k`` most
    similar users but not by ``userId``.

    Args:
        userId: Key of the target user in the module-level ``data`` dict.
        k: Number of neighbours, forwarded to ``similar_users``.

    Returns:
        A DataFrame with one row per neighbour and the columns ``'user'``,
        ``'simi'``, followed by one column per candidate movie title (sorted)
        holding that neighbour's rating as a float, or 0.0 if not rated.
        With no neighbours the frame is empty but still has the
        ``'user'`` and ``'simi'`` columns.

    Side effects:
        Writes the matrix to ``unweighted_score_matrix_users.csv``.
    """
    # The k most similar users as (user_id, similarity) pairs.
    neighbours = similar_users(userId, k)
    already_rated = data[userId]

    # Movies rated by some neighbour but not by the target user.
    candidates = set()
    for neighbour, _ in neighbours:
        for movie in data[neighbour]:
            if movie not in already_rated:
                candidates.add(movie)
    # Sort so the column order does not depend on set iteration order.
    unrated_movies = sorted(candidates)

    cols = ['user', 'simi'] + unrated_movies
    rows = [
        [neighbour, simi]
        + [float(data[neighbour].get(movie, 0.0)) for movie in unrated_movies]
        for neighbour, simi in neighbours
    ]
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    rated_matrix = pd.DataFrame(rows, columns=cols)

    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/unweighted_score_matrix_users.csv"
    rated_matrix.to_csv(csv1_file)
    return rated_matrix
# Weight the neighbours' ratings by similarity to estimate the user's interest in unrated movies
def unrated_interest_of_user(userId, k, topN):
    """Return the ``topN`` unrated movies with the highest predicted interest.

    Each candidate movie is scored as the similarity-weighted average of the
    neighbours' ratings: ``sum(simi * rating) / sum(simi)``. A neighbour who
    did not rate the movie contributes a rating of 0.0.

    Args:
        userId: Key of the target user in the module-level ``data`` dict.
        k: Number of neighbours to use.
        topN: Maximum number of ``(title, score)`` pairs to return.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score.

    Side effects:
        Writes the neighbour matrix plus a final weighted-score row to
        ``weighted_score_matrix_users.csv``.
    """
    unrated_matrix = unrated_movies_matrix(userId, k)
    cols = list(unrated_matrix.columns)
    movies_name = cols[2:]

    weights = unrated_matrix['simi']
    simi_sum = float(weights.sum())

    rated_simi = []
    for movie in movies_name:
        if simi_sum:
            score = float(weights.mul(unrated_matrix[movie]).sum()) / simi_sum
        else:
            # All neighbours have zero similarity: avoid dividing by zero.
            score = 0.0
        rated_simi.append((movie, score))

    # Extra summary row: the label, the similarity total, then one score per movie.
    weighted_rated_simi = ['users', simi_sum] + [score for _, score in rated_simi]
    df_weighted_score = pd.DataFrame([weighted_rated_simi], columns=cols)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    rated_matrix = pd.concat([unrated_matrix, df_weighted_score], ignore_index=True)
    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/weighted_score_matrix_users.csv"
    rated_matrix.to_csv(csv1_file)

    unrated_simi = sorted(rated_simi, key=lambda x: x[1], reverse=True)
    return unrated_simi[:topN]
# Recall the top-N movies for user '4' using 5 neighbours
unrated_interest = unrated_interest_of_user(userId='4', k=5, topN=10)
print(unrated_interest)