基于物品的协同过滤算法
参考文章:https://blog.csdn.net/likeyou1314918273/article/details/89607596
参考代码和数据: https://blog.csdn.net/qq_25948717/article/details/81839463
大部分代码有修改。
数据的读取和拼接用pandas实现。
用户、电影、评分等基础数据目前使用字典数据结构存储,后续可以优化为pandas数据框实现。
代码中拼接未评分物品的相似度矩阵用pandas数据框实现。
代码中用户对未评分物品的喜好程度的数据用pandas数据框实现。
python实现
import csv
import math as mh

import pandas as pd
# Load the movies and ratings tables and join every rating with its movie row.
movies_data = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small/movies.csv")
ratings_data = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small//ratings.csv")
data = pd.merge(movies_data, ratings_data, on='movieId')

# Persist the joined table ordered by user, without header row or index column.
csv_file = "D:/Python/Python36/ML/datasets/ml-latest-small/data.csv"
data[['userId', 'movieId', 'rating', 'title', 'genres']].sort_values('userId').to_csv(
    csv_file, index=False, header=None)

# Represent each user's rated movies as a nested dict: {userId: {title: rating}}.
# User ids and ratings are kept as the strings read from the CSV; consumers
# convert ratings with float() where they need numbers.
data = {}
# csv.reader is used instead of str.split(',') because fields containing a
# comma (for example a title such as "Title, The (1995)") are written quoted;
# a naive split would shift the columns and store only a fragment of the title.
# newline='' is the open mode the csv module documents for its readers.
with open(csv_file, 'r', encoding='UTF-8', newline='') as file:
    for row in csv.reader(file):
        if not row:
            continue  # tolerate blank lines
        user_id, rating, title = row[0], row[2], row[3]
        data.setdefault(user_id, {})[title] = rating
# print(data)
"""计算任何两个电影之间的相似度,由于每位用户评论的电影不完全一样,所以首先要找到两位用户共同评论过的电影
然后计算两者之间的距离,最后算出两者之间的相似度
"""
# Similarity between two movies: reciprocal of (1 + Euclidean distance).
def euclidean(movie1, movie2, ratings=None):
    """Return the similarity of two movies derived from their Euclidean distance.

    Each movie is treated as a vector holding one rating per user; a user who
    has not rated a movie contributes 0.0 for it. The similarity is
    ``1 / (1 + distance)``, so two identical rating vectors score 1.0.

    Args:
        movie1: Title of the first movie.
        movie2: Title of the second movie.
        ratings: Optional ``{userId: {title: rating}}`` mapping to use instead
            of the module-level ``data``. Ratings may be numbers or numeric
            strings; they are converted with ``float``.

    Returns:
        The similarity as a float in the interval (0, 1].
    """
    if ratings is None:
        ratings = data
    squared_distance = 0.0
    for user_ratings in ratings.values():
        diff = float(user_ratings.get(movie1, 0.0)) - float(user_ratings.get(movie2, 0.0))
        squared_distance += diff * diff
    return 1.0 / (1.0 + mh.sqrt(squared_distance))
# distance = euclidean('Absolute Power (1997)', 'Buffy the Vampire Slayer (1992)')
# Pearson correlation coefficient between two users.
def pearson(userId1, userId2, ratings=None):
    """Return the Pearson correlation of two users over their common movies.

    Only movies rated by both users take part in the computation.

    Args:
        userId1: Key of the first user in the ratings mapping.
        userId2: Key of the second user in the ratings mapping.
        ratings: Optional ``{userId: {title: rating}}`` mapping to use instead
            of the module-level ``data``. Ratings may be numbers or numeric
            strings; they are converted with ``float``.

    Returns:
        The correlation coefficient, or 0 when the users share no movie or
        when the denominator is zero (one user's common ratings are constant).

    Raises:
        KeyError: If either user id is missing from the ratings mapping.
    """
    if ratings is None:
        ratings = data
    movies_user1 = ratings[userId1]
    movies_user2 = ratings[userId2]

    # Rating pairs for the movies both users rated, converted to float once.
    pairs = [
        (float(movies_user1[movie]), float(movies_user2[movie]))
        for movie in movies_user1
        if movie in movies_user2
    ]
    n = len(pairs)
    if n == 0:
        return 0

    # Sum of products, plain sums and sums of squares of the paired ratings.
    sum12 = sum(x * y for x, y in pairs)
    sum1 = sum(x for x, _ in pairs)
    sum2 = sum(y for _, y in pairs)
    square_sum1 = sum(x * x for x, _ in pairs)
    square_sum2 = sum(y * y for _, y in pairs)

    num = n * sum12 - sum1 * sum2
    # Mathematically these terms are never negative, but floating-point
    # rounding can push them slightly below zero, which would make sqrt raise
    # ValueError; clamp them at 0 instead.
    spread1 = max(n * square_sum1 - sum1 * sum1, 0.0)
    spread2 = max(n * square_sum2 - sum2 * sum2, 0.0)
    den = mh.sqrt(spread1) * mh.sqrt(spread2)
    if den == 0:
        return 0
    return num / den
# Quick check: Pearson correlation between users '1' and '8'.
p1 = pearson('1', '8')
print("p1 = ", p1)
# Select the user's rated movies that are most similar to their "top" movie;
# these are later compared against the movies the user has not rated.
def similar_movies_based_topmovie(userId, k):
    """Return up to ``k`` of the user's rated movies closest to their top movie.

    Steps:
      1. Score every pair of movies rated by ``userId`` with ``euclidean``.
      2. Keep the ``k`` most similar pairs and count how often each movie
         occurs in them; the movie with the highest count (ties broken by
         the greater title) is the "top movie".
      3. Rank the user's other rated movies by similarity to the top movie
         and keep the first ``k``.

    Args:
        userId: Key of the user in the module-level ``data`` mapping.
        k: Number of pairs considered in step 2 and of movies returned.

    Returns:
        A list of ``(title, rating)`` tuples, where ``rating`` is the value
        stored in ``data`` for that user and title. The list is empty when no
        top movie can be determined (the user rated fewer than two movies or
        ``k`` selects no pair).

    Raises:
        KeyError: If ``userId`` is not present in ``data``.
    """
    user_ratings = data[userId]
    movies = list(user_ratings)

    # Step 1: similarity of every unordered pair of the user's rated movies.
    movies_simi = []
    for i, first in enumerate(movies):
        for second in movies[i + 1:]:
            movies_simi.append((first, second, euclidean(first, second)))
    movies_simi.sort(key=lambda val: val[2], reverse=True)

    # Step 2: count appearances within the k most similar pairs.
    movie_count = {}
    for first, second, _ in movies_simi[:k]:
        for title in (first, second):
            movie_count[title] = movie_count.get(title, 0) + 1
    if not movie_count:
        # No pair available: there is no top movie, hence nothing to rank.
        # (Indexing the empty ranking here used to raise IndexError.)
        return []
    # Highest count wins; on equal counts the greater title wins.
    top_movie = max(movie_count.items(), key=lambda kv: (kv[1], kv[0]))[0]

    # Step 3: the user's other rated movies, most similar to the top movie first.
    similar_movies = [
        (movie, euclidean(top_movie, movie))
        for movie in movies
        if movie != top_movie
    ]
    similar_movies.sort(key=lambda val: val[1], reverse=True)

    # Report the user's own rating for each selected movie, not the similarity.
    return [(movie, user_ratings.get(movie)) for movie, _ in similar_movies[:k]]
# Build the similarity matrix between the selected rated movies and the
# movies the user has not rated.
def unrated_movies_matrix(userId, k):
    """Build the rated-vs-unrated similarity matrix for one user.

    The frame has one row per movie returned by
    ``similar_movies_based_topmovie(userId, k)``. Its columns are ``'movie'``
    (title), ``'rating'`` (the user's rating as a float) and then one column
    per title the user has not rated, holding the ``euclidean`` similarity
    between the row's movie and that title.

    Side effect: the frame is written to
    ``unweighted_score_matrix_movies.csv`` at the hard-coded dataset path.

    Args:
        userId: Key of the user in the module-level ``data`` mapping.
        k: Forwarded to ``similar_movies_based_topmovie``.

    Returns:
        The similarity matrix as a ``pandas.DataFrame``; it has no rows when
        no similar rated movie is available.
    """
    rated = data[userId]
    # Every title rated by some user but not by this one. Sorted so the column
    # order is reproducible between runs (a set of strings has no stable order).
    unratedMovies = sorted({
        movie
        for user_ratings in data.values()
        for movie in user_ratings
        if movie not in rated
    })
    cols = ['movie', 'rating']
    cols.extend(unratedMovies)

    rows = []
    for movie, score in similar_movies_based_topmovie(userId, k):
        # Row layout: title, the user's rating, then similarity to each unrated movie.
        row = [movie, float(score)]
        row.extend(euclidean(movie, unratedMovie) for unratedMovie in unratedMovies)
        rows.append(row)

    # Build the frame in a single step: DataFrame.append was removed in
    # pandas 2.0, and passing the columns here also works with zero rows.
    unrated_matrix = pd.DataFrame(rows, columns=cols)
    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/unweighted_score_matrix_movies.csv"
    unrated_matrix.to_csv(csv1_file)
    return unrated_matrix
# Smoke-run both builders for user '1' with k=3; unrated_movies_matrix also
# writes its CSV file as a side effect.
a = similar_movies_based_topmovie('1', 3)
b = unrated_movies_matrix('1', 3)
# Estimate how much the given user would like each movie they have not rated.
def unrated_interest_of_user(userId, k, topN):
    """Rank the user's unrated movies by a rating-weighted similarity score.

    For every unrated movie the score is the average of its similarities to
    the rows of ``unrated_movies_matrix(userId, k)``, weighted by the user's
    rating of each row's movie: ``sum(rating * similarity) / sum(rating)``.

    Side effect: the matrix plus one extra summary row (``'movies'`` in the
    ``'movie'`` column, the rating sum in ``'rating'`` and each movie's score
    in its own column) is written to ``weighted_score_matrix_movies.csv`` at
    the hard-coded dataset path.

    Args:
        userId: Key of the user in the module-level ``data`` mapping.
        k: Forwarded to ``unrated_movies_matrix``.
        topN: Maximum number of recommendations to return.

    Returns:
        A list of at most ``topN`` ``(title, score)`` tuples, highest score
        first.
    """
    unrated_matrix = unrated_movies_matrix(userId, k)
    cols = list(unrated_matrix.columns)
    ratings = unrated_matrix['rating']
    scores_sum = sum(ratings)
    movies_name = cols[2:]

    rated_simi = []
    weighted_rated_simi = ['movies', scores_sum]
    for movie in movies_name:
        if scores_sum:
            simi = sum(ratings.mul(unrated_matrix[movie])) / scores_sum
        else:
            # No ratings to weight by: the average is undefined, report 0.0
            # rather than dividing by zero.
            simi = 0.0
        rated_simi.append((movie, simi))
        weighted_rated_simi.append(simi)

    # Append the summary row. pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    df_weighted_score = pd.DataFrame([weighted_rated_simi], columns=cols)
    rated_matrix = pd.concat([unrated_matrix, df_weighted_score], ignore_index=True)
    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/weighted_score_matrix_movies.csv"
    rated_matrix.to_csv(csv1_file)

    rated_simi.sort(key=lambda x: x[1], reverse=True)
    return rated_simi[:topN]
# Recall the topN (here 5) unrated movies for user '1', using k=3.
unrated_movies_interest = unrated_interest_of_user('1', 3, 5)
print("unrated_movies_interest=", unrated_movies_interest)