# 本实现几乎完全按照王喆的《深度学习推荐系统》复现:相似度采用皮尔逊相似度,用物品平均分对各条独立评分作修正;如果某个物品的评分全都一样,则对该物品部分退化成余弦相似度。但是最后效果较差,MAE 高达惊人的 0.9,运算速度也极其缓慢,当然这也可能与所使用的语言有关。由于是完全手写,可以很明显地感受到 Python 的笨拙,没有 C++ 那么轻便。
import random
import math
from operator import itemgetter
class ItemBasedCF():
    """Item-based collaborative filtering over a tab-separated ratings file.

    Workflow: ``get_dataset`` splits the ratings into train/test sets,
    ``calcu_movie_sim`` fills the item-item similarity matrix, ``recommend``
    predicts ratings for one user, and ``evaluate`` reports precision,
    recall and MAE over every training user.
    """

    def __init__(self):
        """Create an empty model and print the two neighbourhood sizes."""
        # Consider the 20 most similar movies per watched movie and
        # recommend 10 movies per user.
        self.n_sim_movie = 20
        self.n_rec_movie = 10
        # trainSet maps movie -> {user: rating};
        # testSet maps user -> {movie: rating}.
        self.trainSet = {}
        self.testSet = {}
        # user_movie_matrix holds the training ratings as
        # user -> {movie: rating}; movie_sim_matrix maps
        # movie -> {other movie: similarity}.
        self.user_movie_matrix = {}
        self.movie_sim_matrix = {}
        # Not written by any method of this class; kept so external code
        # that reads these attributes keeps working.
        self.movie_popular = {}
        self.movie_count = 0
        print("相似电影个数是 %d 个" %self.n_sim_movie)
        print("推荐电影个数是 %d 个" % self.n_rec_movie)

    def get_dataset(self, filename, pivot=0.7):
        """Read ``filename`` and randomly split its ratings into train/test.

        Each data line must hold four tab-separated fields: user, movie,
        rating, timestamp. A rating goes to the training set with
        probability ``pivot`` and to the test set otherwise.

        Args:
            filename: path of the ratings file, passed to ``load_file``.
            pivot: probability that a rating lands in the training set.

        Raises:
            ValueError: a non-blank line does not have exactly four fields
                or its rating is not a number.
        """
        train_len = 0
        test_len = 0
        for line in self.load_file(filename):
            if not line.strip():
                continue  # tolerate blank lines instead of failing the unpack
            user, movie, rating, _timestamp = line.split('\t')
            score = float(rating)
            # The rating of user '405' for movie '288' is always held out
            # as a probe; no random number is drawn for it.
            held_out = user == '405' and movie == '288'
            if held_out or random.random() >= pivot:
                self.testSet.setdefault(user, {})[movie] = score
                test_len += 1
            else:
                self.user_movie_matrix.setdefault(user, {})[movie] = score
                self.trainSet.setdefault(movie, {})[user] = score
                train_len += 1
        # Only echo the probe rating when the file actually contains it;
        # indexing unconditionally raised KeyError on other data sets.
        probe = self.testSet.get('405', {}).get('288')
        if probe is not None:
            print(probe)
        print("成功分隔训练集和测试集")
        print("训练集长度:%s" % train_len)
        print("测试集长度:%s" % test_len)

    def load_file(self, filename):
        """Yield the lines of ``filename`` without trailing CR/LF.

        The first line of the file is always skipped.
        NOTE(review): ``ml-100k/u.data`` appears to have no header row, in
        which case this drops one rating -- confirm against the data file.
        """
        with open(filename,'r') as f:
            for index, line in enumerate(f):
                if index == 0:
                    continue
                yield line.strip('\r\n')
        # Reached only once the generator has been fully consumed.
        print("%s数据读取成功!" % filename)

    @staticmethod
    def _centered_vector(ratings):
        """Return ``(vector, norm)`` for one movie's ``{user: rating}`` dict.

        Ratings are centred on the movie's mean. When every rating is
        identical the centred vector would be all zeros, so the raw ratings
        are used instead (cosine fallback for that movie).
        """
        if not ratings:
            return {}, 0.0
        values = ratings.values()
        if min(values) == max(values):
            vector = dict(ratings)
        else:
            mean = sum(values) / len(ratings)
            vector = {user: score - mean for user, score in ratings.items()}
        norm = math.sqrt(sum(v * v for v in vector.values()))
        return vector, norm

    def calcu_movie_sim(self):
        """Fill ``movie_sim_matrix`` with a similarity for every movie pair.

        The similarity is the dot product of the two movies' vectors over
        the users who rated both, divided by the product of the norms taken
        over all raters of each movie. Vectors are mean-centred, except for
        a movie whose ratings are all equal (see ``_centered_vector``).
        Pairs whose norm product is zero get similarity 0.
        """
        print("计算相似度矩阵")
        # Centre and measure every movie once rather than once per pair.
        vectors = {}
        norms = {}
        for movie, ratings in self.trainSet.items():
            vectors[movie], norms[movie] = self._centered_vector(ratings)

        matrix = self.movie_sim_matrix
        movies = list(self.trainSet)
        for movie in movies:
            matrix.setdefault(movie, {})

        for index, movie in enumerate(movies):
            vec = vectors[movie]
            # The measure is symmetric, so each unordered pair (including
            # the movie with itself) is computed once and stored twice.
            for other in movies[index:]:
                other_vec = vectors[other]
                denominator = norms[movie] * norms[other]
                if denominator == 0:
                    value = 0
                else:
                    # Walk the shorter vector when intersecting raters.
                    if len(vec) <= len(other_vec):
                        small, large = vec, other_vec
                    else:
                        small, large = other_vec, vec
                    dot = sum(
                        v * large[user]
                        for user, v in small.items()
                        if user in large
                    )
                    value = dot / denominator
                matrix[movie][other] = value
                matrix[other][movie] = value
        print("成功计算相似度矩阵")

    def recommend(self, user):
        """Predict ratings for movies ``user`` has not rated in training.

        For each movie the user rated, its ``n_sim_movie`` most similar
        movies vote with weight equal to their similarity; the prediction
        is the similarity-weighted average of the user's ratings. Only
        positive similarities vote, so every prediction stays between the
        user's lowest and highest rating.

        Args:
            user: user id as stored in ``user_movie_matrix``.

        Returns:
            Up to ``n_rec_movie`` ``(movie, predicted_rating)`` pairs sorted
            by predicted rating, highest first. Empty for an unknown user.
        """
        n = self.n_rec_movie
        k = self.n_sim_movie
        weighted_ratings = {}
        weight_totals = {}
        watched = self.user_movie_matrix.get(user, {})
        for movie, rating in watched.items():
            neighbours = sorted(
                self.movie_sim_matrix.get(movie, {}).items(),
                key=itemgetter(1),
                reverse=True,
            )[:k]
            for related, weight in neighbours:
                if weight <= 0:
                    # Sorted descending: every remaining neighbour is also
                    # non-positive and would distort the weighted average.
                    break
                if related in watched:
                    continue
                weighted_ratings[related] = (
                    weighted_ratings.get(related, 0) + float(weight) * float(rating)
                )
                weight_totals[related] = weight_totals.get(related, 0) + float(weight)
        rank = {
            movie: weighted_ratings[movie] / weight_totals[movie]
            for movie in weighted_ratings
        }
        print("预测")
        print(rank)
        return sorted(rank.items(),key=itemgetter(1), reverse=True)[:n]

    def evaluate(self):
        """Recommend for every training user and print precision/recall/MAE.

        A hit is a recommended movie that appears in the user's test set.
        MAE is averaged over hits only, i.e. over the top-ranked predictions
        that happen to have a held-out rating.

        Returns:
            ``(precision, recall, mae)``. Precision and recall are 0.0 when
            their denominator is zero; MAE is NaN when there are no hits.
        """
        print("开始评估")
        n = self.n_rec_movie
        hit = 0
        rec_count = 0
        test_count = 0
        abs_error = 0
        for user in self.user_movie_matrix:
            test_movies = self.testSet.get(user,{})
            rec_movies = self.recommend(user)
            print("用户id:%s" %(user))
            for movie, predicted in rec_movies:
                if movie in test_movies:
                    hit += 1
                    print("电影id:%s,电影预测评分%.3f电影实际评分%.3f" %(movie,float(predicted),float(test_movies[movie])))
                    abs_error += abs(predicted - float(test_movies[movie]))
            # Precision is measured against the full quota of n slots per
            # user, even when fewer movies were actually recommended.
            rec_count += n
            test_count += len(test_movies)
        mae = abs_error / hit if hit else float('nan')
        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        print("推荐准确度是%.3f,召回率是%.3f,MAE是%.3f" %(precision,recall,mae))
        return precision, recall, mae
if __name__ == '__main__':
    # Script entry point: split the ratings file, build the item-item
    # similarity matrix, then print the evaluation metrics.
    recommender = ItemBasedCF()
    recommender.get_dataset(r'ml-100k/u.data')
    recommender.calcu_movie_sim()
    recommender.evaluate()