本代码的组织方式模仿并完全参考了《推荐系统实践》一书中的源码:
#coding=UTF-8
'''Created on 2018/8'''
############## MovieLens 数据集 H:\movies\ml-1m ##############
import sys
import random
import math
import os
from operator import itemgetter
from collections import defaultdict
random.seed(0)  # Fixed seed so that later random draws are reproducible across runs
class UserCF(object):
    """User-based collaborative filtering recommender.

    Ratings are read from a ``user::movie::rating::timestamp`` text file,
    split into a training and a test set, a user-user similarity matrix is
    built from the training set, and top-N recommendations are evaluated
    with recall, precision, coverage and popularity.
    """

    def __init__(self):
        """Initialise empty data sets and the default hyper-parameters."""
        self.Train_set = {}  # training set: user -> {movie: rating}
        self.Test_set = {}  # test set: user -> {movie: rating}
        self.M_all_train_test = 6  # number of folds the data is split into
        self.k_split_data = 1  # fold index in [0, M-1] held out as the test set
        self.k_sim_user = 20  # number of most similar users consulted
        self.n_rec_movie = 10  # number of movies recommended per user
        self.w_sim_matrix = {}  # similarity matrix: user -> {other user: weight}
        self.popular_movie = {}  # movie -> number of training users who rated it
        self.rec_movie = {}  # movie -> score from the most recent Recommend() call
        self.count_movie = 0  # number of distinct movies in the training set
        self._log('similar user NO. = %d' % self.k_sim_user)
        self._log('recommended movie NO. = %d' % self.n_rec_movie)

    @staticmethod
    def _log(message):
        """Write a diagnostic message to stderr, keeping stdout for results."""
        print(message, file=sys.stderr)

    def loaddata(self, filename):
        """Yield each line of *filename* with leading/trailing tabs and newlines removed.

        Progress is reported on stderr every 100000 lines. The file is closed
        even when the generator is abandoned before it is exhausted.
        """
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\t\n')
                if i % 100000 == 0:
                    self._log('%s load %s' % (filename, i))
        self._log('%s load END ' % filename)

    def SplitData(self, filename):
        """Split the ratings in *filename* into ``Train_set`` and ``Test_set``.

        Each record is assigned to one of ``M_all_train_test`` equally likely
        folds; records landing in fold ``k_split_data`` go to the test set,
        all others to the training set. Blank lines are ignored.
        """
        M = self.M_all_train_test
        k = self.k_split_data
        testset_len = 0
        trainset_len = 0
        for line in self.loaddata(filename):
            if not line:
                continue
            user, movie, rating, _ = line.split('::')
            # randrange(M) draws from [0, M-1], i.e. exactly M folds;
            # randint(0, M) would be inclusive and produce M+1 folds.
            if random.randrange(M) == k:
                self.Test_set.setdefault(user, {})[movie] = float(rating)
                testset_len += 1
            else:
                self.Train_set.setdefault(user, {})[movie] = float(rating)
                trainset_len += 1
        self._log('split data: train set = %s, test set = %s'
                  % (trainset_len, testset_len))

    def UserSimilarity(self):
        """Build ``w_sim_matrix``, ``popular_movie`` and ``count_movie`` from ``Train_set``.

        The similarity of users u and v is the sum over the movies both rated
        of ``1 / log(1 + number of users who rated the movie)``, divided by
        ``sqrt(len(Train_set[u]) * len(Train_set[v]))``.
        """
        # Inverted table: movie -> set of users who rated it.
        item_users = {}
        for user, items in self.Train_set.items():
            for movie in items:
                item_users.setdefault(movie, set()).add(user)

        # A movie's popularity is the number of training users who rated it.
        self.popular_movie = {movie: len(users) for movie, users in item_users.items()}
        self.count_movie = len(item_users)
        self._log('count movie = %d' % self.count_movie)

        # Start from a fresh matrix so repeated calls do not add onto old weights.
        self.w_sim_matrix = {}
        for movie, users in item_users.items():
            # Movies rated by many users contribute less to the similarity.
            weight = 1 / math.log(1 + len(users))
            for u in users:
                row = self.w_sim_matrix.setdefault(u, defaultdict(int))
                for v in users:
                    if u == v:
                        continue
                    row[v] += weight

        # Normalise the accumulated weights by the users' rating counts.
        for u, row in self.w_sim_matrix.items():
            n_u = len(self.Train_set[u])
            for v, count in row.items():
                row[v] = count / math.sqrt(n_u * len(self.Train_set[v]))
        self._log('calculate user similarity matrix END ')

    def Recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for *user*, best first.

        Scores are accumulated from the ``k_sim_user`` most similar users as
        ``similarity * rating`` over movies that *user* has not rated in the
        training set. A user without training data or without similar users
        gets an empty list. The full score table of this call is also stored
        in ``self.rec_movie``.
        """
        watched_movies = self.Train_set.get(user, {})
        neighbours = sorted(self.w_sim_matrix.get(user, {}).items(),
                            key=itemgetter(1), reverse=True)[:self.k_sim_user]
        # Scores are local to this call; sharing them across calls would leak
        # one user's recommendations into the next user's ranking.
        scores = {}
        for v, wuv in neighbours:
            for movie, rating in self.Train_set[v].items():
                if movie in watched_movies:
                    continue
                scores[movie] = scores.get(movie, 0) + wuv * rating
        self.rec_movie = scores
        return sorted(scores.items(), key=itemgetter(1), reverse=True)[:self.n_rec_movie]

    def evaluate(self):
        """Evaluate the recommender on every training user and print the metrics.

        Returns:
            A tuple ``(recall, precision, coverage, popularity)``. A metric
            whose denominator is zero is reported as 0.0.
        """
        num = 0  # recommended movies that also appear in the user's test set
        recall_den = 0  # denominator of recall: total test-set entries
        precision_den = 0  # denominator of precision: n_rec_movie per user
        recommend_movies = set()  # distinct movies recommended to anyone
        popularity_num = 0.0  # sum of log(1 + popularity) over recommendations
        for i, user in enumerate(self.Train_set):
            if i % 1000 == 0:
                print('i = %d' % i)
            test = self.Test_set.get(user, {})
            rank = self.Recommend(user)
            recall_den += len(test)
            precision_den += self.n_rec_movie
            for movie, _ in rank:
                # `movie` is already the movie id; indexing it would take
                # only the first character of the id string.
                recommend_movies.add(movie)
                popularity_num += math.log(1 + self.popular_movie[movie])
                if movie in test:
                    num += 1
        self._log('num = %d, recall den = %d, precision den = %d'
                  % (num, recall_den, precision_den))
        self._log('recommend movies = %d' % len(recommend_movies))
        self._log('count movie = %d' % self.count_movie)
        recall = num / (1.0 * recall_den) if recall_den else 0.0
        precision = num / (1.0 * precision_den) if precision_den else 0.0
        coverage = (len(recommend_movies) / (1.0 * self.count_movie)
                    if self.count_movie else 0.0)
        popularity = popularity_num / (1.0 * precision_den) if precision_den else 0.0
        print('recall = %.6f, precision = %.6f, coverage = %.6f, popularity = %.6f'
              % (recall, precision, coverage, popularity))
        return recall, precision, coverage, popularity
if __name__ == '__main__':
    # Script entry point: split the MovieLens ratings, build the user
    # similarity matrix, then report the evaluation metrics.
    recommender = UserCF()
    recommender.SplitData(os.path.join('ml-1m', 'ratings.dat'))
    recommender.UserSimilarity()
    recommender.evaluate()