本文实现了论文《一种基于标签的电影组推荐方法》(熊聪聪、林颖、史艳翠、冯阔,天津科技大学人工智能学院,天津 300457)中的部分算法。
基于标签的用户相似度
代码实现:
# -*- coding: utf-8 -*-
# 基于用户的协同过滤推荐算法实现
import random
import math
from operator import itemgetter
import pickle
class UserBasedCF():
    """User-based collaborative filtering with a tag (label) aware similarity.

    The final user-user similarity is a blend of
    ``(1 - alpha) * co-rating cosine similarity`` and
    ``alpha * Pearson correlation of weighted label preferences``.

    Typical call order: ``get_dataset`` -> ``get_label`` -> ``calc_user_sim``
    -> ``evaluate`` (or ``recommend`` for a single user).
    """

    # Length of every per-movie label vector (the original code hard-coded 19).
    N_LABELS = 19

    def __init__(self, n_sim_user=20, n_rec_movie=10, alpha=0.3):
        """Set up empty data sets and the tunable parameters.

        Args:
            n_sim_user: number of most similar users consulted per target user.
            n_rec_movie: number of movies recommended per target user.
            alpha: weight of the label-based similarity in the final blend.
        """
        self.n_sim_user = n_sim_user
        self.n_rec_movie = n_rec_movie
        self.alpha = alpha
        # user -> {movie: rating}; filled by get_dataset().
        self.trainSet = {}
        self.testSet = {}
        # user -> {other_user: similarity}; filled by calc_user_sim().
        self.user_sim_matrix = {}
        self.movie_count = 0
        # movie id (int) -> list of N_LABELS label values; filled by get_label().
        self.movie_label = {}
        self._reset_label_stats()
        print('Similar user number = %d' % self.n_sim_user)
        print('Recommneded movie number = %d' % self.n_rec_movie)

    def _reset_label_stats(self):
        """Zero every statistic that calc_user_sim() derives from trainSet."""
        n_labels = self.N_LABELS
        self.f_ut = {}                 # user -> per-label sum over the user's movies
        self.m_tj = [0] * n_labels     # number of distinct movies carrying label j
        self.m_t = 0                   # number of distinct movies in the training set
        self.t = 0                     # total label occurrences over distinct movies
        self.user_n = 0                # number of users in the training set
        self.n_tj = [0] * n_labels     # number of users who watched label j

    def get_dataset(self, filename, pivot=0.75):
        """Read ratings and randomly split them into trainSet / testSet.

        Each line must hold four tab-separated fields
        ``user, movie, rating, timestamp``. A record goes to the training set
        with probability ``pivot``. Ratings are stored as the raw strings
        read from the file.
        """
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            if not line:
                # A blank line would make the 4-way unpack below fail.
                continue
            user, movie, rating, _timestamp = line.split('\t')
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)

    def load_file(self, filename):
        """Yield the lines of ``filename`` without their trailing newline.

        The first line is always skipped.
        NOTE(review): this presumes the file starts with a header row; a
        header-less file would silently lose its first record - verify
        against the data set actually used.
        """
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:
                    continue
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    def user_label(self, user, label):
        """Return the TF-IDF style preference of ``user`` for ``label``.

        Computed as ``f_ut[user][label] / sum(f_ut[user])`` times
        ``log(m_t / m_tj[label])``.

        Args:
            user: user id (key of ``self.f_ut``).
            label: label index in ``range(N_LABELS)``.

        Returns:
            The preference, or 0.0 when the user has no label counts or no
            movie carries the label (the ratio would be undefined).
        """
        counts = self.f_ut[user]
        total = sum(counts)
        if total == 0 or self.m_tj[label] == 0:
            return 0.0
        return (counts[label] / total) * math.log(self.m_t / self.m_tj[label])

    def importanceOfLabel(self, label):
        """Return the weight of ``label``: ``log((t / t_j) * (n / n_tj))``.

        ``t`` is the total number of label occurrences, ``t_j`` the
        occurrences of this label, ``n`` the number of users and ``n_tj``
        the number of users who watched this label. Returns 0.0 when the
        label never occurs, instead of dividing by zero.
        """
        t_j = self.m_tj[label]
        n_tj = self.n_tj[label]
        if t_j == 0 or n_tj == 0:
            return 0.0
        return math.log((self.t / t_j) * (self.user_n / n_tj))

    def user_label_w(self, p_ut, label):
        """Scale the preference ``p_ut`` by the importance of ``label``."""
        return p_ut * self.importanceOfLabel(label)

    def calc_user_sim_of_label(self, user1, user2, label_list):
        """Pearson correlation of two users' weighted label preferences.

        Args:
            user1, user2: user ids.
            label_list: label indices both users have watched.

        Returns:
            The correlation, or 0.0 when it is undefined: an empty
            ``label_list`` or zero variance for either user (for example a
            single shared label). The original code raised
            ZeroDivisionError in those cases.
        """
        if not label_list:
            return 0.0

        p_ui = {}
        p_uj = {}
        for label in label_list:
            p_ui[label] = self.user_label_w(self.user_label(user1, label), label)
            p_uj[label] = self.user_label_w(self.user_label(user2, label), label)

        p_ui_avg = sum(p_ui.values()) / len(label_list)
        p_uj_avg = sum(p_uj.values()) / len(label_list)

        numerator = 0
        user1_sq = 0
        user2_sq = 0
        for label in label_list:
            d1 = p_ui[label] - p_ui_avg
            d2 = p_uj[label] - p_uj_avg
            numerator += d1 * d2
            user1_sq += d1 ** 2
            user2_sq += d2 ** 2

        denominator = math.sqrt(user1_sq * user2_sq)
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def calc_user_sim(self):
        """Build ``self.user_sim_matrix`` from ``self.trainSet``.

        Steps:
          1. Collect label statistics and a movie -> users inverted index.
          2. Count co-rated movies for every ordered user pair.
          3. Turn each count into a cosine similarity and blend it with the
             label-based similarity using ``self.alpha``.

        Requires ``self.movie_label`` to map ``int(movie)`` to a sequence of
        ``N_LABELS`` numbers (presumably 0/1 genre flags - confirm against
        the pickle contents).
        """
        n_labels = self.N_LABELS
        self._reset_label_stats()
        # Start clean so a second call does not add counts onto old floats.
        self.user_sim_matrix.clear()

        print('Building movie-user table ...')
        movie_user = {}
        for user, movies in self.trainSet.items():
            self.user_n += 1
            user_counts = self.f_ut.setdefault(user, [0] * n_labels)
            counted_for_user = [False] * n_labels
            for movie in movies:
                # m_tj and t are per-movie statistics, so a movie contributes
                # only the first time it is met. The original incremented
                # them once per rating, which made m_t / m_tj drop below 1
                # and the IDF term negative.
                first_time = movie not in movie_user
                for idx, num in enumerate(self.movie_label[int(movie)]):
                    if num != 0:
                        user_counts[idx] += num
                        if first_time:
                            self.m_tj[idx] += 1
                            self.t += 1
                        if not counted_for_user[idx]:
                            self.n_tj[idx] += 1
                            counted_for_user[idx] = True
                movie_user.setdefault(movie, set()).add(user)
        print('Build movie-user table success!')

        self.m_t = self.movie_count = len(movie_user)
        print('Total movie number = %d' % self.movie_count)

        print('Build user co-rated movies matrix ...')
        for users in movie_user.values():
            for u in users:
                for v in users:
                    if u == v:
                        continue
                    row = self.user_sim_matrix.setdefault(u, {})
                    row[v] = row.get(v, 0) + 1
        print('Build user co-rated movies matrix success!')

        print('Calculating user similarity matrix ...')
        alpha = self.alpha
        for u, related_users in self.user_sim_matrix.items():
            for v, count in related_users.items():
                cosine = count / math.sqrt(len(self.trainSet[u]) * len(self.trainSet[v]))
                # Labels that both users have watched at least once.
                label_list = [
                    idx for idx in range(n_labels)
                    if self.f_ut[u][idx] != 0 and self.f_ut[v][idx] != 0
                ]
                # Overwriting an existing key is safe while iterating items().
                related_users[v] = (
                    (1 - alpha) * cosine
                    + alpha * self.calc_user_sim_of_label(u, v, label_list)
                )
        print('Calculate user similarity matrix success!')

    def recommend(self, user):
        """Return up to ``n_rec_movie`` ``(movie, score)`` pairs for ``user``.

        The ``n_sim_user`` most similar users vote for every movie the
        target user has not watched; a vote is worth the similarity. A user
        without neighbours (or unknown to the training set) gets ``[]``
        instead of a KeyError.
        """
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet.get(user, {})
        neighbours = sorted(self.user_sim_matrix.get(user, {}).items(),
                            key=itemgetter(1), reverse=True)[0:K]
        for v, wuv in neighbours:
            for movie in self.trainSet[v]:
                if movie in watched_movies:
                    continue
                rank[movie] = rank.get(movie, 0) + wuv
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        """Recommend for every training user and score against testSet.

        Returns:
            ``(precision, recall, coverage)`` as fractions in [0, 1]. A
            metric whose denominator is zero is reported as 0.0 rather than
            raising ZeroDivisionError.
        """
        print("Evaluation start ...")
        N = self.n_rec_movie
        hit = 0
        rec_count = 0
        test_count = 0
        all_rec_movies = set()

        for user in self.trainSet:
            test_movies = self.testSet.get(user, {})
            rec_movies = self.recommend(user)
            print("rec_movies = ",rec_movies)
            for movie, w in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
        print('precisioin=%.2f%%\t recall=%.2f%%\t coverage=%.2f%%' % (precision*100, recall*100, coverage*100))
        return precision, recall, coverage

    def get_label(self):
        """Load ``self.movie_label`` from ``movie_label.pkl`` in the cwd."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load a movie_label.pkl that comes from a trusted source.
        with open('movie_label.pkl', 'rb') as fs:
            self.movie_label = pickle.load(fs)
if __name__ == '__main__':
    # Repeat the whole pipeline on several independent random splits and
    # report the mean precision / recall / coverage.
    rating_file = '../ItemCF/ml-100k/u.data'
    n_runs = 10
    totals = [0, 0, 0]
    for _ in range(n_runs):
        userCF = UserBasedCF()
        userCF.get_dataset(rating_file)
        userCF.get_label()
        userCF.calc_user_sim()
        # evaluate() returns (precision, recall, coverage) for this split.
        for position, metric in enumerate(userCF.evaluate()):
            totals[position] += metric
    precision, recall, coverage = totals
    print('precisioin=%.2f%%\t recall=%.2f%%\t coverage=%.2f%%' % (
        (precision / n_runs) * 100,
        (recall / n_runs) * 100,
        (coverage / n_runs) * 100,
    ))
结果:
使用数据集:https://share.weiyun.com/2UP7O67E