# Dataset used: https://grouplens.org/datasets/movielens/
# Source code:
import random
import math
from operator import itemgetter
class UserCF(object):
    """User-based collaborative filtering over (user, item) interaction pairs.

    Constructing an instance reads the interaction file, splits the records
    into a training and a test set, and precomputes user-user similarities
    from the training set.

    Attributes:
        path: path of the input file.
        split_part: number of random buckets used for the train/test split.
        test_part: index of the bucket that becomes the test set.
        seed: seed used for the random split.
        data: list of ``(user, item)`` string tuples read from ``path``.
        train: dict mapping user -> set of training items.
        test: dict mapping user -> set of held-out items.
        user_similarity: dict mapping user -> ``{other_user: similarity}``.
            After ``__init__`` this instance attribute shadows the method of
            the same name.
        user_count: dict mapping user -> number of training items.
    """

    def __init__(self, input_path, split_part, test_part, seed):
        """Load the data, split it and build the similarity matrix.

        Args:
            input_path: path to a tab-separated file whose first two columns
                are the user id and the item id.
            split_part: number of buckets each record is randomly assigned to.
            test_part: bucket index whose records form the test set; records
                in every other bucket go to the training set.
            seed: seed for the random split.
        """
        self.path = input_path
        self.split_part = split_part
        self.test_part = test_part
        self.seed = seed
        self._get_data_()
        self.split_data()
        # NOTE: the result intentionally replaces the bound method on this
        # instance; `recommend` reads `self.user_similarity` as a dict. The
        # name is kept so existing callers of the attribute keep working.
        self.user_similarity, self.user_count = self.user_similarity()

    def _get_data_(self):
        """Read ``(user, item)`` pairs from ``self.path`` into ``self.data``.

        Only the first two tab-separated columns of each line are used.
        Blank lines are skipped.

        Raises:
            IndexError: if a non-blank line has fewer than two columns.
        """
        data = []
        with open(self.path, 'r') as f:
            # Iterate lazily instead of loading the whole file with readlines().
            for line in f:
                if not line.strip():
                    # A blank/trailing empty line used to raise IndexError.
                    continue
                # Drop the line terminator so a two-column line does not leave
                # a newline glued to the item id.
                fields = line.rstrip('\r\n').split('\t')
                data.append((fields[0], fields[1]))
        self.data = data

    def split_data(self):
        """Randomly split ``self.data`` into ``self.train`` and ``self.test``.

        Each record draws a bucket in ``[0, split_part - 1]``; records whose
        bucket equals ``test_part`` go to the test set, the rest to the
        training set. Both results map user -> set of items.
        """
        # NOTE: this seeds the module-level RNG (kept for backward
        # compatibility), so the split is reproducible for a given seed.
        random.seed(self.seed)
        train, test = {}, {}
        last_bucket = self.split_part - 1
        for user, item in self.data:
            target = test if random.randint(0, last_bucket) == self.test_part else train
            target.setdefault(user, set()).add(item)
        self.train, self.test = train, test

    def user_similarity(self):
        """Compute user-user similarities from the training set.

        Every item shared by two users contributes ``1 / log(1 + n)`` to
        their co-occurrence weight, where ``n`` is the number of users who
        have that item, so popular items count less. The accumulated weight
        is then divided by ``sqrt(count[u] * count[v])`` where ``count`` is
        the number of training items of each user.

        Returns:
            A tuple ``(similarity, user_count)``. ``similarity`` maps
            user -> ``{other_user: value}`` and only contains users that
            share at least one item with somebody else. ``user_count`` maps
            every training user to the number of their training items.
        """
        # 1. Inverted index: item -> set of users who have it.
        item_users = {}
        for user, items in self.train.items():
            for item in items:
                item_users.setdefault(item, set()).add(user)

        # 2. Count items per user and accumulate co-occurrence weights.
        user_count, user_sim = {}, {}
        for users in item_users.values():
            for user in users:
                user_count[user] = user_count.get(user, 0) + 1
            if len(users) < 2:
                # An item held by a single user produces no user pair.
                continue
            # Loop-invariant for this item, so compute it once.
            weight = 1 / math.log(1 + len(users))
            for user in users:
                row = user_sim.setdefault(user, {})
                for other_user in users:
                    if other_user != user:
                        row[other_user] = row.get(other_user, 0) + weight

        # 3. Normalise by the geometric mean of both users' item counts.
        res_sim = {}
        for user, neighbours in user_sim.items():
            own_count = user_count[user]
            res_sim[user] = {
                other_user: weight / math.sqrt(user_count[other_user] * own_count)
                for other_user, weight in neighbours.items()
            }
        return res_sim, user_count

    def recommend(self, user, k):
        """Score unseen items for ``user`` from the ``k`` most similar users.

        Args:
            user: user id to recommend for.
            k: number of most similar users to take items from.

        Returns:
            Dict mapping item -> summed similarity of the neighbours who have
            that item. Items already in the user's training set are excluded.
            The dict is empty when the user is unknown or has no similar
            users (previously this raised ``KeyError``).
        """
        seen_items = self.train.get(user, set())
        neighbours = self.user_similarity.get(user, {})
        sim_users = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:k]
        res_rank = {}
        for other_user, similarity in sim_users:
            for item in self.train[other_user]:
                if item not in seen_items:
                    res_rank[item] = res_rank.get(item, 0) + similarity
        return res_rank

    def recall(self, N, K):
        """Compute recall of top-``N`` recommendations on the test set.

        Args:
            N: number of highest-scored recommended items kept per user.
            K: number of similar users passed to ``recommend``.

        Returns:
            ``hits / total held-out items`` as a float over all users that
            appear in both the training and the test set, or ``0.0`` when
            there are no held-out items to evaluate.
        """
        hit = 0
        total = 0
        for user in self.train:
            if user not in self.test:
                continue
            held_out = self.test[user]
            rank = self.recommend(user, K)
            top_n = sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
            hit += sum(1 for item, _ in top_n if item in held_out)
            total += len(held_out)
        if total == 0:
            # Avoid ZeroDivisionError when nothing was held out.
            return 0.0
        return hit / float(total)
# Demo run against the ratings file referenced by the relative path below
# (presumably the MovieLens 100k `u.data` file - verify the location).
path = '../data/ml-100k/u.data'
user_cf = UserCF(
    input_path=path,
    split_part=10,
    test_part=1,
    seed=10,
)
# Scored candidate items for user '1', taken from the 3 most similar users.
print(user_cf.recommend(user='1', k=3))
# Recall of the top-100 recommendations using 80 similar users per user.
print(user_cf.recall(N=100, K=80))