1.简介
基于用户的协同过滤核心思想为:基于用户对物品的偏好,找到兴趣相近的邻居用户,然后将邻居用户喜欢的商品推荐给当前用户。
主要两个步骤:
- 找到和目标用户兴趣相似的用户集合
- 找到这个集合中的用户喜欢的,且目标用户没有听说过的物品推荐给目标用户。
2.举例
假如我们有一个这样的矩阵:
$$
\left[ \begin{matrix} & movie1 & movie2 & movie3 & movie4 & movie5 & movie6\\ user1 & 5 & 0 & 0 & 5 & 0 & 2\\ user2 & 4 & 0 & 2 & 4 & 0 & 0\\ user3 & 3 & 3 & 4 & 0 & 3 & 2\\ \end{matrix} \right]
$$
纵轴为用户,横轴为电影,数值表示为用户对电影的评分,0表示为没看过。
首先我们通过余弦相似性计算user1与user2、user3之间相似性:
$$
\left[ \begin{matrix} & user2 & user3\\ user1 & 0.9 & 0.2 \end{matrix} \right]
$$
我们用余弦相似性主要是因为,不同用户对电影的打分力度不同,有的严一点平均打分低,有的松一点平均打分都很高,用余弦相似性可以排除打分程度的干扰。
对user1没看过的电影movie2、movie3、movie5分别打分,每部电影的得分为各相似用户的相似系数乘以该用户对这部电影的评分,再求和。
即 movie2 = 0.9×0 + 0.2×3 = 0.6,movie3 = 0.9×2 + 0.2×4 = 2.6,movie5 = 0.9×0 + 0.2×3 = 0.6,所以推荐顺序为movie3、movie2、movie5(movie2与movie5得分相同,并列)。
3.代码
#!/usr/bin/Python
# -*- coding: utf-8 -*-
import pandas as pd
import math
class UserCF:
    """User-based collaborative filtering over a ratings table.

    ``data`` is indexed by the columns ``userId``, ``movieId`` and ``rating``
    with pandas-style boolean masks, so it is expected to be a pandas
    DataFrame containing at least those three columns.
    """

    def __init__(self, data):
        self.data = data

    def _cosine_sim(self, target_movies, other_movies):
        """Return the cosine similarity between two users' ratings.

        :param target_movies: frame with ``movieId`` and ``rating`` columns
            holding the target user's ratings
        :param other_movies: frame with the same columns for the other user
        :return: dot product over the movies both users rated, divided by the
            product of the norms of each user's full rating vector; ``0.0``
            when the users share no movie or when either norm is zero
        """
        target_ratings = target_movies.set_index("movieId").to_dict()['rating']
        other_ratings = other_movies.set_index("movieId").to_dict()['rating']
        # Only movies rated by both users contribute to the dot product.
        shared_movies = set(target_ratings) & set(other_ratings)
        if not shared_movies:
            return 0.0
        dot_product = sum(
            target_ratings[movie] * other_ratings[movie]
            for movie in shared_movies
        )
        target_sq_norm = float((target_movies['rating'].values ** 2).sum())
        other_sq_norm = float((other_movies['rating'].values ** 2).sum())
        norm_product = math.sqrt(target_sq_norm * other_sq_norm)
        # An all-zero rating vector has no direction; report "no similarity"
        # instead of dividing by zero.
        if norm_product == 0:
            return 0.0
        return dot_product / norm_product

    def _get_top_n_users(self, target_user_id, user_n):
        """Rank every other user by similarity to the target user.

        :param target_user_id: id of the user to compare against
        :param user_n: maximum number of neighbours to return
        :return: list of ``(user_id, similarity)`` tuples sorted by
            similarity in descending order, at most ``user_n`` long
        """
        target_movies = self.data[self.data['userId'] == target_user_id][['movieId', 'rating']]
        other_user_ids = set(self.data['userId'].unique()) - {target_user_id}
        similarities = []
        for user_id in other_user_ids:
            # Rows (movieId, rating) rated by this particular user.
            user_movies = self.data[self.data['userId'] == user_id][['movieId', 'rating']]
            similarities.append((user_id, self._cosine_sim(target_movies, user_movies)))
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities[:user_n]

    def _get_candidates_items(self, target_user_id):
        """Return the set of movie ids the target user has not rated.

        :param target_user_id: id of the user to recommend for
        :return: all movie ids present in the data minus the ones that
            appear in the target user's rows
        """
        seen_movies = set(self.data[self.data['userId'] == target_user_id]['movieId'])
        return set(self.data['movieId'].unique()) - seen_movies

    def _get_top_m_items(self, top_n_users, candidates_movies, item_n):
        """Score the candidate movies and return the best ``item_n``.

        A movie's interest score is the sum, over the neighbours, of
        ``similarity * rating / 5``; a neighbour who has not rated the movie
        contributes zero.

        :param top_n_users: list of ``(user_id, similarity)`` tuples
        :param candidates_movies: iterable of candidate movie ids
        :param item_n: maximum number of movies to return
        :return: list of ``(movie_id, interest)`` tuples sorted by interest
            in descending order, at most ``item_n`` long
        """
        # Build one movie -> rating lookup per neighbour up front so that
        # scoring a candidate is a dict lookup instead of a scan of the
        # neighbour's rows.
        neighbours = []
        for user_id, similarity in top_n_users:
            user_rows = self.data[self.data['userId'] == user_id]
            ratings = {}
            for movie_id, rating in zip(user_rows['movieId'].values,
                                        user_rows['rating'].values):
                # Keep the first rating if a (user, movie) pair repeats.
                ratings.setdefault(movie_id, rating)
            neighbours.append((similarity, ratings))

        interest_list = []
        for movie_id in candidates_movies:
            # Dividing by 5 rescales the rating; presumably 5 is the top of
            # the rating scale -- confirm against the data set in use.
            interest = sum(
                similarity * (ratings.get(movie_id, 0) / 5)
                for similarity, ratings in neighbours
            )
            interest_list.append((movie_id, interest))
        interest_list.sort(key=lambda pair: pair[1], reverse=True)
        return interest_list[:item_n]

    def calculate(self, target_user_id=1, user_n=20, item_n=10):
        """Recommend movies to one user with user-based CF.

        :param target_user_id: id of the user to recommend for
        :param user_n: number of most similar users to take into account
        :param item_n: number of movies to recommend
        :return: list of ``(movie_id, interest)`` tuples, best first
        """
        # The user_n most similar users.
        top_n_users = self._get_top_n_users(target_user_id, user_n)
        # Movies the target user has not rated yet.
        candidates_movies = self._get_candidates_items(target_user_id)
        # The item_n candidates with the highest interest score.
        return self._get_top_m_items(top_n_users, candidates_movies, item_n)
if __name__ == "__main__":
    # Load the ratings CSV and print the recommendations produced with the
    # default arguments of UserCF.calculate().
    ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
    print(UserCF(data=ratings).calculate())
"""
给用户1的推荐为:
[(589, 5.237113631352064),
(1200, 5.090210153129966),
(2762, 4.986030458779002),
(858, 4.862967521061869),
(1387, 4.824667207267076),
(1968, 4.5848061705738),
(1036, 4.550872910458577),
(1610, 4.505195253790564),
(2918, 4.162146002324606),
(2804, 4.111230215054932)]
"""
4.评测
我们随机抽取20个user进行测评
def calculate_total(self, calcu_user_n=20, user_n=20, item_n=10, seed=1):
    """Print precision, recall, coverage and novelty for sampled test users.

    Users are drawn at random (with replacement) from the test set; each
    drawn user gets a recommendation list which is compared against the
    movies that user has in the test set.

    :param calcu_user_n: number of users to draw for the evaluation
    :param user_n: forwarded to ``self._set_top``
    :param item_n: forwarded to ``self._set_top``
    :param seed: forwarded to ``self._split_data``
    :return: None; the four metrics are printed
    """

    def ratio(numerator, denominator):
        # Report 0.0 instead of crashing when nothing was counted.
        return numerator / denominator if denominator else 0.0

    # NOTE(review): presumably _split_data fills self.train / self.test and
    # _set_top stores the neighbourhood and list sizes -- both helpers live
    # outside this block; confirm there.
    self._split_data(seed=seed)
    self._set_top(user_n=user_n, item_n=item_n)
    test_user_list = list(set(self.test['userId'].unique()))
    # random.choice picks every user with equal probability. Indexing with
    # ``random.randint(0, len(...)) - 1`` would yield -1..len-1 and thereby
    # select the last user twice as often as any other.
    user_list = [random.choice(test_user_list) for _ in range(calcu_user_n)]
    hit = 0  # recommended movies that also appear in the user's test set
    all_recom = 0  # total number of recommendations, precision denominator
    like_item = 0  # total number of test-set movies, recall denominator
    all_recom_set = set()
    all_item = set(self.train['movieId'].unique())
    item_popular = Counter(self.train['movieId'].values)
    ret = 0
    n = 0
    print('\n计算所有测评指标中...')
    for user in tqdm(user_list):
        recom_data = self._get_recommend(user)
        recom_item = {row[0] for row in recom_data}
        user_item = set(
            self.test[self.test['userId'] == user]['movieId'].values)
        hit += len(recom_item & user_item)
        like_item += len(user_item)
        all_recom += len(recom_item)
        all_recom_set.update(recom_item)
        for rec in recom_item:
            # Counter indexing yields 0 for a movie missing from the
            # training set, whereas .get() would return None and break
            # the addition.
            ret += math.log(1 + item_popular[rec])
            n += 1
    print('\n计算完毕。')
    print('精确率为:', ratio(hit, all_recom))
    print('召回率为:', ratio(hit, like_item))
    print('覆盖率为:', ratio(len(all_recom_set), len(all_item)))
    print('新颖度为:', ratio(ret, n))
"""
精确率为: 0.3025
召回率为: 0.12173038229376258
覆盖率为: 0.0161631925091963
新颖度为: 4.838176063699056
"""
详细代码见我的github:https://github.com/cz95/movie_recommend