用户物品相似度计算
import pandas as pd

# Toy user purchase records: one row per user, one column per item.
# The later cells treat a non-zero entry as "this user interacted with this item".
users = ["User1", "User2", "User3", "User4", "User5"]
items = ["Item A", "Item B", "Item C", "Item D", "Item E"]
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]

# Label rows with the user names and columns with the item names so that
# the following cells can index the matrix by name.
df = pd.DataFrame(data=datasets, index=users, columns=items)
df
| | Item A | Item B | Item C | Item D | Item E |
---|---|---|---|---|---|
User1 | 1 | 0 | 1 | 1 | 0 |
User2 | 1 | 0 | 0 | 1 | 1 |
User3 | 1 | 0 | 1 | 0 | 0 |
User4 | 0 | 1 | 0 | 1 | 1 |
User5 | 1 | 1 | 1 | 0 | 1 |
from sklearn.metrics.pairwise import pairwise_distances

# User-user similarity: Jaccard similarity = 1 - Jaccard distance, computed
# between the rows of df (one purchase vector per user).
# The Jaccard metric works on boolean vectors, so cast explicitly; passing the
# raw 0/1 integers makes scikit-learn do the conversion itself and emit a
# DataConversionWarning. The resulting similarities are unchanged.
user_similar = 1 - pairwise_distances(df.values.astype(bool), metric='jaccard')
# Re-attach the user labels on both axes so the matrix can be indexed by name.
user_similar = pd.DataFrame(user_similar, columns=users, index=users)
user_similar
D:\Apps\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py:1735: DataConversionWarning: Data was converted to boolean for metric jaccard
warnings.warn(msg, DataConversionWarning)
| | User1 | User2 | User3 | User4 | User5 |
---|---|---|---|---|---|
User1 | 1.000000 | 0.50 | 0.666667 | 0.2 | 0.4 |
User2 | 0.500000 | 1.00 | 0.250000 | 0.5 | 0.4 |
User3 | 0.666667 | 0.25 | 1.000000 | 0.0 | 0.5 |
User4 | 0.200000 | 0.50 | 0.000000 | 1.0 | 0.4 |
User5 | 0.400000 | 0.40 | 0.500000 | 0.4 | 1.0 |
# Item-item similarity: transpose df so that each row is one item's vector
# over the users, then take Jaccard similarity = 1 - Jaccard distance.
# The Jaccard metric works on boolean vectors, so cast explicitly; passing the
# raw 0/1 integers makes scikit-learn do the conversion itself and emit a
# DataConversionWarning. The resulting similarities are unchanged.
item_similar = 1 - pairwise_distances(df.T.values.astype(bool), metric='jaccard')
# Re-attach the item labels on both axes so the matrix can be indexed by name.
item_similar = pd.DataFrame(item_similar, columns=items, index=items)
item_similar
D:\Apps\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py:1735:DataConversionWarning: Data was converted to boolean for metric jaccard
warnings.warn(msg, DataConversionWarning)
| | Item A | Item B | Item C | Item D | Item E |
---|---|---|---|---|---|
Item A | 1.00 | 0.200000 | 0.75 | 0.40 | 0.400000 |
Item B | 0.20 | 1.000000 | 0.25 | 0.25 | 0.666667 |
Item C | 0.75 | 0.250000 | 1.00 | 0.20 | 0.200000 |
Item D | 0.40 | 0.250000 | 0.20 | 1.00 | 0.500000 |
Item E | 0.40 | 0.666667 | 0.20 | 0.50 | 1.000000 |
基于用户的协同过滤 UserCF
# For every user, find the 2 most similar other users.
topN_users = {}
for i in user_similar.index:
    # Take this user's row of similarities, drop the entry for the user
    # themself, then rank the remaining users from most to least similar.
    _df = user_similar.loc[i].drop([i])
    _df_sorted = _df.sort_values(ascending=False)
    # Keep only the labels of the two highest-ranked users.
    top2 = list(_df_sorted.index[:2])
    topN_users[i] = top2
topN_users
{'User1': ['User3', 'User2'],
'User2': ['User4', 'User1'],
'User3': ['User1', 'User5'],
'User4': ['User2', 'User5'],
'User5': ['User3', 'User4']}
# Inspect the (user, similar users) pairs stored in topN_users.
topN_users.items()
dict_items([('User1', ['User3', 'User2']), ('User2', ['User4', 'User1']), ('User3', ['User1', 'User5']), ('User4', ['User2', 'User5']), ('User5', ['User3', 'User4'])])
# Inspect the index of a single user's row: its labels are df's column (item) names.
df.loc['User1'].index
Index(['Item A', 'Item B', 'Item C', 'Item D', 'Item E'], dtype='object')
import numpy as np


def _interacted_items(user):
    """Return the set of item labels with a non-zero, non-missing entry in df's row for *user*."""
    # Zeros are turned into NaN so that dropna() leaves only the interacted items.
    return set(df.loc[user].replace(0, np.nan).dropna().index)


# Build the recommendation result from each user's top-N similar users.
re_results = {}
for user, sim_users in topN_users.items():
    re_result = set()  # items that the current user's similar users interacted with
    for sim_user in sim_users:
        re_result |= _interacted_items(sim_user)
    # Filter out the items the user has already interacted with.
    re_result -= _interacted_items(user)
    re_results[user] = re_result
re_results
{'User1': {'Item E'},
'User2': {'Item B', 'Item C'},
'User3': {'Item B', 'Item D', 'Item E'},
'User4': {'Item A', 'Item C'},
'User5': {'Item D'}}