包含全部示例的代码仓库见 GitHub
1 导入库
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
2 数据准备
# Toy user-item rating matrix: one row per user (A-D), one column per item.
# np.nan marks an item the user has not rated.
data = pd.DataFrame(
    [
        [4, np.nan, 5, 3, 5, np.nan, np.nan],            # user A
        [np.nan, 4, np.nan, 4, np.nan, 5, np.nan],       # user B
        [2, np.nan, 2, np.nan, 1, np.nan, np.nan],       # user C
        [np.nan, 5, np.nan, 3, np.nan, 5, 4],            # user D
    ],
    index=list('ABCD'),
    columns=['one', 'two', 'three', 'four', 'five', 'six', 'seven'],
)
data
# output
one two three four five six seven
A 4.0 NaN 5.0 3.0 5.0 NaN NaN
B NaN 4.0 NaN 4.0 NaN 5.0 NaN
C 2.0 NaN 2.0 NaN 1.0 NaN NaN
D NaN 5.0 NaN 3.0 NaN 5.0 4.0
3 找到和A趣味相投的人
3.1 余弦相似度
# Newer pandas versions have no Series.reshape, so each user is selected as a
# one-row frame and .values yields the 2-D (1, n_items) array directly.
# Unrated items (NaN) are treated as 0 for the cosine similarity.
sim_AB = cosine_similarity(
    data.loc[['A']].fillna(0).values,
    data.loc[['B']].fillna(0).values,
)
sim_AB
# output
array([[0.18353259]])
# Cosine similarity between users A and C on the raw ratings, with unrated
# items (NaN) replaced by 0; each user is passed as a one-row 2-D array.
sim_AC = cosine_similarity(
    data.loc[['A']].fillna(0).values,
    data.loc[['C']].fillna(0).values,
)
sim_AC
# output
array([[0.88527041]])
3.2 去中心化
可能有些用户对所有物品的评分都偏高(或偏低),因此先将每个用户(每一行)的评分减去该用户自己的平均分,再计算相似度
# Centre every user's ratings on that user's own mean (computed per row);
# unrated items stay NaN after the subtraction.
data_center = data.sub(data.mean(axis=1), axis=0)
# Cosine similarity between users A and B on the mean-centred ratings;
# NaN (unrated) entries are filled with 0 before the comparison.
sim_AB = cosine_similarity(
    data_center.loc[['A']].fillna(0).values,
    data_center.loc[['B']].fillna(0).values,
)
sim_AB
# output
array([[0.30772873]])
# Cosine similarity between users A and C on the mean-centred ratings;
# NaN (unrated) entries are filled with 0 before the comparison.
sim_AC = cosine_similarity(
    data_center.loc[['A']].fillna(0).values,
    data_center.loc[['C']].fillna(0).values,
)
sim_AC
# output
array([[-0.24618298]])
# Cosine similarity between users A and D on the mean-centred ratings;
# NaN (unrated) entries are filled with 0 before the comparison.
sim_AD = cosine_similarity(
    data_center.loc[['A']].fillna(0).values,
    data_center.loc[['D']].fillna(0).values,
)
sim_AD
# output
array([[0.56818182]])
4 预测A对two的评分
# Predicted rating of user A for item 'two': the ratings given by B and D,
# averaged with their similarities to A (sim_AB, sim_AD) as the weights.
(
    sim_AB * data.at['B', 'two'] + sim_AD * data.at['D', 'two']
) / (sim_AB + sim_AD)
# output
array([[4.64867562]])