使用Python的Pandas库实现基于用户的协同过滤推荐算法

本文在下文的代码基础上修改而来：

【笔记3】用pandas实现矩阵数据格式的推荐算法 (基于用户的协同)

Python 3.5.5
Pandas 0.22.0
import pandas as pd

df = None

def dataSet2Matrix(filename):
    """Load a ratings CSV and pivot it into the global user-item matrix ``df``.

    :param filename: path to the ratings data file
    :return: the user-item matrix, which is also stored in the module-level
        ``df``; rows are ``userId``, columns are ``movieId``, cells are
        ``rating``, and user/movie pairs without a rating are NaN
    """
    global df
    table_name = ['userId', 'movieId', 'rating', 'timestamp']
    # Read the comma-separated file.
    # NOTE(review): header=0 assumes the first line of the file is a header
    # row (it is replaced by table_name) -- confirm against the real data file.
    ratings = pd.read_csv(filename, sep=',', header=0, names=table_name)
    # Convert the long-format ratings into a User-Item matrix.
    df = ratings.pivot(index='userId', columns='movieId', values='rating')
    return df

# Load 'test.csv' into the module-level user-item matrix `df`.
dataSet2Matrix('test.csv')
# Echo the matrix (a bare expression only displays in an interactive session).
df
userId/movieId 1 2 3 4 5 6 7 8
1 3.5 2.0 NaN 4.5 5.0 1.5 2.5 2.0
2 2.0 3.5 4.0 NaN 2.0 3.5 NaN 3.0
3 5.0 1.0 1.0 3.0 5.0 1.0 NaN NaN
4 3.0 4.0 4.5 NaN 3.0 4.5 4.0 2.0
5 NaN 4.0 1.0 4.0 NaN NaN 4.0 1.0
6 NaN 4.5 4.0 5.0 5.0 4.5 4.0 4.0
7 5.0 2.0 NaN 3.0 5.0 4.0 5.0 NaN
8 3.0 NaN NaN 5.0 4.0 2.5 3.0 4.0

# Build the rating vectors over the items both users have rated.
def build_xy(user_id1, user_id2):
    """Return both users' ratings restricted to the items they both rated.

    Reads the module-level user-item matrix ``df``.

    :param user_id1: row label of the first user in ``df``
    :param user_id2: row label of the second user in ``df``
    :return: tuple ``(x, y)`` of Series sharing the same item index, holding
        the ratings of ``user_id1`` and ``user_id2`` respectively
    """
    row_a = df.loc[user_id1]
    row_b = df.loc[user_id2]
    # An item is kept only when neither user's rating is missing.
    rated_by_both = row_a.notnull() & row_b.notnull()
    return row_a.loc[rated_by_both], row_b.loc[rated_by_both]

# Show the co-rated rating vectors of users 1 and 2.
print(build_xy(1,2))
(movieId
1    3.5
2    2.0
5    5.0
6    1.5
8    2.0
Name: 1, dtype: float64, movieId
1    2.0
2    3.5
5    2.0
6    3.5
8    3.0
Name: 2, dtype: float64)

# Euclidean distance
def euclidean(user_id1, user_id2):
    """Return the Euclidean distance between two users' co-rated ratings.

    :param user_id1: row label of the first user
    :param user_id2: row label of the second user
    :return: square root of the summed squared rating differences over the
        items both users rated; ``inf`` when they share no rated item
    """
    x, y = build_xy(user_id1, user_id2)
    # With no overlap the sum below would be 0, which would rank two users
    # with nothing in common as the closest possible neighbours.
    if x.empty:
        return float('inf')
    # No division happens here, so no ZeroDivisionError handling is needed.
    return sum((x - y) ** 2) ** 0.5

# Cosine similarity
def cosine(user_id1, user_id2):
    """Return the cosine similarity of two users' co-rated ratings.

    :param user_id1: row label of the first user
    :param user_id2: row label of the second user
    :return: dot product divided by the product of the vector norms, or 0
        when that division raises ZeroDivisionError
    """
    vec_a, vec_b = build_xy(user_id1, user_id2)
    dot_product = sum(vec_a * vec_b)
    # Product of the two vector lengths (the denominator).
    norm_product = (sum(vec_a * vec_a) * sum(vec_b * vec_b)) ** 0.5
    try:
        return dot_product / norm_product
    except ZeroDivisionError:
        return 0

# Pearson correlation coefficient
def pearson(user_id1, user_id2):
    """Return the Pearson correlation of two users' co-rated ratings.

    :param user_id1: row label of the first user
    :param user_id2: row label of the second user
    :return: summed product of the mean-centred ratings divided by the square
        root of the product of their summed squares, or 0 when that division
        raises ZeroDivisionError
    """
    ratings_a, ratings_b = build_xy(user_id1, user_id2)
    # Centre each vector on its own mean over the co-rated items.
    centred_a = ratings_a - ratings_a.mean()
    centred_b = ratings_b - ratings_b.mean()
    numerator = sum(centred_a * centred_b)
    # Denominator
    scale = (sum(centred_a ** 2) * sum(centred_b ** 2)) ** 0.5
    try:
        return numerator / scale
    except ZeroDivisionError:
        return 0

# Show the Pearson correlation between users 1 and 2.
print(pearson(1,2))
-0.9040534990682686
# Lookup table from metric name to the function implementing it.
metric_funcs = dict(
    euclidean=euclidean,
    pearson=pearson,
    cosine=cosine,
)

# Compute the nearest neighbours
def computeNearestNeighbor(user_id, metric='pearson', k=3):
    """Return the ``k`` users closest to ``user_id`` under ``metric``.

    :param user_id: row label of the target user in the global ``df``
    :param metric: name of the metric function; 'manhattan' and 'euclidean'
        are treated as distances (smallest first), 'pearson' and 'cosine' as
        similarities (largest first)
    :param k: number of neighbours to return
    :return: pd.Series whose index is the neighbour ids and whose values are
        the metric values
    :raises ValueError: if ``metric`` is not one of the four names above
    :raises KeyError: if ``metric`` is accepted here but has no entry in
        ``metric_funcs``
    """
    if metric in ('manhattan', 'euclidean'):
        smaller_is_closer = True
    elif metric in ('pearson', 'cosine'):
        smaller_is_closer = False
    else:
        # Previously an unknown metric fell through and returned None.
        raise ValueError('unsupported metric: {!r}'.format(metric))
    # NOTE(review): 'manhattan' is accepted above, but the metric_funcs table
    # visible in this file has no such entry, so this lookup raises KeyError.
    metric_func = metric_funcs[metric]
    # Score every other user against user_id; the target row is dropped first.
    scores = df.drop(user_id).index.to_series().apply(metric_func, args=(user_id,))
    if smaller_is_closer:
        return scores.nsmallest(k)
    return scores.nlargest(k)

# Show the nearest neighbours of user 3 using the default metric and k.
print(computeNearestNeighbor(3))
userId
1    0.819782
6    0.801784
7    0.766965
Name: userId, dtype: float64
# Recommend items to the given user (returns a pd.Series)
def recommend(user_id):
    """Recommend the nearest neighbour's items that ``user_id`` has not rated.

    The nearest neighbour is the first entry returned by
    ``computeNearestNeighbor`` with the cosine metric; its id is printed.

    :param user_id: row label of the target user in the global ``df``
    :return: pd.Series indexed by item id holding the neighbour's ratings for
        items the target user has not rated, sorted in ascending order
    """
    # Find the id of the closest user.
    neighbours = computeNearestNeighbor(user_id, metric='cosine')
    nearest_user_id = neighbours.index[0]
    print('最近邻用户id：', nearest_user_id)
    own_ratings = df.loc[user_id]
    neighbour_ratings = df.loc[nearest_user_id]
    # Items the neighbour has rated but the target user has not.
    unseen_by_user = own_ratings.isnull() & neighbour_ratings.notnull()
    return neighbour_ratings.loc[unseen_by_user].sort_values()

# Produce recommendations for user 3.
recommend(3)
最近邻用户id： 1

movieId
8    2.0
7    2.5
Name: 1, dtype: float64