第1关:用相关系数计算直线之间的相似度
# Euclidean-distance-based similarity
def euclidean(p, q):
    """Return a similarity score in (0, 1] derived from the Euclidean distance.

    The distance is accumulated position by position over the indices that
    both inputs have; when the lengths differ, the trailing elements of the
    longer input are ignored. The distance ``d`` is then normalised to
    ``1 / (1 + d)``, so identical inputs (and two empty inputs) score 1.0 and
    the score approaches 0 as the inputs move apart.

    Args:
        p: Sequence of numbers.
        q: Sequence of numbers.

    Returns:
        The float ``1 / (1 + d)`` where ``d`` is the Euclidean distance over
        the common positions of ``p`` and ``q``.
    """
    # Pair elements by position. Counting how many values of p occur anywhere
    # in q is not a valid index bound: it can skip positions that differ and
    # can exceed len(q).
    squared_distance = sum((a - b) ** 2 for a, b in zip(p, q))
    return 1 / (1 + squared_distance ** 0.5)


print("欧几里得计算出的相似度为", euclidean([1, 2, 3, 4, 5], [2, 4, 6, 8, 10]))
# Cosine similarity
def cosine_similarity(x, y):
    """Return the cosine of the angle between the vectors x and y.

    Args:
        x: Sequence of numbers; its length drives the iteration.
        y: Indexable sequence of numbers, at least as long as ``x``.
            Elements of ``y`` beyond ``len(x)`` are ignored.

    Returns:
        ``dot(x, y) / (|x| * |y|)`` as a float, or 0.0 when either vector has
        zero magnitude (which includes empty input), because the angle is
        undefined in that case.

    Raises:
        IndexError: If ``y`` is shorter than ``x``.
    """
    dot = 0.0
    x_squared = 0.0
    y_squared = 0.0
    for i, x_i in enumerate(x):
        y_i = y[i]
        x_squared += x_i * x_i
        y_squared += y_i * y_i
        dot += x_i * y_i

    denominator = (x_squared ** 0.5) * (y_squared ** 0.5)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of
        # failing with ZeroDivisionError.
        return 0.0
    return dot / denominator


print('余弦相关系数计算出的相似度为', cosine_similarity([5, 3], [5, 8]))
# Pearson correlation coefficient between two users' activity vectors.
# (The original heading read "Poisson"; the code below calls scipy's pearsonr.)
import scipy
import numpy
from scipy.stats import pearsonr
#print()
########## Begin ##########
import pandas as pd
# Load the exercise data set from the platform-provided path.
# NOTE(review): df is not read by any later statement in this section; the
# x and y vectors below are hard-coded, presumably transcribed from two rows
# of this file. Confirm against the data.
df=pd.read_csv('/data/bigfiles/2017fb1c-45c5-4d02-bcf2-e825b34f8662')
#print(df.loc[[1,16],'2004/1/20':'4/20/20'].values.tolist()) # original note: selects the users in rows 0, 1 and 16 for the dates 2004/1/20 to 2004/20/20
#print(df.head())
# First user's vector.
x =numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,1])
########## End ##########
# Second user's vector; pearsonr needs it to have the same length as x.
y =numpy.array([1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,3])
# pearsonr returns the correlation coefficient and its p-value; only the
# coefficient is reported.
r_row, p_value = pearsonr(x, y)
print ("用户(UID)84001033与用户(UID)84001003从2004/1/20到4/20/20这段日期的相似度为",r_row)
# print (p_value)
第2关:基于相似度度量的商品推荐
# Step 1: define the target user and the set of users with similar interests.
# Each dict maps a user id to the list of items that user likes.
# ########## Begin ##########
# Target user: A likes items a, b and d.
target_user = {
    'A': ['a', 'b', 'd'],
}
# Candidate similar users and the items each of them likes.
alike_user = {
    'B': ['a', 'c'],
    'C': ['b', 'e'],
    'D': ['c', 'd', 'e'],
}
print(f'目标用户:{target_user}')
print(f'相似用户:{alike_user}')
# ########## End ##########
# Inverted table (item -> users). The string below is only a layout sketch
# kept from the original notes; it has no runtime effect.
"""
a A B C
"""
# Collect every distinct item in first-seen order: the target user's items
# first, then the similar users' items.
key_value = []
value1 = target_user.values()
value2 = alike_user.values()
for item in (*value1, *value2):
    for good in item:
        if good in key_value:
            # Already recorded; keep only the first occurrence.
            continue
        key_value.append(good)
print(f'总共商品类型:{key_value}')
# Build the inverted table: one single-entry dict {item: [users who like it]}
# per item, in the order the items appear in key_value.
new_table = []
for good in key_value:
    user_list = []
    # Scan the target user first and the similar users second, so the target
    # user (when present) always leads the list for an item.
    for user_group in (target_user, alike_user):
        # Distinct loop names keep key_value bound to the item list instead
        # of rebinding it to a (user, items) pair while it is being iterated.
        for user, liked_goods in user_group.items():
            if good in liked_goods and user not in user_list:
                user_list.append(user)
    new_table.append({good: user_list})
print(new_table)
########## Begin ##########
# Build the user-user co-occurrence matrix that the cosine similarity is
# later derived from: cell (u, v) holds the number of items liked by both
# u and v.
import pandas as pd
import numpy as np
from itertools import combinations

df = pd.DataFrame(data=np.zeros((4,4)), columns=['A','B','C','D'],index=['A','B','C','D'])
print(df)
# Count the intersections from the inverted table.
for item in new_table:
    # Each entry is a single-key dict {item: [users]}; take the user list.
    label = list(item.values())[0]
    print(label)
    # Count every unordered pair of users sharing this item. Items liked by
    # a single user contribute nothing, and items liked by three or more
    # users contribute all of their pairs, not just the first one.
    for x, y in combinations(label, 2):
        df.loc[x, y] += 1
        df.loc[y, x] += 1
print(df)
########## End ##########
# Turn the co-occurrence counts into pairwise similarities.
# Per-user totals are taken from the raw counts before any cell is rescaled.
# NOTE(review): each total is the row sum of the co-occurrence matrix, not the
# number of items the user likes; the two coincide when every item is liked
# by exactly two users, but not in general. Confirm this is intended.
count_list = {user: df.loc[user, :].sum() for user in ['A', 'B', 'C', 'D']}
print(count_list)
# Cosine similarity: divide each cell by the square root of the product of
# the two users' totals.
########## Begin ##########
for row_user in ['A', 'B', 'C', 'D']:
    for col_user in ['A', 'B', 'C', 'D']:
        denominator = np.sqrt(count_list[row_user] * count_list[col_user])
        df.loc[row_user, col_user] = df.loc[row_user, col_user] / denominator
########## End ##########
print(df)
########## Begin ##########
# Predicted interest of user A in the items A has not chosen yet: sum the
# similarity between A and every user who likes the item.
# Item c is liked by B and D.
p_Ac = df.loc['A','B'] + df.loc['A','D']
print(f'p(A,c):{p_Ac}')
# Item e is liked by C and D.
p_Ae = df.loc['A','C'] + df.loc['A','D']
print(f'p(A,e):{p_Ae}')
########## End ##########
# Recommend whichever item has the higher predicted interest.
if p_Ac > p_Ae:
    # The recommended item must match the comparison: c wins here, so c is
    # the item to recommend.
    print("用户A对c商品更感兴趣,将c商品推荐给A")
elif p_Ac < p_Ae:
    print("用户A对e商品更感兴趣,将e商品推荐给A")
else:
    print("用户A对c商品和e商品同样感兴趣!")