#coding=utf-8
from math import sqrt
##--------------以下使用欧几里得计算用户之间的相似度--------------------##
def sim_distance(prefer, person1, person2):
    """Return a Euclidean-distance-based similarity score for two users.

    Args:
        prefer: Mapping of user -> {item: rating}.
        person1: Key of the first user in ``prefer``.
        person2: Key of the second user in ``prefer``.

    Returns:
        ``0`` when the two users share no rated item; otherwise
        ``1 / (1 + d)``, where ``d`` is the Euclidean distance between the
        two users' ratings over their shared items. Identical ratings give
        ``1.0`` and the score shrinks towards 0 as the ratings diverge.

    Raises:
        KeyError: If ``person1`` or ``person2`` is not a key of ``prefer``.
    """
    ratings1 = prefer[person1]
    ratings2 = prefer[person2]

    # Only items rated by both users take part in the comparison.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    squared_diff_sum = sum(
        (ratings1[item] - ratings2[item]) ** 2 for item in shared
    )
    # Adding 1 to the distance keeps the denominator non-zero and maps a
    # distance of 0 to the maximum score of 1.
    return 1 / (1 + sqrt(squared_diff_sum))
##-----------------------pearson相关度系数------------------------##
def sim_pearson(prefer, person1, person2):
    """Return the Pearson correlation coefficient of two users' ratings.

    The coefficient is computed over the items rated by both users only.

    Args:
        prefer: Mapping of user -> {item: rating}.
        person1: Key of the first user in ``prefer``.
        person2: Key of the second user in ``prefer``.

    Returns:
        ``-1`` when the users share no rated item; ``0`` when the
        correlation is undefined because at least one user's shared ratings
        have no variance; otherwise the Pearson coefficient (nominally in
        ``[-1, 1]``).

    Raises:
        KeyError: If ``person1`` or ``person2`` is not a key of ``prefer``.
    """
    ratings1 = prefer[person1]
    ratings2 = prefer[person2]

    # Only items rated by both users take part in the correlation.
    shared = [item for item in ratings1 if item in ratings2]
    n = len(shared)
    if n == 0:
        return -1

    xs = [ratings1[item] for item in shared]
    ys = [ratings2[item] for item in shared]

    total_x = sum(xs)
    total_y = sum(ys)
    sq_sum_x = sum(x ** 2 for x in xs)
    sq_sum_y = sum(y ** 2 for y in ys)
    product_sum = sum(x * y for x, y in zip(xs, ys))

    # Numerator: sum(xy) - sum(x) * sum(y) / n
    covariance = product_sum - (total_x * total_y / n)

    # Each factor is sum(v^2) - sum(v)^2 / n. Floating-point cancellation
    # can push a (mathematically non-negative) factor slightly below zero,
    # which would make sqrt() raise ValueError; treat that as "no variance".
    var_x = sq_sum_x - total_x ** 2 / n
    var_y = sq_sum_y - total_y ** 2 / n
    if var_x <= 0 or var_y <= 0:
        return 0

    denominator = sqrt(var_x * var_y)
    if denominator == 0:
        # The product of two tiny variances can underflow to zero.
        return 0
    return covariance / denominator
##-----------------获取用户相似度(从用户评价字典中返回Top-K匹配者,K,相似度函数 为可选参数)--------------------------##
def topMatches(prefer, person, n=1, similarity=sim_pearson):
    """Return the ``n`` users most similar to ``person``.

    Args:
        prefer: Mapping of user -> {item: rating}.
        person: Key of the user to find matches for.
        n: Number of leading entries to return (used as the slice end).
        similarity: Callable ``(prefer, person, other) -> score`` used to
            compare ``person`` with every other user.

    Returns:
        A list of ``(score, other_user)`` tuples sorted from highest to
        lowest; tuples with equal scores are ordered by user key,
        descending. At most the first ``n`` entries are returned.
    """
    scores = [
        (similarity(prefer, person, other), other)
        for other in prefer
        if other != person
    ]
    # Highest similarity first.
    scores.sort(reverse=True)
    return scores[:n]
##--------------------获取推荐(提供推荐,利用所有人评价的加权均值。相似度高,影响因子越大。)---------------------------##
def getRecommendations(prefer, person, similarity=sim_pearson):
    """Recommend items ``person`` has not rated, using weighted averages.

    Every other user whose similarity to ``person`` is positive contributes
    their ratings, weighted by that similarity, so more similar users have
    more influence on the predicted rating.

    Args:
        prefer: Mapping of user -> {item: rating}.
        person: Key of the user to produce recommendations for.
        similarity: Callable ``(prefer, person, other) -> score``.

    Returns:
        A list of ``(predicted_rating, item)`` tuples sorted from highest to
        lowest predicted rating. The predicted rating is the
        similarity-weighted mean of the other users' ratings, so it stays on
        the rating scale and is not normalised to the range 0-1.

    Raises:
        KeyError: If ``person`` is not a key of ``prefer``.
    """
    already_rated = prefer[person]
    totals = {}    # item -> sum of (similarity * rating)
    sim_sums = {}  # item -> sum of the similarities that contributed

    for other, other_ratings in prefer.items():
        if other == person:
            continue
        sim = similarity(prefer, person, other)
        # Users with zero or negative similarity are ignored.
        if sim <= 0:
            continue
        for item, rating in other_ratings.items():
            if item in already_rated:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            sim_sums[item] = sim_sums.get(item, 0) + sim

    # Dividing by the summed similarities turns each weighted total into a
    # weighted mean; every divisor is positive because only sim > 0 is added.
    ranks = [(total / sim_sums[item], item) for item, total in totals.items()]
    ranks.sort(reverse=True)
    return ranks
if __name__ == "__main__":
    # Demo run: exercise each function on a small hand-written data set.
    print("\n测试计算欧几里得距离的方法sim_distance()....")
    # Data set format: user -> {movie title: rating}.
    Prefer = {
        "tommy": {'War': 2.3, 'The lord of wings': 3.0, 'Kongfu': 5.0},
        "lily": {'War': 2.0, 'The lord of wings': 3.6, 'Kongfu': 4.1},
        "jim": {'War': 1.9, 'The lord of wings': 4.0, 'Beautiful America': 4.7, 'the big bang': 1.0},
        "jack": {'War': 2.8, 'The lord of wings': 3.5, 'Kongfu': 5.5},
    }
    # The first call previously compared 'lily' with 'jack' although its
    # label says 'jim'; the arguments now match the printed label.
    print("sim_distance(dic,'lily','jim') = ", sim_distance(Prefer, 'lily', 'jim'))
    print("sim_distance(dic,'tommy','jim') = ", sim_distance(Prefer, 'tommy', 'jim'))
    print("sim_distance(dic,'tommy','lily') = ", sim_distance(Prefer, 'tommy', 'lily'))
    print("sim_distance(dic,'tommy','jack') = ", sim_distance(Prefer, 'tommy', 'jack'))

    print("\n测试计算Pearson系数的方法sim_pearson()....")
    print("sim_pearson(dic,'lily','jim') = ", sim_pearson(Prefer, 'lily', 'jim'))
    print("sim_pearson(dic,'tommy','jim') = ", sim_pearson(Prefer, 'tommy', 'jim'))
    print("sim_pearson(dic,'tommy','lily') = ", sim_pearson(Prefer, 'tommy', 'lily'))
    print("sim_pearson(dic,'tommy','jack') = ", sim_pearson(Prefer, 'tommy', 'jack'))

    print("\n测试topMatches()方法......")
    print(topMatches(Prefer, 'tommy'))

    print("\n测试推荐方法getRecommendations(prefer, person, similarity=sim_pearson)......")
    print(getRecommendations(Prefer, 'tommy'))
# 简单的推荐系统搭建--Python(pearson,欧几里得相似度)
# 最新推荐文章于 2022-03-12 21:43:48 发布