这本书比较简单,适合做推荐系统入门级读物。作者写得很仔细,一些复杂的公式都拆开讲。
这是他的网站:http://guidetodatamining.com/
一、通过用户的相似度(最相似用户)进行推荐。
主要有三种方式:1.距离 2.pearson 3.cosine
原理就不说了,书中都有,很简单。主要贴下代码:
1 recommend by distance (when data is dense)
from math import sqrt
# Sample ratings used by the functions below:
# user name -> {item (band/artist) name -> rating}.
users = {
    "Angelica": {
        "Blues Traveler": 3.5,
        "Broken Bells": 2.0,
        "Norah Jones": 4.5,
        "Phoenix": 5.0,
        "Slightly Stoopid": 1.5,
        "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0,
        "Broken Bells": 3.5,
        "Deadmau5": 4.0,
        "Phoenix": 2.0,
        "Slightly Stoopid": 3.5,
        "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0,
        "Broken Bells": 1.0,
        "Deadmau5": 1.0,
        "Norah Jones": 3.0,
        "Phoenix": 5,
        "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0,
        "Broken Bells": 4.0,
        "Deadmau5": 4.5,
        "Phoenix": 3.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0,
        "Deadmau5": 1.0,
        "Norah Jones": 4.0,
        "The Strokes": 4.0,
        "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5,
        "Deadmau5": 4.0,
        "Norah Jones": 5.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0,
        "Broken Bells": 2.0,
        "Norah Jones": 3.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.0,
        "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0,
        "Norah Jones": 5.0,
        "Phoenix": 4.0,
        "Slightly Stoopid": 2.5,
        "The Strokes": 3.0,
    },
}
def computeDistance(u1,u2,r):
    """
    Compute the Minkowski distance of order r between two users' ratings.

    Only items rated by both users contribute to the distance. r=1 gives
    the Manhattan distance and r=2 the Euclidean distance.

    u1, u2 -- keys into the module-level ``users`` dict
    r      -- order of the distance; must be non-zero (1.0/r is computed)

    Returns a float. When the two users have no rated item in common the
    result is float("inf"): with nothing to compare, the plain sum would be
    0 and the pair would wrongly look like a perfect match.

    Raises KeyError if either user is unknown.
    """
    ratings1 = users[u1]
    ratings2 = users[u2]
    # Absolute rating gap for every item both users rated.
    gaps = [abs(ratings1[item] - ratings2[item])
            for item in ratings1 if item in ratings2]
    if not gaps:
        return float("inf")
    return pow(sum(pow(gap, r) for gap in gaps), 1.0 / r)
def computeNearestNeighbor(u1,r):
    """
    Return the name of the user whose ratings are closest to u1's.

    Every other user in the module-level ``users`` dict is scored with
    computeDistance(u1, other, r) and the one with the smallest distance
    wins. Candidates are compared as (distance, name) tuples, so a tie on
    distance is broken by the smaller user name.

    u1 -- key into ``users``
    r  -- distance order, passed through to computeDistance

    Raises IndexError when ``users`` contains no user other than u1.
    """
    # "!=" rather than "<>": the latter was removed in Python 3.
    ranked = sorted((computeDistance(u1, name, r), name)
                    for name in users if name != u1)
    return ranked[0][1]
def recommend(username,r):
    """
    Recommend items to username based on the single nearest neighbor.

    The nearest neighbor is found with computeNearestNeighbor(username, r);
    every item that neighbor rated and username has not rated is returned.

    username -- key into the module-level ``users`` dict
    r        -- distance order, passed through to computeNearestNeighbor

    Returns a list of (item, neighbor's rating) tuples, highest rating
    first; equal ratings are ordered by item name so the result does not
    depend on dict iteration order.
    """
    neighborRatings = users[computeNearestNeighbor(username, r)]
    ownRatings = users[username]
    unseen = [(item, rating)
              for item, rating in neighborRatings.items()
              if item not in ownRatings]
    # Best-rated first, then alphabetical for a deterministic order.
    unseen.sort(key=lambda pair: (-pair[1], pair[0]))
    return unseen
2 pearson (when different users use different scales,"grade-inflation")
def pearson(u1,u2):
    """
    Compute the Pearson correlation between two users' ratings.

    Only items rated by both users are considered. The single-pass form of
    the formula is used (sums, sums of squares and sum of products).

    u1, u2 -- keys into the module-level ``users`` dict

    Returns the correlation as a float, or 0 when it is undefined: the
    users share no rated item, or one of them gave the same rating to
    every shared item (zero denominator).

    Raises KeyError if either user is unknown.
    """
    ratings1 = users[u1]
    ratings2 = users[u2]

    # 1. items rated by both users
    common = [item for item in ratings1 if item in ratings2]
    if not common:
        return 0
    # float so that integer ratings are not truncated by Python 2's "/"
    n = float(len(common))

    # 2. numerator
    sum1 = sum(ratings1[item] for item in common)
    sum2 = sum(ratings2[item] for item in common)
    sumProducts = sum(ratings1[item] * ratings2[item] for item in common)
    numerator = sumProducts - sum1 * sum2 / n

    # 3. denominator
    sumSquares1 = sum(ratings1[item] * ratings1[item] for item in common)
    sumSquares2 = sum(ratings2[item] * ratings2[item] for item in common)
    # Rounding can push a zero spread slightly below 0, which would make
    # sqrt raise ValueError; clamp at 0.
    spread1 = max(sumSquares1 - sum1 * sum1 / n, 0.0)
    spread2 = max(sumSquares2 - sum2 * sum2 / n, 0.0)
    denominator = sqrt(spread1) * sqrt(spread2)
    if denominator == 0:
        return 0
    return numerator / denominator
3.cosine similarity (when data is sparse)
def scalar(data):
    '''
    Return the Euclidean length (L2 norm) of the rating vector 'data'.

    data -- dict whose values are numeric ratings; the keys are not used

    Returns a float: the square root of the sum of squared values, which
    is 0.0 for an empty dict.
    '''
    return sqrt(sum(value * value for value in data.values()))
def cosine(u1,u2):
    '''
    Calculate the cosine similarity between two users' rating vectors.

    The dot product runs over the items both users rated; each vector's
    length is taken over all of that user's ratings (via scalar), so an
    item one user did not rate counts as 0.

    u1, u2 -- keys into the module-level 'users' dict

    Returns dot / (|v1| * |v2|), or 0 when either user has a zero-length
    rating vector (e.g. no ratings at all), where the cosine is undefined.
    '''
    v1 = users[u1]
    v2 = users[u2]
    dot = sum(v1[item] * v2[item] for item in v1 if item in v2)
    norms = scalar(v1) * scalar(v2)
    if norms == 0:
        # avoid ZeroDivisionError for a user without any (non-zero) rating
        return 0
    return dot / norms
二、K-nearest neighbor
依靠上面“最相似”的人去推荐,存在“推荐这个人的特殊癖好”的问题,误差大。解决方法也比较直观:找k个最相似的,根据它们的Pearson系数的大小决定推荐分数占的比重。
def rateKNN(username,itemname,k):
    """
    Predict username's rating of itemname from the k most similar users.

    Every other user is ranked by Pearson correlation with username,
    highest first. Walking down that ranking, the first k users who rated
    itemname and correlate positively with username contribute their
    rating, weighted by the correlation; the prediction is the weighted
    average of those ratings.

    username -- key into the module-level ``users`` dict
    itemname -- item whose rating is predicted
    k        -- maximum number of neighbors to use

    Returns the predicted rating as a float, or None when no positively
    correlated user has rated itemname (including k <= 0).
    """
    # Exclude the target user by name. Dropping the first ranked entry
    # instead is wrong whenever the user's self-correlation is 0 (constant
    # ratings) or ties with another user's.
    similarities = [(name, pearson(username, name))
                    for name in users if name != username]
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    weightedSum = 0
    totalWeight = 0
    used = 0
    for name, weight in similarities:
        # The ranking is descending, so nothing useful follows a
        # non-positive correlation.
        if weight <= 0 or used >= k:
            break
        if itemname not in users[name]:
            continue
        weightedSum += weight * users[name][itemname]
        totalWeight += weight
        used += 1

    if totalWeight == 0:
        return None
    return weightedSum / totalWeight
书中的代码:把当前用户所有缺省的评价都依照邻居计算出来,然后取前k个进行Knn加权求和,最后推荐N个,不过有个疑问:如果和他最相近的那个人没有相应的item评分,则这个人还需要占pearson比重吗?书中的代码weight = nearest[i][1] / totalDistance 没有考虑这种情况。
以下是书中的代码:
def recommend(self, user):
    """Give list of recommendations.

    Combines the ratings of the first ``self.k`` entries returned by
    ``self.computeNearestNeighbor(user)``. Each neighbor's ratings are
    weighted by that neighbor's score divided by the sum of the scores of
    the neighbors used, and summed per item over the items ``user`` has
    not rated.

    NOTE(review): assumes computeNearestNeighbor returns a list of
    (name, score) pairs ordered by nearness and that self.data maps a
    user to a dict of item -> rating -- confirm against the class.

    Returns at most ``self.n`` (name, score) tuples, highest score first,
    where name is ``self.convertProductID2name(item)``. Returns an empty
    list when the neighbor scores sum to 0, since the weights are then
    undefined.
    """
    # first get list of users ordered by nearness
    nearest = self.computeNearestNeighbor(user)
    # now get the ratings for the user
    userRatings = self.data[user]
    # never index past the neighbors that are actually available
    neighborCount = min(self.k, len(nearest))

    # determine the total distance
    totalDistance = 0.0
    for i in range(neighborCount):
        totalDistance += nearest[i][1]
    if totalDistance == 0:
        # no usable weights (also covers k == 0 / no neighbors)
        return []

    # now iterate through the k nearest neighbors accumulating their ratings
    recommendations = {}
    for i in range(neighborCount):
        name = nearest[i][0]
        # this neighbor's slice of the pie
        weight = nearest[i][1] / totalDistance
        neighborRatings = self.data[name]
        # find items the neighbor rated that the user didn't
        for artist in neighborRatings:
            if artist in userRatings:
                continue
            share = neighborRatings[artist] * weight
            if artist in recommendations:
                recommendations[artist] = recommendations[artist] + share
            else:
                recommendations[artist] = share

    # now make list from dictionary, translating ids to names
    ranked = [(self.convertProductID2name(productId), score)
              for (productId, score) in recommendations.items()]
    # finally sort and return the first n items
    ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
    return ranked[:self.n]
练习:推荐电影,其余的都一样,把数据预处理的部分贴出来。
def loadMovieDB(path=''):
    """
    Load the movie ratings from "Movie_Ratings1.csv" located in ``path``.

    Layout read by this loader: the first line is a header whose first
    cell is ignored and whose remaining cells are movie titles; each
    following line holds a user name followed by that user's rating for
    every movie, in header order. Empty cells mean "not rated" and are
    skipped, as are blank lines.

    path -- directory containing the file, with or without a trailing
            separator; '' means the current directory

    Returns a dict mapping user name -> {movie title: int rating}.

    Raises IOError/OSError if the file cannot be opened, ValueError if a
    rating is not an integer, and IndexError if a row has more rating
    cells than the header has movies.
    """
    import os.path

    filename = os.path.join(path, "Movie_Ratings1.csv")
    ratings = {}
    # "with" guarantees the file is closed, even if a row fails to parse
    with open(filename, 'r') as f:
        movies = f.readline().strip().split(',')[1:]
        for line in f:
            line = line.strip()
            if not line:
                # a blank line would otherwise create a user named ''
                continue
            cells = line.split(',')
            userRatings = ratings[cells[0]] = {}
            for index, cell in enumerate(cells[1:]):
                if cell != '':
                    userRatings[movies[index]] = int(cell)
    return ratings
--------------done---------------