1、SlopeOne 是由 Daniel Lemire 和 Anna Maclachlan 在 2005 年提出的一种特殊的基于项目的推荐方式,其简单、方便、有效。下面简单介绍一下它的基本思想。
用户\商品 | 商品1 | 商品2 |
用户1 | 3 | 4 |
用户2 | 4 | ? |
从上表中预测用户2对商品2的评分时,SlopeOne 算法的计算方式为:用户2对商品1的评分(4)加上商品2相对于商品1的平均偏差(由用户1的评分得到:4-3=1),即 R(用户2,商品2) = 4 +(4-3)= 5
这就是 SlopeOne 推荐的基本原理,它将用户对两个项目的评分之间的关系看作简单的线性关系:Y = X + b,其中斜率固定为 1(这也是 "Slope One" 名称的由来),b 为两个项目之间的平均评分偏差。
2、计算步骤:
(1)根据评分矩阵计算两两项目之间的偏差。
(2)根据偏差数据为用户未评分商品做出推荐列表。
3、代码:
import codecs
import time
from math import sqrt
class recommender:
    """Weighted Slope One recommender evaluated on a MovieLens train/test split.

    Typical use: call ``loadMovieLens()`` to read the ratings,
    ``computeDeviations()`` to build the item-to-item model, and then
    ``slopeOneRecommendations()`` or ``validation()``.

    Attributes:
        data: training ratings, ``{user: {item: rating}}``.
        datatest: held-out ratings with the same layout.
        frequencies: ``frequencies[a][b]`` is the number of users who rated
            both item ``a`` and item ``b``.
        deviations: ``deviations[a][b]`` is the average of
            ``rating(a) - rating(b)`` over those users.
    """

    def __init__(self):
        self.frequencies = {}
        self.deviations = {}
        # Initialised here so the other methods can be called before any
        # data has been loaded (they then simply find nothing to process).
        self.data = {}
        self.datatest = {}

    @staticmethod
    def _readRatings(filename):
        """Parse one tab-separated ratings file into ``{user: {item: rating}}``.

        Each line is split on tabs; the first three fields are taken as the
        user id, the item id and an integer rating (optionally wrapped in
        double quotes). Any further fields are ignored. Ids are kept as
        strings, exactly as they appear in the file.
        """
        ratings = {}
        # ``with`` guarantees the handle is closed even if a line fails to parse.
        with codecs.open(filename, 'r', 'ascii') as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                fields = line.split('\t')
                user = fields[0]
                movie = fields[1]
                rating = int(fields[2].strip().strip('"'))
                ratings.setdefault(user, {})[movie] = rating
        return ratings

    def loadMovieLens(self, path='F:\\Programmer\'s Guide to Data Mining\\movelenseData\\ml-100k\\'):
        """Load ``u1.base`` into ``self.data`` and ``u1.test`` into ``self.datatest``.

        Args:
            path: directory prefix that the two file names are appended to;
                it must therefore end with a path separator.

        Prints the number of users in the test set and, when present, the
        test ratings of user "458" (a spot check kept from the original
        script).
        """
        self.data = self._readRatings(path + "u1.base")
        self.datatest = self._readRatings(path + "u1.test")
        print(len(self.datatest))
        # Guarded so that data sets without this user do not raise KeyError.
        if "458" in self.datatest:
            print(self.datatest["458"])

    def computeDeviations(self):
        """Build ``self.frequencies`` and ``self.deviations`` from ``self.data``.

        Both tables are rebuilt from scratch on every call, so calling this
        method again after reloading data gives correct averages instead of
        adding new sums on top of previously averaged values.
        """
        print("computing deviations.....")
        frequencies = {}
        deviations = {}
        # Pass 1: accumulate co-rating counts and summed rating differences.
        for ratings in self.data.values():
            rated = list(ratings.items())
            for item, rating in rated:
                itemFreq = frequencies.setdefault(item, {})
                itemDev = deviations.setdefault(item, {})
                for item2, rating2 in rated:
                    if item != item2:
                        itemFreq[item2] = itemFreq.get(item2, 0) + 1
                        # 0.0 start keeps the sums (and later averages) floats.
                        itemDev[item2] = itemDev.get(item2, 0.0) + (rating - rating2)
        # Pass 2: turn the summed differences into averages.
        for item, devs in deviations.items():
            itemFreq = frequencies[item]
            for item2 in devs:
                devs[item2] /= itemFreq[item2]
        self.frequencies = frequencies
        self.deviations = deviations
        print("complite computing deciations.")

    def slopeOneRecommendations(self, userRatings):
        """Predict ratings for the items the user has not rated yet.

        Args:
            userRatings: ``{item: rating}`` for a single user.

        Returns:
            ``{item: predicted_rating}`` for every item in ``self.deviations``
            that is absent from ``userRatings`` and shares at least one
            co-rated item with it. Each prediction is the co-rating-count
            weighted average of ``deviation + user's rating``.
        """
        weightedSums = {}
        weights = {}
        for diffItem, diffRatings in self.deviations.items():
            if diffItem in userRatings:
                # Already rated by this user: nothing to predict.
                continue
            itemFreq = self.frequencies[diffItem]
            for userItem, userRating in userRatings.items():
                if userItem in diffRatings:
                    freq = itemFreq[userItem]
                    weightedSums[diffItem] = (weightedSums.get(diffItem, 0.0) +
                                              (diffRatings[userItem] + userRating) * freq)
                    weights[diffItem] = weights.get(diffItem, 0) + freq
        # An item only appears in weightedSums once it has a weight >= 1.
        return dict((item, total / weights[item])
                    for item, total in weightedSums.items())

    def validation(self):
        """Score the model against ``self.datatest``.

        For every test user, predictions are computed from that user's
        training ratings and compared with the held-out ratings wherever
        both exist.

        Returns:
            ``(mae, rmse)``. Both are ``0.0`` when no prediction overlaps the
            test set; the printed ``setSum`` tells the two cases apart.
        """
        errorSum = 0.0
        sqrErrorSum = 0.0
        setSum = 0   # number of (user, item) pairs actually evaluated
        count = 0    # total number of predictions produced
        for user, actual in self.datatest.items():
            print("user %s" % user)
            # A test user may have no training ratings; nothing can be
            # predicted for them, so fall back to an empty profile.
            predicted = self.slopeOneRecommendations(self.data.get(user, {}))
            count += len(predicted)
            for item, estimate in predicted.items():
                if item in actual:
                    error = actual[item] - estimate
                    errorSum += abs(error)
                    sqrErrorSum += error ** 2
                    setSum += 1
        if setSum:
            mae = errorSum / setSum
            rmse = sqrt(sqrErrorSum / setSum)
        else:
            # Nothing to average over; avoid ZeroDivisionError.
            mae = 0.0
            rmse = 0.0
        print("setSum %s" % setSum)
        print("count %s" % count)
        return mae, rmse
# Evaluation run: load the MovieLens split, train the Slope One model,
# report MAE / RMSE on the test set and the elapsed time.
# time.clock() was removed in Python 3.8; prefer perf_counter() and fall
# back to clock() only on interpreters that lack it (Python 2).
try:
    _timer = time.perf_counter
except AttributeError:
    _timer = time.clock
start = _timer()
r = recommender()
r.loadMovieLens()
r.computeDeviations()
mae, rmse = r.validation()
# Single-argument print(...) produces the same output on Python 2 and 3.
print("MAE: %s" % mae)
print("RMSE: %s" % rmse)
end = _timer()
print("Total times: %f s" % (end - start))
4、结果:
5、总结:
可见 SlopeOne 的运行结果还可以,但运行速度比较慢,最主要的原因在于使用了字典来处理数据,如果改用矩阵来处理会快很多。
6、参考文献:Programmer's Guide to Data Mining