一:物品相似度计算&推荐源代码(算法参考项亮《推荐系统实践》P55):
其他代码模块参考我的另外一篇博客:
这里写链接内容
def ItemSimilarity(self, train=None):
    """Build the item-item similarity matrix and store it in ``self.W``.

    The similarity of items ``i`` and ``j`` is the number of users who have
    both items divided by ``sqrt(N[i] * N[j])``, where ``N[x]`` is the number
    of users who have item ``x``.

    Args:
        train: mapping ``user -> iterable of items`` (iterating a dict yields
            its keys, so ``user -> {item: rating}`` works too). When omitted,
            ``self.traindata`` is used. An explicitly passed ``train`` takes
            precedence over ``self.traindata``.

    Returns:
        ``self.W``, a nested dict ``{i: {j: similarity}}``. Only item pairs
        that co-occur for at least one user get an entry, and ``i`` never
        appears in its own row.
    """
    if train is None:
        train = self.traindata
    # calculate co-rated users between items
    co_counts = dict()   # co_counts[i][j]: users having both i and j
    item_users = dict()  # item_users[i]: users having i
    for u, items in train.items():
        for i in items:
            item_users[i] = item_users.get(i, 0) + 1
            for j in items:
                if i == j:
                    continue
                row = co_counts.setdefault(i, {})
                row[j] = row.get(j, 0) + 1
    # calculate final similarity matrix W
    self.W = dict()
    for i, related_items in co_counts.items():
        # related_items is never empty and never contains i itself: a row is
        # only created above when some j != i is counted.
        sims = self.W.setdefault(i, {})
        for j, cij in related_items.items():
            # "* 1.0" keeps the product a float before the square root.
            sims[j] = cij / math.sqrt(item_users[i] * item_users[j] * 1.0)
    return self.W
def recommend(self, user_id, train=None, k=8, nitem=40):
    """Recommend items for ``user_id`` using the similarity matrix ``self.W``.

    For every item ``i`` the user already has (with weight ``pi``), the ``k``
    items most similar to ``i`` are looked up in ``self.W``; each such item
    ``j`` that the user does not have yet accumulates ``pi * W[i][j]``.

    Args:
        user_id: key of the user in ``train``.
        train: mapping ``user -> {item: rating}``. When omitted,
            ``self.traindata`` is used. An explicitly passed ``train`` takes
            precedence over ``self.traindata``.
        k: number of most-similar neighbours considered per item.
        nitem: maximum number of recommended items returned.

    Returns:
        dict ``{item: score}`` holding at most ``nitem`` highest-scoring
        items.

    Raises:
        KeyError: if ``user_id`` is not present in ``train``.
    """
    if train is None:
        train = self.traindata
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        # An item that never co-occurred with another item has no row in W;
        # treat it as having no neighbours instead of raising KeyError.
        neighbours = sorted(self.W.get(i, {}).items(),
                            key=lambda x: x[1], reverse=True)[0:k]
        for j, wj in neighbours:
            if j in ru:
                # never recommend something the user already has
                continue
            rank[j] = rank.get(j, 0) + pi * wj
    top = sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitem]
    return dict(top)
测试语句:
# Smoke test: build the item-similarity matrix, then recommend for one user.
# NOTE(review): `cf` is presumably an instance of the recommender class that
# defines ItemSimilarity/recommend -- it is created outside this snippet.
w = cf.ItemSimilarity()
# Similarity between item '242' and item '86'.
print(w['242']['86'])
print("**************************")
R = cf.recommend('196')
# list(...) keeps the printed form a plain list on both Python 2 and 3.
print(list(R.keys()))
实验结果:
1,物品id=242 与物品id=86的相似度
w['242']['86']
2,对user_id=196的推荐的物品
3,结果(K=8取最相似的8个物品的集合,nitem=10 取推荐的前10个物品,数据集:u1.base)
数据集:u.data
问题:不知道为什么,这个算法运行得相当慢,比 UserBasedCF 慢 4 倍左右。
二:改进算法
def ItemSimilarity(self, train=None):
    """Build the item-item similarity matrix with a per-user activity penalty.

    Same scheme as plain co-occurrence similarity, except that each user
    contributes ``1 / log(1 + len(items))`` instead of ``1`` to every
    co-occurring item pair, so users with many items weigh less. The sum is
    then divided by ``sqrt(N[i] * N[j])``, where ``N[x]`` is the number of
    users who have item ``x``.

    Args:
        train: mapping ``user -> {item: rating}``. When omitted,
            ``self.traindata`` is used. An explicitly passed ``train`` takes
            precedence over ``self.traindata``.

    Returns:
        ``self.W``, a nested dict ``{i: {j: similarity}}``. Only item pairs
        that co-occur for at least one user get an entry, and ``i`` never
        appears in its own row.
    """
    if train is None:
        train = self.traindata
    # calculate co-rated users between items
    co_weights = dict()  # co_weights[i][j]: penalised co-occurrence sum
    item_users = dict()  # item_users[i]: users having i
    self.W = dict()
    for u, items in train.items():
        if not items:
            # nothing to count; also avoids 1 / log(1) == 1 / 0 below
            continue
        # The penalty depends only on the user, so compute it once per user.
        weight = 1 / math.log(1 + len(items) * 1.0)
        for i in items.keys():
            item_users[i] = item_users.get(i, 0) + 1
            for j in items.keys():
                if i == j:
                    continue
                row = co_weights.setdefault(i, {})
                row[j] = row.get(j, 0) + weight
    # calculate final similarity matrix W
    for i, related_items in co_weights.items():
        # related_items is never empty and never contains i itself: a row is
        # only created above when some j != i is counted.
        sims = self.W.setdefault(i, dict())
        for j, cij in related_items.items():
            sims[j] = cij / math.sqrt(item_users[i] * item_users[j] * 1.0)
    # Return the matrix so that `w = obj.ItemSimilarity()` yields it.
    return self.W
运行结果: