#coding:utf-8
import math
import os
import random
from operator import itemgetter


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when denominator is 0."""
    if not denominator:
        return 0.0
    return numerator / (denominator * 1.0)


class UserBasedCF:
    """User-based collaborative filtering recommender with offline metrics.

    ``train`` and ``test`` are nested dicts of the form
    ``{user: {item: rating}}``.  Ratings may be strings; they are converted
    with ``float`` when scores are accumulated in ``recommend``.
    ``simiMatrix`` caches the user-user similarities as
    ``{user: {other_user: similarity}}`` and is filled lazily.
    """

    def __init__(self, trainDataFile=None, testDataFile=None, splitor='\t'):
        """Optionally load the train and test sets from delimited files.

        Args:
            trainDataFile: path of the training ratings file, or None.
            testDataFile: path of the test ratings file, or None.
            splitor: field delimiter used in both files.
        """
        # Always define the attributes so an instance built without files
        # can be evaluated or filled later through setData().
        self.train = {}
        self.test = {}
        self.simiMatrix = {}
        if trainDataFile is not None:
            self.train = self.loadData(trainDataFile, splitor)
        if testDataFile is not None:
            self.test = self.loadData(testDataFile, splitor)

    def setData(self, train, test):
        """Replace the train and test sets with already-parsed dicts."""
        self.train = train
        self.test = test
        # Similarities computed from the previous training data are stale.
        self.simiMatrix = {}

    def loadData(self, dataFile, splitor='\t'):
        """Parse a ratings file into ``{user: {item: rating}}``.

        Each non-blank line must hold at least three ``splitor``-separated
        fields: user, item, rating.  Any further fields are ignored.

        Raises:
            ValueError: if a non-blank line has fewer than three fields.
        """
        data = {}
        with open(dataFile) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                user, item, record = line.split(splitor)[:3]
                data.setdefault(user, {})[item] = record
        return data

    def recallAndPrecision(self, peersCount, topN=10):
        """Return ``(recall, precision)`` of the top-N lists over all train users.

        Recall is hits / number of test ratings of the train users.
        Precision is hits / (topN * number of train users); each user counts
        for topN slots even when fewer items were recommended.  Either value
        is 0.0 when its denominator is zero.
        """
        hit = 0
        relevant = 0
        recommended = 0
        for user in self.train:
            itemOfUser = self.test.get(user, {})
            recItems = self.recommend(user, peersCount, topN)
            hit += sum(1 for item in recItems if item in itemOfUser)
            relevant += len(itemOfUser)
            recommended += topN
        return (_ratio(hit, relevant), _ratio(hit, recommended))

    def coverage(self, peersCount, topN=10):
        """Return the share of training items that appear in any top-N list.

        Returns 0.0 when the training set holds no items.
        """
        recommend_items = set()
        all_items = set()
        for user, ratings in self.train.items():
            all_items.update(ratings)
            recommend_items.update(self.recommend(user, peersCount, topN))
        return _ratio(len(recommend_items), len(all_items))

    def popularity(self, peersCount, topN=10):
        """Return the mean of log(1 + popularity) over all recommended items.

        An item's popularity is the number of training users who rated it.
        Returns 0.0 when nothing was recommended.
        """
        item_popularity = {}
        for ratings in self.train.values():
            for item in ratings:
                # Start from zero so the count equals the number of raters.
                item_popularity[item] = item_popularity.get(item, 0) + 1

        total = 0
        count = 0
        for user in self.train:
            for item in self.recommend(user, peersCount, topN):
                total += math.log(1 + item_popularity[item])
                count += 1
        return _ratio(total, count)

    def calUserSimilarity(self):
        """Compute the user-user similarity matrix and store it in simiMatrix.

        Every item shared by two users contributes 1 / log(1 + raters of the
        item), so widely rated items weigh less.  The accumulated weight is
        divided by sqrt(|items of u| * |items of v|).
        """
        # Inverted index: item -> set of users who rated it.
        item_users = {}
        for user, ratings in self.train.items():
            for item in ratings:
                item_users.setdefault(item, set()).add(user)

        coRatedCount = {}
        itemCountOfUser = {}
        for users in item_users.values():
            # Identical for every pair sharing this item, so compute it once.
            weight = 1.0 / math.log(1 + len(users))
            for u in users:
                itemCountOfUser[u] = itemCountOfUser.get(u, 0) + 1
                for v in users:
                    if u == v:
                        continue
                    related = coRatedCount.setdefault(u, {})
                    related[v] = related.get(v, 0) + weight

        userSimiMatrix = {}
        for u, related_users in coRatedCount.items():
            row = userSimiMatrix.setdefault(u, {})
            for v, cuv in related_users.items():
                row[v] = cuv / math.sqrt(itemCountOfUser[u] * itemCountOfUser[v])
        self.simiMatrix = userSimiMatrix

    def recommend(self, userU, peersCount, topN=10):
        """Return up to topN unseen items for userU as ``{item: score}``.

        The score of an item is the sum, over the peersCount most similar
        users, of similarity * rating.  Items userU already rated in the
        training set are skipped.  A user without any co-rating neighbour
        gets an empty dict.

        Raises:
            KeyError: if userU is not in the training set.
        """
        interacted_items = self.train[userU]
        # Build the similarity matrix on first use.
        if not self.simiMatrix:
            self.calUserSimilarity()

        neighbours = sorted(
            self.simiMatrix.get(userU, {}).items(),
            key=itemgetter(1),
            reverse=True,
        )[:peersCount]

        recItems = {}
        for userV, simiUV in neighbours:
            for item, ratingV4I in self.train[userV].items():
                if item in interacted_items:
                    continue
                recItems[item] = recItems.get(item, 0) + simiUV * float(ratingV4I)

        ranked = sorted(recItems.items(), key=itemgetter(1), reverse=True)
        return dict(ranked[:topN])


def _datasetAvailable(*paths):
    """Return True if every path is an existing file; report the missing ones."""
    missing = [path for path in paths if not os.path.isfile(path)]
    for path in missing:
        print("data file not found, evaluation skipped: %s" % path)
    return not missing


def _printEvaluation(cf, peerCounts=(5, 10, 20, 40, 80, 160)):
    """Print precision, recall, coverage and popularity for each peer count."""
    print("%3s%15s%15s%15s%15s" % ('K', "precision", 'recall', 'coverage', 'popularity'))
    for k in peerCounts:
        recall, precision = cf.recallAndPrecision(peersCount=k)
        coverage = cf.coverage(peersCount=k)
        popularity = cf.popularity(peersCount=k)
        print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k, precision * 100, recall * 100, coverage * 100, popularity))


def testUserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base',
                    testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test'):
    """Evaluate the recommender on a pre-split train/test file pair.

    Prints a notice and returns without evaluating when a file is missing.
    """
    if not _datasetAvailable(trainDataFile, testDataFile):
        return
    cf = UserBasedCF(trainDataFile=trainDataFile, testDataFile=testDataFile)
    _printEvaluation(cf)


def SplitData(wholeData, M, k, seed, splitor='\t'):
    """Randomly split rating lines into ``(train, test)`` dicts.

    Args:
        wholeData: iterable of lines, each "user<splitor>item<splitor>score<splitor>time".
        M: upper bound of the random draw.  random.randint is inclusive on
            both ends, so a line lands in the test set with probability
            1 / (M + 1).
        k: the draw value that sends a line to the test set.
        seed: seed for the random module, making the split repeatable.
        splitor: field delimiter.

    Returns:
        A ``(train, test)`` tuple of ``{user: {item: score}}`` dicts.
    """
    test = {}
    train = {}
    random.seed(seed)
    for line in wholeData:
        user, item, score, _timestamp = line.strip().split(splitor)
        target = test if random.randint(0, M) == k else train
        target.setdefault(user, {})[item] = score
    return train, test


def testUserBasedCF2(ratingsFile=r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat'):
    """Evaluate the recommender on a single ratings file split at random.

    Prints a notice and returns without evaluating when the file is missing.
    """
    if not _datasetAvailable(ratingsFile):
        return
    with open(ratingsFile) as wholeData:
        train, test = SplitData(wholeData, 8, 5, 10, splitor='::')
    cf = UserBasedCF()
    cf.setData(train, test)
    _printEvaluation(cf)


if __name__ == "__main__":
    # Call testUserBasedCF2() instead to evaluate on a randomly split file.
    testUserBasedCF()