python寻找相似用户_基于用户相似性的协同过滤——Python实现

#coding:utf-8

importrandom,mathfrom operator importitemgetterclassUserBasedCF:def __init__(self,trainDataFile=None,testDataFile=None,splitor='\t'):if trainDataFile!=None:

self.train=self.loadData(trainDataFile, splitor)if testDataFile!=None:

self.test=self.loadData(testDataFile, splitor)

self.simiMatrix={}defsetData(self,train,test):

self.train=train

self.test=testdef loadData(self,dataFile,splitor='\t'):

data={}for line inopen(dataFile):

user,item,record,_=line.split()

data.setdefault(user,{})

data[user][item]=recordreturndatadef recallAndPrecision(self,peersCount,topN=10):

hit=0

recall=0

precision=0for user inself.train.keys():

itemOfuser=self.test.get(user,{})

recItems=self.recommend(user,peersCount,topN)for item,pui inrecItems.items():if item initemOfuser:

hit+=1recall+=len(itemOfuser)

precision+=topN#print 'Recall:%s hit:%s allRatings:%s'%(hit/(recall*1.0),hit,precision)

return (hit / (recall * 1.0),hit / (precision * 1.0))def coverage(self,peersCount,topN=10):

recommend_items=set()

all_items=set()for user inself.train.keys():for item inself.train[user].keys():

all_items.add(item)

rank=self.recommend(user,peersCount,topN)for item,pui inrank.items():

recommend_items.add(item)return len(recommend_items)/(len(all_items)*1.0)def popularity(self,peersCount,topN=10):

item_popularity=dict()for user,items inself.train.items():for item initems.keys():if item not initem_popularity:

item_popularity[item]=1item_popularity[item]+=1ret=0

n=0for user inself.train.keys():

rank=self.recommend(user,peersCount,topN)for item,pui inrank.items():

ret+=math.log(1+item_popularity[item])

n+=1

return ret/(n*1.0)defcalUserSimilarity(self):

item_users=dict()for u,ratings inself.train.items():for i inratings.keys():

item_users.setdefault(i,set())

item_users[i].add(u)#calculate co-rated items between users

coRatedCount=dict()

itemCountOfUser=dict()for item,users initem_users.items():for u inusers:

itemCountOfUser.setdefault(u,0)

itemCountOfUser[u]+=1

for v inusers:if u==v:continuecoRatedCount.setdefault(u,{})

coRatedCount[u].setdefault(v,0)

coRatedCount[u][v]+=1/math.log(1+len(users))

userSimiMatrix=dict()for u,related_users incoRatedCount.items():

userSimiMatrix.setdefault(u,{})for v,cuv inrelated_users.items():

userSimiMatrix[u][v]=cuv/math.sqrt(itemCountOfUser[u]*itemCountOfUser[v])

self.simiMatrix=userSimiMatrixdef recommend(self,userU,peersCount,topN=10):

recItems=dict()

interacted_items=self.train[userU]'''prepare the user similarity matrix first'''

if notself.simiMatrix:

self.calUserSimilarity()for userV,simiUV in sorted(self.simiMatrix[userU].items(),key=itemgetter(1),reverse=True)[0:peersCount]:for item,ratingV4I inself.train[userV].items():if item ininteracted_items:continue

if item not inrecItems:

recItems[item]=0

recItems[item]+=simiUV*float(ratingV4I)#transform 4 stars into score 0.8

'''if len(recItems)==topN:

return recItems'''

return dict(sorted(recItems.items(),key = lambda x :x[1],reverse =True)[0:topN])deftestUserBasedCF():

cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test')#cf.calUserSimilarity()

print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))for k in [5,10,20,40,80,160]:

recall,precision= cf.recallAndPrecision(peersCount =k)

coverage= cf.coverage(peersCount =k)

popularity= cf.popularity(peersCount =k)print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))def SplitData(wholeData,M,k,seed,splitor='\t'):

test={}

train={}

random.seed(seed)for line inwholeData:

user,item,score,time=line.strip().split(splitor)if random.randint(0,M)==k:

test.setdefault(user,{})

test[user][item]=scoreelse:

train.setdefault(user,{})

train[user][item]=scorereturntrain,testdeftestUserBasedCF2():

wholeData=open(r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat')

train,test=SplitData(wholeData, 8, 5, 10, splitor='::')

cf=UserBasedCF()

cf.setData(train, test)#cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.test')

#cf.calUserSimilarity()

print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity'))for k in [5,10,20,40,80,160]:

recall,precision= cf.recallAndPrecision(peersCount =k)

coverage= cf.coverage(peersCount =k)

popularity= cf.popularity(peersCount =k)print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity))if __name__=="__main__":

testUserBasedCF()#testUserBasedCF2()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值