进行实验的源代码参考了项亮的《推荐系统实践》和http://my.oschina.net/zhangjiawen/blog/185625
1.版本1——核心代码为书中代码
import random
import math
import time
class UserBasedCF:
def __init__(self,datafile = None):
    """Remember the rating file, load it, and split it into train/test sets.

    datafile: path of the rating file. It is stored on the instance and
        read immediately by readData(), so a usable path is expected here.

    The split always uses bucket k=3 and random seed 47; call splitData()
    again afterwards to obtain a different partition.
    """
    self.datafile = datafile
    # Populate self.data from the file stored above.
    self.readData()
    # Fixed bucket/seed pair so repeated runs produce the same partition.
    self.splitData(k=3, seed=47)
def readData(self,datafile = None):
    """Read the rating file into self.data.

    Every non-blank line must consist of exactly four whitespace-separated
    fields: user id, item id, rating, and one trailing field that is
    ignored. Each line is stored as a (userid, itemid, int(rating)) tuple;
    the ids are kept as strings.

    datafile: optional path. When given (and truthy) it replaces
        self.datafile; otherwise the path already stored on the instance
        is used.

    Raises ValueError when a non-blank line does not have four fields or
    its rating is not an integer.
    """
    self.datafile = datafile or self.datafile
    self.data = []
    # The context manager closes the file even when a line fails to parse.
    with open(self.datafile) as source:
        for line in source:
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            userid, itemid, record, _ = fields
            self.data.append((userid, itemid, int(record)))
def splitData(self,k,seed,data=None,M = 8):
    """Partition rating records into self.testdata and self.traindata.

    k: bucket number; a record goes to the test set when its random draw
        equals k, otherwise to the training set.
    seed: value passed to random.seed() so the split is reproducible.
        Note that this reseeds the module-level random generator.
    data: iterable of (user, item, record) tuples. Defaults to self.data
        when omitted (None); an explicitly passed empty sequence is
        respected and yields two empty sets.
    M: upper bound of the draw. random.randint(0, M) includes both end
        points, so there are M + 1 buckets and roughly 1 / (M + 1) of the
        records end up in the test set.

    Both results are nested dicts of the form {user: {item: record}}.
    """
    self.testdata = {}
    self.traindata = {}
    if data is None:
        data = self.data
    random.seed(seed)
    # Iterate the resolved data set, not self.data, so that a caller-supplied
    # data argument is actually honoured.
    for user, item, record in data:
        if random.randint(0, M) == k:
            target = self.testdata
        else:
            target = self.traindata
        target.setdefault(user, {})[item] = record
def userSimilarity(self,train = None):
    """Compute pairwise user similarity and store it in self.userSim.

    The similarity of two distinct users u and v is
        |items(u) & items(v)| / sqrt(|items(u)| * |items(v)|)
    where items(x) is the set of keys of train[x].

    train: mapping {user: {item: record}}. Defaults to self.traindata when
        omitted (None).

    The result has the form {u: {v: similarity}} for every ordered pair of
    distinct users; a user is never paired with itself, so a training set
    with a single user yields an empty dict. If either user has no items
    the similarity is 0.0 instead of raising ZeroDivisionError.
    """
    if train is None:
        train = self.traindata
    # Build each user's item set once; doing it inside the pair loop would
    # rebuild two sets for every one of the O(n^2) user pairs.
    item_sets = dict((user, set(items)) for user, items in train.items())
    self.userSim = dict()
    for u, items_u in item_sets.items():
        for v, items_v in item_sets.items():
            if u == v:
                continue
            # The 1.0 factor keeps the arithmetic in floating point.
            denom = math.sqrt(len(items_u) * len(items_v) * 1.0)
            overlap = len(items_u & items_v)
            # denom is zero only when one of the users has no items at all.
            score = overlap / denom if denom else 0.0
            self.userSim.setdefault(u, {})[v] = score
def userSimilarityBest(self,train = None):
"""
the other method of getting user similarity which is better than above
you can get the method on page 46
In this experiment,we use this method
"""
train = train or self.traindata
self.userSimBest = dict()
item_users = dict()
for u,item in train.items():
for i in item.keys():
item_users.setdefault(i,set())
item_users[i].add(u)
user_item_count = dict()
count = dict()
for item,users in item_users.items():
for u in users:
user_item_count.setdefault(u,0)
user_item_count[u] += 1
for v in users: