#!/usr/bin/python
from math import sqrt
def genUserBasedMap(file='u.data'):
    """Load ratings from a tab-separated file into a user-keyed mapping.

    Every non-blank line must start with three tab-separated integer
    fields, in the order user, item, rate; any further fields on the line
    are ignored. Lines containing only whitespace are skipped.

    Args:
        file: Path of the ratings file to read.

    Returns:
        A dict of the form {user: {item: rate}} whose keys and values are
        all ints. A later line for the same (user, item) pair overwrites
        the earlier rate.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or one
            of the first three fields is not an integer.
    """
    ratings = {}
    # The context manager closes the file even when a line fails to parse.
    with open(file) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            user, item, rate = line.split('\t')[0:3]
            ratings.setdefault(int(user), {})[int(item)] = int(rate)
    return ratings
def genItemBasedMap(file='u.data'):
    """Load ratings from a tab-separated file into an item-keyed mapping.

    Every non-blank line must start with three tab-separated integer
    fields, in the order user, item, rate; any further fields on the line
    are ignored. Lines containing only whitespace are skipped.

    Args:
        file: Path of the ratings file to read.

    Returns:
        A dict of the form {item: {user: rate}} whose keys and values are
        all ints. A later line for the same (item, user) pair overwrites
        the earlier rate.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or one
            of the first three fields is not an integer.
    """
    ratings = {}
    # The context manager closes the file even when a line fails to parse.
    with open(file) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            user, item, rate = line.split('\t')[0:3]
            ratings.setdefault(int(item), {})[int(user)] = int(rate)
    return ratings
def userBased(map, person, n=5, similarity=None):
    """Recommend items to *person* based on the ratings of similar users.

    Every other user whose similarity score to *person* is positive
    contributes to each item that *person* has not rated: the item's
    predicted rating is the similarity-weighted average of those users'
    ratings.

    Args:
        map: Ratings as {user: {item: rate}}.
        person: Key in *map* of the user to recommend for.
        n: Maximum number of recommendations to return.
        similarity: Callable taking two equal-length rating lists and
            returning a score; None (the default) selects pearson.

    Returns:
        Up to *n* (predicted_rating, item) tuples, highest rating first.
    """
    if similarity is None:
        # Resolved at call time: pearson is defined further down the
        # module, so naming it in the signature fails at definition time.
        similarity = pearson
    weighted = {}  # item -> sum of score * rate
    weights = {}   # item -> sum of score
    for other in map:
        if other == person:
            continue
        score = distance(map, other, person, similarity)
        if score <= 0:
            continue  # only positively similar users contribute
        for item, rate in map[other].items():
            if item == 0 or item is None or item in map[person]:
                continue
            weighted[item] = weighted.get(item, 0) + score * rate
            weights[item] = weights.get(item, 0) + score
    # Normalise by the total similarity; every stored weight is > 0 because
    # only positive scores are accumulated. float() keeps the division true
    # even when a custom similarity returns ints under Python 2.
    rankings = [(total / float(weights[item]), item)
                for item, total in weighted.items()]
    rankings.sort(reverse=True)
    return rankings[0:n]
def itemBased(map, item, n=5, similarity=None):
    """Return the entries of *map* most similar to *item*.

    Args:
        map: Ratings as {key: {rater: rate}}; presumably the item-keyed
            mapping built by genItemBasedMap (verify against callers).
        item: Key in *map* to compare every other key against.
        n: Maximum number of results to return.
        similarity: Callable taking two equal-length rating lists and
            returning a score; None (the default) selects pearson.

    Returns:
        Up to *n* (score, key) tuples, highest score first.
    """
    if similarity is None:
        # Resolved at call time: pearson is defined further down the
        # module, so naming it in the signature fails at definition time.
        similarity = pearson
    scores = [(distance(map, item, other, similarity), other)
              for other in map if other != item]
    scores.sort(reverse=True)
    return scores[0:n]
def distance(map, p1, p2, similarity=None):
    """Score how alike two entries of *map* are over their shared keys.

    Only keys present in both map[p1] and map[p2] are compared. Their
    values are collected into two parallel lists, ordered as the keys
    appear in map[p1], and handed to *similarity*.

    Args:
        map: Nested mapping {key: {inner_key: value}}.
        p1: First key in *map*.
        p2: Second key in *map*.
        similarity: Callable taking the two value lists and returning a
            score; None (the default) selects cosine.

    Returns:
        The value returned by *similarity*, or 0 when the two entries
        share no inner keys.
    """
    if similarity is None:
        # Resolved at call time: cosine is defined further down the
        # module, so naming it in the signature fails at definition time.
        similarity = cosine
    first = map[p1]
    shared = [key for key in first if key in map[p2]]
    if not shared:
        return 0
    second = map[p2]
    v1 = [first[key] for key in shared]
    v2 = [second[key] for key in shared]
    return similarity(v1, v2)
#different similarity functions
def euclidean(v1, v2):
    """Return a similarity score derived from squared Euclidean distance.

    The vectors are compared element by element up to the length of the
    shorter one. The result is 1 / (1 + d), where d is the sum of squared
    differences, so identical vectors score 1.0 and the score falls
    towards 0 as they diverge.

    Args:
        v1: First sequence of numbers.
        v2: Second sequence of numbers.

    Returns:
        A float in (0, 1], or 0 if either vector is empty.
    """
    if min(len(v1), len(v2)) == 0:
        return 0
    squared = sum((a - b) ** 2 for a, b in zip(v1, v2))
    return 1 / float(1 + squared)
def cosine(v1, v2):
    """Return the cosine of the angle between two vectors.

    The vectors are compared element by element up to the length of the
    shorter one; any extra elements of the longer vector are ignored.

    Args:
        v1: First sequence of numbers.
        v2: Second sequence of numbers.

    Returns:
        dot(v1, v2) / (|v1| * |v2|), or 0 if either vector is empty or
        has zero magnitude.
    """
    if min(len(v1), len(v2)) == 0:
        return 0
    dot = 0      # dot product
    norm1 = 0    # squared magnitude of v1
    norm2 = 0    # squared magnitude of v2
    for a, b in zip(v1, v2):
        dot += a * b
        norm1 += a * a
        norm2 += b * b
    if norm1 == 0 or norm2 == 0:
        return 0  # the angle is undefined for a zero vector
    return dot / (sqrt(norm1) * sqrt(norm2))
def pearson(v1, v2):
    """Return the Pearson correlation coefficient of two vectors.

    The vectors are compared element by element up to the length of the
    shorter one. Covariance and variances are the population forms, i.e.
    divided by the number of compared elements.

    Args:
        v1: First sequence of numbers.
        v2: Second sequence of numbers.

    Returns:
        cov(v1, v2) / sqrt(var(v1) * var(v2)), or 0 if either vector is
        empty or has zero variance.
    """
    length = min(len(v1), len(v2))
    if length == 0:
        return 0
    pairs = list(zip(v1, v2))
    count = float(length)

    # Means of v1 and v2 over the compared elements.
    mean1 = 0
    mean2 = 0
    for a, b in pairs:
        mean1 += a
        mean2 += b
    mean1 /= count
    mean2 /= count

    cov = 0   # covariance of v1 and v2
    var1 = 0  # variance of v1
    var2 = 0  # variance of v2
    for a, b in pairs:
        diff1 = a - mean1
        diff2 = b - mean2
        cov += diff1 * diff2
        var1 += diff1 * diff1
        var2 += diff2 * diff2
    cov /= count
    var1 /= count
    var2 /= count

    if var1 == 0 or var2 == 0:
        return 0  # correlation is undefined for a constant vector
    return cov / sqrt(var1 * var2)
# Recommendation algorithms (userBased, itemBased)
# Latest recommended article published on 2022-10-24 15:28:01