题目描述:
某电影院收集了N个用户对M个电影的观影记录。每个用户一行,第i行的记录形式为:"<用户名ID>\t<电影1>,<电影2>,....."
已知某用户的观影记录为:84,14,90,91,34,76,43,67,36,47,58,24,43
找出与该用户最匹配的前5名用户。
N个用户的观影记录:
123 44,12
124 30,44
125 28,30
126 84,14,9,23,33,55
127 24,43,67,76,34,42,99,44,55,66
128 91,84,90,76,12,13,15,18
129 34,47,58,74,66,24,43,47
130 90,91,23
131 36,58,47,33,97,92
132 54,76,65,25,46,76,24
133 54,56,74
134 45,65,76,87,98,93
135 86,53,32,85,54,76
136 4,55,23,48,63,85,54
137 88,33,11,55,65,31
python代码实现:
#!/usr/bin/env python
# -*- coding: cp936 -*-
import math
def calc_similar(a1,a2):
    """Return the similarity score between two film lists.

    The score is ``common / sqrt(len(a1) * len(a2))``, where ``common`` is
    the count returned by ``calc_same(a1, a2)``.  For duplicate-free lists
    this equals the cosine similarity of the two users' binary
    "has watched" vectors.

    Args:
        a1: film IDs of the first user, sorted ascending (calc_same
            relies on that ordering).
        a2: film IDs of the second user, sorted ascending.

    Returns:
        A float score; 0.0 when either list is empty, because an empty
        record shares nothing with anyone and the formula would otherwise
        divide by zero.
    """
    if len(a1)==0 or len(a2)==0:
        return 0.0
    return float(calc_same(a1,a2))/math.sqrt(len(a1)*len(a2))
def calc_same(a1,a2):
    """Count the matching entries of two ascending-sorted lists.

    Both lists are walked in step with one cursor each: the cursor that
    points at the smaller value is advanced, and when the two values are
    neither greater nor smaller than each other a match is recorded and
    both cursors advance.  Each position is used for at most one match,
    so a value repeated in both lists can be counted more than once.

    Args:
        a1: first list, sorted ascending.
        a2: second list, sorted ascending.

    Returns:
        The number of matched pairs as an int.
    """
    end1, end2 = len(a1), len(a2)
    pos1, pos2 = 0, 0
    matches = 0
    while pos1 < end1 and pos2 < end2:
        left, right = a1[pos1], a2[pos2]
        if left > right:
            # a2 is behind: move its cursor forward.
            pos2 += 1
            continue
        if left < right:
            # a1 is behind: move its cursor forward.
            pos1 += 1
            continue
        # Neither is larger: record a match and consume both entries.
        matches += 1
        pos1 += 1
        pos2 += 1
    return matches
if __name__=="__main__":
    # Record layout used for every user:
    #   [0] user ID, [1] films shared with the target user,
    #   [2] similarity to the target user, [3] number of films watched,
    #   [4] sorted list of film IDs.
    # For the target user only slot 4 is read below.
    first_user=[1,0,0.0,0,[84,14,90,91,34,76,43,67,36,47,58,24,43]]
    # calc_same walks both lists in step, so they must be in ascending order.
    first_user[4].sort()
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even when a line fails to parse.
    with open('F:\\film.txt') as fp:
        for line in fp:
            line=line.strip()
            if not line:
                # Skip blank lines (e.g. an empty line at the end of the file).
                continue
            # Each record is "<user ID>\t<film1>,<film2>,...".
            d=line.split('\t')
            user=int(d[0])
            # sorted() accepts a list or an iterator, so this works whether
            # map() returns a list or a lazy object.
            films=sorted(map(int,d[1].split(',')))
            test_user=[
                user, #0: user ID
                calc_same(first_user[4],films), #1: number of films shared with the target user
                calc_similar(first_user[4],films), #2: similarity to the target user
                len(films), #3: number of films this user watched
                films, #4: this user's film list, sorted ascending
            ]
            # Parenthesised form prints the same text under the print
            # statement and the print function.
            print(test_user)
结果如下:
[123, 0, 0.0, 2, [12, 44]]
[124, 0, 0.0, 2, [30, 44]]
[125, 0, 0.0, 2, [28, 30]]
[126, 2, 0.22645540682891913, 6, [9, 14, 23, 33, 55, 84]]
[127, 5, 0.43852900965351466, 10, [24, 34, 42, 43, 44, 55, 66, 67, 76, 99]]
[128, 4, 0.3922322702763681, 8, [12, 13, 15, 18, 76, 84, 90, 91]]
[129, 5, 0.4902903378454601, 8, [24, 34, 43, 47, 47, 58, 66, 74]]
[130, 2, 0.32025630761017426, 3, [23, 90, 91]]
[131, 3, 0.3396831102433787, 6, [33, 36, 47, 58, 92, 97]]
[132, 2, 0.20965696734438366, 7, [24, 25, 46, 54, 65, 76, 76]]
[133, 0, 0.0, 3, [54, 56, 74]]
[134, 1, 0.11322770341445956, 6, [45, 65, 76, 87, 93, 98]]
[135, 1, 0.11322770341445956, 6, [32, 53, 54, 76, 85, 86]]
[136, 0, 0.0, 7, [4, 23, 48, 54, 55, 63, 85]]
[137, 0, 0.0, 6, [11, 31, 33, 55, 65, 88]]
求相似度的其他公式: