也就是说, Tanimoto 相关度适合处理 0/1 关系(即"是/否"关系), 比如是否看过某部电影; 而皮尔逊相关度适合处理用程度来衡量的数据, 比如为某部电影打分, 分数体现的就是程度
其中用到的数据来自《集体智慧编程》的作者
只是把距离函数改掉了:
#coding:utf-8
import os
import sys
import chardet
from math import sqrt
from PIL import Image, ImageDraw
import random
def readFile(fileName):
    """Read a tab-separated matrix file into row names, column names and data.

    The first line is the header: its first cell is ignored and the remaining
    cells become the column names. Every following line is one row: the first
    cell is the row name and the remaining cells are parsed as floats.

    Each line is stripped of leading/trailing whitespace before it is split
    on tabs.

    Args:
        fileName: path of the file to read.

    Returns:
        A ``(rowNames, colNames, data)`` tuple where ``data`` is a list of
        lists of floats, one inner list per row.

    Raises:
        IndexError: if the file is empty (there is no header line).
        ValueError: if a data cell cannot be converted to float.
    """
    # `open` inside a `with` block replaces the Python-2-only `file()` builtin
    # and guarantees the handle is closed even if parsing fails later.
    with open(fileName) as fh:
        lines = fh.readlines()

    colNames = lines[0].strip().split('\t')[1:]
    rowNames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        rowNames.append(p[0])
        data.append([float(x) for x in p[1:]])
    return rowNames, colNames, data
def pearsonBeta(v1, v2):
    """Return a Pearson-correlation-based distance between two vectors.

    The Pearson coefficient ``r`` is computed with the sum-based formula and
    the function returns ``1.0 - r``, so perfectly correlated vectors give
    0.0 and perfectly anti-correlated vectors give 2.0.

    Args:
        v1: sequence of numbers.
        v2: sequence of numbers; expected to be the same length as ``v1``.

    Returns:
        ``1.0 - r``, or ``0`` when the correlation is undefined because at
        least one vector has no variance.

    Raises:
        ZeroDivisionError: if ``v1`` is empty.
        IndexError: if ``v2`` is shorter than ``v1``.
    """
    # Use float lengths so the divisions below never truncate, even for
    # integer vectors under Python 2 division rules.
    n1 = float(len(v1))
    n2 = float(len(v2))

    sum1 = sum(v1)
    sum2 = sum(v2)
    sum1Sq = sum(v * v for v in v1)
    sum2Sq = sum(v * v for v in v2)
    # Index v2 by v1's positions (rather than zip) so a too-short v2 still
    # raises IndexError instead of being silently truncated.
    pSum = sum(a * v2[i] for i, a in enumerate(v1))

    nums = pSum - sum1 * sum2 / n1
    var1 = sum1Sq - sum1 * sum1 / n1
    var2 = sum2Sq - sum2 * sum2 / n2

    # Mathematically these terms are never negative; a negative value is
    # floating-point rounding on a (near-)constant vector. Treat it like zero
    # variance rather than letting sqrt() raise or return a garbage ratio.
    if var1 <= 0 or var2 <= 0:
        return 0

    den = sqrt(var1 * var2)
    # The product of two tiny positive terms can still underflow to zero.
    if den == 0:
        return 0
    return 1.0 - nums / den
#距离函数
def pearson(v1, v2):
sum1 = sum(v1)
sum2 = sum(v2)
eSum1 = sum1 / len(v1)
eSum2 = sum2 / len(v2)
pSum = sum([(v1[i] - eSum1) * (v2[i] - eSum2) for i in range(len(v1))])
pTmp1 = sqrt(sum([pow(v1[i] -eSum