《集体智慧编程》中介绍了如何计算数据之间的相似性,主要有两种度量方式:欧几里德距离评价和皮尔逊相关度。
欧几里德距离评价用几何上求距离的方法计算:先对各属性的差值求平方和,再开方得到距离;下面的函数返回 1/(1+距离),因此结果落在 (0,1] 区间内,数值越大表示两条数据越相似。
def sim_distance(data_dict, id1, id2, interst_type):
    """Return a Euclidean-distance based similarity score for two records.

    data_dict    -- nested mapping of the form {record_id: {attribute: number}}
    id1, id2     -- keys of the two records to compare
    interst_type -- iterable of attribute names to take into account

    Only attributes present in both records are used.  The result is
    1 / (1 + distance), i.e. 1.0 for identical records and closer to 0 the
    further apart they are.  0.0 is returned when either id is unknown or
    when the records share none of the requested attributes.
    """
    if not (id1 in data_dict and id2 in data_dict):
        return 0.0

    first, second = data_dict[id1], data_dict[id2]

    # A dict is used as an ordered, de-duplicating container of the
    # attribute names that both records provide.
    common = dict.fromkeys(
        key for key in interst_type if key in first and key in second
    )
    if not common:
        return 0.0

    squared_total = 0
    for key in common:
        squared_total += (first[key] - second[key]) ** 2

    return 1/(1+sqrt(squared_total))
其中 interst_type 是数据属性中需要参与比较的属性名列表;data_dict 是一个嵌套字典,形如 {id: {属性名: 数值}}。
皮尔逊相关度通过衡量两组数据与一条直线的拟合程度来判断它们的相关性,与欧几里德距离评价相比,更适合数据不是很规范(例如没有归一化)的情况。
def sim_pearson(data_dict, id1, id2, interst_type):
    """Return the Pearson correlation coefficient of two records.

    data_dict    -- nested mapping of the form {record_id: {attribute: number}}
    id1, id2     -- keys of the two records to compare
    interst_type -- iterable of attribute names to take into account

    Only attributes present in both records are used.  The result lies in
    [-1, 1] (up to floating-point rounding).  0.0 is returned when either id
    is unknown, when the records share none of the requested attributes, or
    when one of the two value series has no variance (the coefficient is
    undefined in that case).
    """
    if id1 not in data_dict or id2 not in data_dict:
        return 0.0

    first, second = data_dict[id1], data_dict[id2]

    # De-duplicated attribute names that both records provide.
    shared = dict.fromkeys(
        item for item in interst_type if item in first and item in second
    )
    if not shared:
        return 0.0

    # Use a float count so every division below is a true division; with an
    # int count and int values Python 2 would floor the intermediate results
    # and return a wrong coefficient.
    n = float(len(shared))

    xs = [first[item] for item in shared]
    ys = [second[item] for item in shared]

    sum1 = sum(xs)
    sum2 = sum(ys)
    sum1_sq = sum(x * x for x in xs)
    sum2_sq = sum(y * y for y in ys)
    p_sum = sum(x * y for x, y in zip(xs, ys))

    num = p_sum - sum1 * sum2 / n
    # Product of the two (unnormalised) variances.  Mathematically >= 0, but
    # rounding can push it slightly below zero for (nearly) constant series,
    # which would make sqrt() raise; treat that the same as "no variance".
    var_product = (sum1_sq - sum1 * sum1 / n) * (sum2_sq - sum2 * sum2 / n)
    if var_product <= 0:
        return 0.0
    return num / sqrt(var_product)
一个例子:
class data(object):
    """In-memory store of records plus helpers to compare them.

    Records are dicts keyed by attribute name.  `insert` converts the
    rank-like attributes (letters) and the numeric attributes (strings) to
    ints so that the similarity functions can do arithmetic on them.
    """

    def __init__(self):
        # id -> normalised record
        self.__data_dic = {}
        # Attributes stored as numeric strings; converted with int().
        self.__int_type = ['LVMAX4_HP','LVMAX4_ATK']
        # Attributes stored as rank letters; converted through the table below.
        self.__mesh_type = ['GLUTEN','DURABLE','AGILE','MAGIC','LUCKY','TREASURE']
        # Union of the two lists above (not used by the methods of this class).
        self.__compare_type = ['LVMAX4_HP','LVMAX4_ATK','GLUTEN','DURABLE','AGILE','MAGIC','LUCKY','TREASURE']
        # Rank -> score.  The table used to list the key 'E' twice (6 first,
        # 1 last), so the 6 was silently discarded by the dict literal.
        # NOTE(review): the leading 6 presumably stood for the "EX" rank,
        # which shares its first letter with "E" -- confirm against the data.
        # The meaning of the '\\' entry is not evident from this file.
        self.__artribute_mesh = {'EX':6,'A':5,'B':4,'C':3,'D':2,'E':1,'\\':6}

    def _rank_value(self, rank):
        """Translate a rank string such as 'A', 'B+' or 'EX' to its score."""
        table = self.__artribute_mesh
        # 'EX' has to be recognised before falling back to the first
        # character, otherwise it would be scored like a plain 'E'.
        if rank[:2] == 'EX':
            return table['EX']
        return table[rank[0]]

    def catch_datas(self,url):
        """Download `url`, extract the embedded dict and store it.

        Relies on `get_html`, `data_pattern` and `sp_pattern`, which are
        defined outside this class.  Nothing is stored when the page does
        not match `data_pattern`.
        """
        import ast

        m = data_pattern.search(get_html(url))
        if not m:
            return
        res, _count = sp_pattern.subn('\\\"',m.group(1))
        print(res)
        # The text comes from the network, so it must not be eval()'d;
        # literal_eval only accepts Python literals and returns the value.
        # NOTE(review): assumes the captured text is a bare dict literal --
        # confirm against the real page content.
        di = ast.literal_eval(res)
        # Go through insert() so the record is normalised like the others.
        self.insert(int(di['ID']), di)

    def insert(self,id,item):
        """Normalise `item` in place and store it under `id`.

        Raises KeyError if `item` lacks one of the expected attributes or
        carries a rank letter that is not in the rank table.
        """
        for m in self.__mesh_type:
            item[m] = self._rank_value(item[m])
        for m in self.__int_type:
            item[m] = int(item[m])
        self.__data_dic[id] = item

    def cacu_dis(self,i,j,func):
        """Return func(records, i, j, rank_attributes) for records i and j.

        Only the rank attributes are passed as the attributes to compare.
        """
        return func(self.__data_dic,i,j,self.__mesh_type)

    def print_name(self):
        """Print the 'NAME' attribute of every stored record."""
        for i in self.__data_dic:
            s = self.__data_dic[i]['NAME']
            print(s)
这部分尚存在问题:catch_datas 接受一个网址,该网页中含有一个字典数据,但使用 eval 执行这段数据时会发生错误,因此目前只能手动把数据复制到代码中进行测试。
...# data section omitted (long); presumably it defines `ob`, `it1` and `it2` used below
# Store the two hand-copied records under ids 1 and 2.
ob.insert(1,it1)
ob.insert(2,it2)
ob.catch_datas('http://fgowiki.com/guide/petdetail/5')# known to fail
# Print the similarity of records 1 and 2 under each of the two measures.
print ob.cacu_dis(1,2,sim_distance)
print ob.cacu_dis(1,2,sim_pearson)
参考资料:
《集体智慧编程》
python的编码问题:http://www.cnblogs.com/fnng/p/5008884.html