# Tanimoto coefficient distance, intended for vectors of 1s and 0s.
def tanamoto(v1, v2):
    """Return the Tanimoto distance between two presence vectors.

    An entry counts as "present" when it is not equal to 0.  The result is
    ``1 - shared / (in_v1 + in_v2 - shared)``: 0.0 when both vectors mark
    exactly the same positions, 1.0 when they have no position in common.

    Args:
        v1: Indexable sequence of numbers; its length drives the iteration.
        v2: Indexable sequence of numbers, at least as long as ``v1``.

    Returns:
        The distance as a float.  When neither vector has any non-zero
        entry the ratio is undefined; 0.0 is returned in that case because
        the two vectors are indistinguishable.

    Raises:
        IndexError: If ``v2`` is shorter than ``v1``.
    """
    c1, c2, shr = 0, 0, 0
    for i, a in enumerate(v1):
        b = v2[i]
        if a != 0:
            c1 += 1  # present in v1
        if b != 0:
            c2 += 1  # present in v2
        if a != 0 and b != 0:
            shr += 1  # present in both
    union = c1 + c2 - shr
    if union == 0:
        # Both vectors are all zeros: avoid dividing by zero.
        return 0.0
    return 1.0 - (float(shr) / union)
#wants,people,data=readfile('zebo.txt')
#clust=hcluster(data,distance=tanamoto)
#drawdendrogram(clust,wants,jpeg='zeboclusters.jpg')
def scaledown(data, distance=pearson, rate=0.01):
    """Lay out the items of ``data`` in 2-D so point distances mimic ``distance``.

    Starts from random coordinates and repeatedly nudges every point towards
    or away from every other point, in proportion to the error between the
    current 2-D distance and the target distance.  Stops after 1000 passes,
    or earlier as soon as the total error is worse than in the previous pass.

    Args:
        data: Sequence of items; items are only passed to ``distance``.
        distance: Callable taking two items and returning their target
            distance.
        rate: Step size multiplied with the gradient on every pass.

    Returns:
        A list with one ``[x, y]`` coordinate pair per item of ``data``.
    """
    n = len(data)
    # Target distance between every pair of items (n x n matrix).
    realdist = [[distance(data[i], data[j]) for j in range(n)]
                for i in range(n)]
    # Random starting position for every item.
    loc = [[random.random(), random.random()] for _ in range(n)]
    # Current distances between the projected points (n x n matrix).
    fakedist = [[0.0 for _ in range(n)] for _ in range(n)]

    lasterror = None
    for _ in range(1000):
        # Euclidean distance between projected points.  The matrix is
        # symmetric, so each pair is computed once and mirrored.
        for i in range(n):
            for j in range(i + 1, n):
                d = sqrt(sum(pow(loc[i][x] - loc[j][x], 2)
                             for x in range(len(loc[i]))))
                fakedist[i][j] = d
                fakedist[j][i] = d

        # Accumulate how far each point should move.
        grad = [[0.0, 0.0] for _ in range(n)]
        totalerror = 0
        for k in range(n):
            for j in range(n):
                if j == k:
                    continue
                fake = fakedist[j][k]
                real = realdist[j][k]
                if real != 0:
                    # Relative error: (projected - target) / target.
                    errorterm = (fake - real) / real
                else:
                    # A zero target makes the relative error undefined;
                    # fall back to the absolute error instead of dividing
                    # by zero.
                    errorterm = fake - real
                if fake != 0:
                    # Move along the unit vector between the two points,
                    # scaled by the error.
                    grad[k][0] += ((loc[k][0] - loc[j][0]) / fake) * errorterm
                    grad[k][1] += ((loc[k][1] - loc[j][1]) / fake) * errorterm
                # else: the points coincide, so there is no direction to
                # move along; only the error is recorded.
                totalerror += abs(errorterm)

        # Stop as soon as the layout got worse than in the previous pass.
        if lasterror is not None and lasterror < totalerror:
            break
        lasterror = totalerror

        # Move every point against its gradient, scaled by rate.
        for k in range(n):
            loc[k][0] -= rate * grad[k][0]
            loc[k][1] -= rate * grad[k][1]
    return loc
def draw2d(data, labels, jpeg='mds2d.jpg'):
    """Render labelled 2-D points onto a white 2000x2000 image and save it.

    Args:
        data: Sequence of points; element 0 and 1 of each point are used as
            x and y.  Each coordinate is shifted by 0.5 and scaled by 1000
            to obtain the pixel position.
        labels: Sequence indexed in step with ``data``; ``labels[i]`` is the
            text drawn at point ``i``.
        jpeg: Path of the JPEG file to write.
    """
    canvas = Image.new('RGB', (2000, 2000), (255, 255, 255))
    pen = ImageDraw.Draw(canvas)
    for idx, point in enumerate(data):
        px = (point[0] + 0.5) * 1000
        py = (point[1] + 0.5) * 1000
        # Black text on the white background.
        pen.text((px, py), labels[idx], (0, 0, 0))
    canvas.save(jpeg, 'JPEG')
# Project `data` into 2-D and draw the result, labelled with `blognames`.
coords = scaledown(data)
draw2d(coords, blognames, jpeg="blogs2d.jpg")
Python集体智慧编程之聚类算法2
于 2024-09-14 01:20:24 首次发布

被折叠的 条评论
为什么被折叠?



