网上似乎没有只用 numpy 实现 single/complete linkage 层次聚类的轮子,于是根据作业需求造了一个。
虽然都是层次聚类,但是基于 single/complete linkage 的和基于 average linkage 的着实不太一样。首先从直观角度来讲,后者每一次合并后都得重新算一次新的簇的中心结点是什么,复杂度非常高。而前者只用初始叶结点之间的距离作为最终所有簇间距的评估标准。
现在来仔细思考一下,如何去设计这个合并簇、建立树的过程。首先计算两两叶结点之间的欧氏(Euclidean)距离(代码中写作 euler_distance),这个复杂度看似大,实际上只是一次性的 $O(n^2)$ 开销(共 $n(n-1)/2$ 对)。因为对于之后的每一次聚类,用到的 linkage 都会从这里面去挑选,并且一定是从小到大(或者从大到小)。
# Compute the distance of every pair of leaf nodes (i < j) and store it in
# a dictionary keyed by the pair of leaf indices.
for i in range(nodes_len-1):
for j in range(i+1,nodes_len):
d_key = (nodes[i].id[0],nodes[j].id[0])
# A leaf's id is an (index, point) pair: id[0] is the index, id[1] the point.
distance_list[d_key] = euler_distance(nodes[i].id[1],nodes[j].id[1])
# Rank the (pair, distance) items by ascending distance.
rank_list = sorted(distance_list.items(),key = lambda item:item[1])
而选择的标准是:如果这个 linkage 连接的是两个不同的簇,就被选中,否则就继续选下一个。而判断两个叶结点是否在同一个簇,仅需判断它们的根结点是不是同一个结点即可。因此:
# Index of the next candidate pair in rank_list (pairs sorted by distance).
loop_times = 0
# 91 is hard-coded: the number of leaf pairs for 14 points (14 * 13 / 2).
while loop_times < 91:
# Stop once the tree holds self.k nodes in total.
# 14(leaf nodes) + 13(no-leaf nodes) = 27
if len(nodes)>=self.k:
break
nodes_id1,nodes_id2 = rank_list[loop_times][0]
nodes1,nodes2 = nodes[nodes_id1],nodes[nodes_id2]
# Walk up the father links to find the root (current cluster) of each leaf.
nodeptr1 = nodes1
nodeptr2 = nodes2
while nodeptr1.father!=None:
nodeptr1 = nodeptr1.father
while nodeptr2.father!=None:
nodeptr2 = nodeptr2.father
# If both leaves share the same root they are already in the same
# cluster: skip this pair without merging.
if nodeptr1==nodeptr2:
loop_times+=1
continue
# Merge: the two roots become the children of a new node whose distance
# is the distance of the current pair.
new_node = ClusterNode(left=nodeptr1,right=nodeptr2,distance=rank_list[loop_times][1],count=nodeptr1.count+nodeptr2.count,id=newnode_id_num)
newnode_id_num +=1
nodeptr1.father = new_node
nodeptr2.father = new_node
# Report the distance at which this merge happened.
print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
loop_times +=1
nodes.append(new_node)
当然代码中有一些特殊数值,如91(14个结点两两配对的数量,即 14×13/2),是根据结点的数量手工计算出来的,当然也可以让程序自行计算。完整代码及14个结点的建树过程如下:
# -*- coding: utf-8 -*
import numpy as np
import math
# Euclidean distance between two points.
def euler_distance(a, b):
    """Return the Euclidean distance between the points ``a`` and ``b``.

    Both arguments may be any array-like of the same shape (NumPy arrays,
    lists, tuples).  They are converted to float arrays before subtracting,
    so unsigned or narrow integer dtypes cannot wrap around or overflow.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))
# One node of the cluster tree.
class ClusterNode(object):
    """Plain record for a node of the cluster tree.

    Attributes:
        left, right: child nodes (both ``None`` by default).
        distance: distance value stored with the node (``-1`` by default).
        count: count stored with the node (``1`` by default).
        id: identifier supplied by the caller.
        father: parent node, ``None`` by default.
    """

    def __init__(self, left=None, right=None, distance=-1, count=1,
                 id=None, father=None):
        # Links to the neighbouring nodes of the tree.
        self.father = father
        self.left, self.right = left, right
        # Data carried by the node itself.
        self.id = id
        self.count = count
        self.distance = distance
class Hierarchical(object):
    """Agglomerative hierarchical clustering with single linkage.

    The distances between all pairs of input points are computed once and
    visited in ascending order.  A pair triggers a merge only when its two
    points currently sit in different clusters, i.e. under different roots
    of the tree built so far.

    ``k`` is the stop point, expressed as the total number of tree nodes
    (leaves plus merged nodes): merging stops as soon as that many nodes
    exist.  For ``n`` input points, ``k = 2 * n - c`` stops with ``c``
    clusters left, and any ``k <= n`` performs no merge at all.
    """

    def __init__(self, k=1):
        assert k > 0
        self.k = k
        self.labels = None

    def train(self, x):
        """Build the cluster tree for ``x`` and fill ``self.labels``.

        ``x`` holds one point per row.  After the call ``self.nodes`` lists
        the leaves (in input order) followed by the merged nodes in creation
        order, and ``self.labels[i]`` is the cluster label of row ``i``.
        """
        # A leaf's id is the (row index, row) pair produced by enumerate.
        nodes = [ClusterNode(id=leaf) for leaf in enumerate(x)]
        nodes_len = len(nodes)
        self.labels = [-1] * nodes_len

        # Distance of every leaf pair (i < j), ranked in ascending order.
        # The sort is stable, so ties keep their (i, j) generation order.
        rank_list = []
        for i in range(nodes_len - 1):
            for j in range(i + 1, nodes_len):
                distance = euler_distance(nodes[i].id[1], nodes[j].id[1])
                rank_list.append(((i, j), distance))
        rank_list.sort(key=lambda item: item[1])

        # Merged nodes are numbered right after the leaf indices, and every
        # ranked pair is a candidate; both are derived from the data rather
        # than hard-coded for one particular data set size.
        newnode_id_num = nodes_len
        for loop_times, ((first, second), distance) in enumerate(rank_list):
            if len(nodes) >= self.k:
                break
            root1 = self._find_root(nodes[first])
            root2 = self._find_root(nodes[second])
            # Same root: the two points already share a cluster.
            if root1 is root2:
                continue
            new_node = ClusterNode(left=root1, right=root2, distance=distance,
                                   count=root1.count + root2.count,
                                   id=newnode_id_num)
            newnode_id_num += 1
            root1.father = new_node
            root2.father = new_node
            print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
            nodes.append(new_node)

        self.nodes = nodes
        self.Label()

    @staticmethod
    def _find_root(node):
        """Follow the ``father`` links up to the root of ``node``'s cluster."""
        while node.father is not None:
            node = node.father
        return node

    def Label(self):
        """Give every leaf the label of its cluster.

        Clusters are the roots of the forest.  They are visited from the most
        recently created node backwards and numbered 0, 1, 2, ...  Only roots
        consume a label, so the labels are consecutive.
        """
        label = 0
        for node in reversed(self.nodes):
            # A non-root node is labelled through its root.
            if node.father is not None:
                continue
            self.leaf_traversal(node, label)
            label += 1

    def leaf_traversal(self, node, label):
        """Assign ``label`` to every still unlabelled leaf below ``node``.

        Uses an explicit stack instead of recursion, so deep chain-shaped
        trees cannot hit the interpreter's recursion limit.
        """
        stack = [node]
        while stack:
            current = stack.pop()
            if current.left is None and current.right is None:
                index = current.id[0]
                if self.labels[index] == -1:
                    self.labels[index] = label
                continue
            if current.right is not None:
                stack.append(current.right)
            if current.left is not None:
                stack.append(current.left)
def loadDataSet(fileName):
    """Load a whitespace-separated numeric table from ``fileName``.

    On every non-blank line the last column is the target value and all
    preceding columns are features.  Blank lines are skipped.

    Returns:
        ``(xArr, yArr)``: ``xArr`` is a list of feature rows (lists of
        floats) and ``yArr`` the list of target values as floats.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    xArr = []
    yArr = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as data_file:
        for line in data_file:
            curLine = line.strip().split()
            if not curLine:
                continue
            xArr.append([float(value) for value in curLine[:-1]])
            yArr.append(float(curLine[-1]))
    return xArr, yArr
if __name__ == "__main__":
    # Raw string: the backslashes of the Windows path stay literal.  Without
    # it, "\u" is an invalid unicode escape and a SyntaxError on Python 3.
    train_x, train_y = loadDataSet(r'D:\untitled\Hierarchical.txt')
    # k is the total number of tree nodes at which merging stops.  For 14
    # points, 27 = 14 leaves + 13 merges builds the whole tree (one cluster);
    # 26 would stop with two clusters left.
    Hierarchy = Hierarchical(k=27)
    samples = np.array(train_x)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(samples.shape[0])
    Hierarchy.train(samples)
    print(np.array(Hierarchy.labels))
    print(train_y)
我设定为合并到还剩两类时结束,结果为:
('In loop ', 0, 'The single linkage is: ', 1.0)
('In loop ', 1, 'The single linkage is: ', 3.605551275463989)
('In loop ', 2, 'The single linkage is: ', 3.605551275463989)
('In loop ', 3, 'The single linkage is: ', 5.0990195135927845)
('In loop ', 4, 'The single linkage is: ', 5.385164807134504)
('In loop ', 5, 'The single linkage is: ', 5.916079783099616)
('In loop ', 6, 'The single linkage is: ', 6.4031242374328485)
('In loop ', 8, 'The single linkage is: ', 7.810249675906654)
('In loop ', 9, 'The single linkage is: ', 8.54400374531753)
('In loop ', 13, 'The single linkage is: ', 8.831760866327848)
('In loop ', 15, 'The single linkage is: ', 9.486832980505138)
('In loop ', 18, 'The single linkage is: ', 10.392304845413264)
与真实结果比较:
[0 1 0 0 0 1 0 0 0 0 0 0 0 0]
[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
发现误差还真是蛮大的,14个点的特征捕捉不准确,真就分不出来嗷。