Hierarchical clustering (层次聚类,single/complete linkage)

网上似乎找不到只用 numpy 实现 single/complete linkage 层次聚类的轮子,于是根据作业需求造了一个。

虽然都是层次聚类,但是基于 single/complete linkage 的和基于 average linkage 的着实不太一样。首先从直观角度来讲,后者每一次合并后都得重新算一次新的簇的中心结点是什么,复杂度非常高。而前者只用初始叶结点之间的距离作为所有簇间距离的评估标准。

现在来仔细思考一下这个内容:如何设计合并簇、建立树的过程呢?首先计算两两叶结点之间的欧氏(Euclidean)距离,这一步的复杂度看似很大,实际上只是 $O(n^{2})$。因为之后每一次聚类用到的 linkage 都会从这些距离里挑选,并且一定是按从小到大(或者从大到小)的顺序。

# Compute the distance between every pair of leaf nodes and rank the pairs
# by ascending distance. Each leaf's id is indexed twice here: id[0] forms
# the dictionary key and id[1] is what gets passed to euler_distance.
        for i in range(nodes_len-1):
            for j in range(i+1,nodes_len):
                d_key = (nodes[i].id[0],nodes[j].id[0])
                # print nodes[i].id[1]
                distance_list[d_key] = euler_distance(nodes[i].id[1],nodes[j].id[1])
                # NOTE(review): this sort sits inside the inner loop, so the
                # whole dictionary is re-sorted after every insertion.
                rank_list = sorted(distance_list.items(),key = lambda item:item[1])

选择的标准是:如果这个 linkage 连接的是两个不同的簇,就选中它并合并;否则继续看下一个。而判断两个叶结点是否已经在同一个簇里,只需判断它们的根结点是不是同一个结点即可。因此合并循环如下:

# Walk the ranked pair list in order; whenever the two leaves of a pair
# currently sit under different roots, join those roots under a new node.
loop_times = 0
        # NOTE(review): 91 is hard-coded; the surrounding text says it was
        # computed from the number of nodes in the sample data.
        while loop_times < 91:
            # 14(leaf nodes) + 13(no-leaf nodes) = 27
            # Stop once the node list (leaves plus merged nodes) has k entries.
            if len(nodes)>=self.k:
                break
            nodes_id1,nodes_id2 = rank_list[loop_times][0]
            nodes1,nodes2 = nodes[nodes_id1],nodes[nodes_id2]
            # Follow the father links up to the root of each leaf's cluster.
            nodeptr1 = nodes1
            nodeptr2 = nodes2
            while nodeptr1.father!=None:
                nodeptr1 = nodeptr1.father
            while nodeptr2.father!=None:
                nodeptr2 = nodeptr2.father
            # If both leaves share the same root they are already in the
            # same cluster, so skip this pair without merging.
            if nodeptr1==nodeptr2:
                loop_times+=1
                continue
            # Merge: the new node adopts both roots and records the pair
            # distance and the combined count.

            new_node = ClusterNode(left=nodeptr1,right=nodeptr2,distance=rank_list[loop_times][1],count=nodeptr1.count+nodeptr2.count,id=newnode_id_num)
            newnode_id_num +=1
            nodeptr1.father = new_node
            nodeptr2.father = new_node
            # print new_node.distance
            print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
            loop_times +=1
            nodes.append(new_node)

当然,代码中有一些特殊数值(如 91)是根据结点数量手算出来的:91 = 14×13/2,即 14 个叶结点两两配对的总数;这些数值完全可以让程序自己计算。完整代码(14 个结点的建树过程)如下:

# -*- coding: utf-8 -*
import numpy as np
import math

# calculate the euler disctance with two array
def euler_distance(a, b):
    """Return the Euclidean distance between two equally shaped vectors.

    Both inputs are converted to float64 arrays before subtracting, so
    plain Python sequences are accepted and unsigned or narrow integer
    arrays cannot wrap around in the difference or in the squares.

    Args:
        a: Array-like of numbers.
        b: Array-like of numbers, broadcast-compatible with ``a``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    diff = np.asarray(a, dtype=np.float64) - np.asarray(b, dtype=np.float64)
    return np.sqrt(np.sum(np.square(diff)))

# define the cluster class
class ClusterNode(object):
    """One node of the binary merge tree produced by the clustering.

    Attributes are plain public fields:
        left, right: Child nodes (both ``None`` by default).
        father: Parent node (``None`` by default).
        distance: Distance value stored on the node; defaults to -1.
        count: Counter stored on the node; defaults to 1.
        id: Caller-supplied identifier.
    """

    def __init__(self, left=None, right=None, distance=-1, count=1, id=None, father=None):
        # Identity and bookkeeping values.
        self.id = id
        self.count = count
        self.distance = distance
        # Links to the neighbouring nodes in the tree.
        self.father = father
        self.left, self.right = left, right


class Hierarchical(object):
    """Single-linkage agglomerative clustering on a binary merge tree.

    All pairwise leaf distances are ranked once in ascending order. The
    ranked pairs are then scanned, and whenever the two leaves of a pair
    belong to different trees, the roots of those trees are joined under a
    new ``ClusterNode``.

    Attributes:
        k: Stop threshold on the TOTAL number of tree nodes (leaves plus
            merged nodes), not on the number of clusters. With ``n`` input
            points, ``k = 2 * n - c`` leaves ``c`` clusters; for example
            14 points and ``k=27`` merge everything into one cluster.
        labels: After ``train``, a list with one integer cluster label per
            input point; ``None`` before training.
        nodes: After ``train``, the list of all tree nodes: leaves first,
            then merged nodes in creation order.
    """

    def __init__(self, k=1):
        assert k > 0
        self.k = k
        self.labels = None

    def train(self, x):
        """Build the merge tree for ``x`` and fill ``self.labels``.

        Args:
            x: 2-D array-like of shape (points, features).
        """
        x = np.asarray(x)
        # Each leaf's id is the (row index, row vector) pair from enumerate.
        nodes = [ClusterNode(id=item) for item in enumerate(x)]
        nodes_len = len(nodes)
        points_num, _ = np.shape(x)
        self.labels = [-1] * points_num

        # Distance of every unordered leaf pair, keyed by (index_i, index_j).
        distance_list = {}
        for i in range(nodes_len - 1):
            for j in range(i + 1, nodes_len):
                d_key = (nodes[i].id[0], nodes[j].id[0])
                distance_list[d_key] = euler_distance(nodes[i].id[1], nodes[j].id[1])
        # Rank the pairs once, after all distances are known.
        rank_list = sorted(distance_list.items(), key=lambda item: item[1])

        # Merged nodes are numbered right after the leaf indices.
        newnode_id_num = nodes_len
        for loop_times, ((nodes_id1, nodes_id2), distance) in enumerate(rank_list):
            # Every merge appends exactly one node, so the length of the
            # node list is what gets compared against k.
            if len(nodes) >= self.k:
                break
            root1 = self._find_root(nodes[nodes_id1])
            root2 = self._find_root(nodes[nodes_id2])
            # Same root: the two leaves are already in one cluster.
            if root1 is root2:
                continue
            new_node = ClusterNode(
                left=root1,
                right=root2,
                distance=distance,
                count=root1.count + root2.count,
                id=newnode_id_num,
            )
            newnode_id_num += 1
            root1.father = new_node
            root2.father = new_node
            print('In loop ', loop_times, 'The single linkage is: ', new_node.distance)
            nodes.append(new_node)

        self.nodes = nodes
        self.Label()

    @staticmethod
    def _find_root(node):
        """Follow ``father`` links from ``node`` up to the root of its tree."""
        while node.father is not None:
            node = node.father
        return node

    def Label(self):
        """Assign consecutive labels 0, 1, 2, ... to the trees in ``self.nodes``.

        Roots are visited from the most recently created node backwards.
        Only parentless nodes consume a label, so the label values are
        contiguous.
        """
        label = 0
        for node in reversed(self.nodes):
            if node.father is not None:
                # Not a root: its leaves are labelled through its ancestor.
                continue
            self.leaf_traversal(node, label)
            label += 1

    def leaf_traversal(self, node, label):
        """Give ``label`` to every still-unlabelled leaf below ``node``.

        Uses an explicit stack instead of recursion so that deep,
        chain-shaped trees cannot hit the interpreter's recursion limit.

        Args:
            node: Root of the subtree to walk.
            label: Integer written into ``self.labels`` for each leaf whose
                entry is still -1.
        """
        stack = [node]
        while stack:
            current = stack.pop()
            if current.left is None and current.right is None:
                leaf_index = current.id[0]
                if self.labels[leaf_index] == -1:
                    self.labels[leaf_index] = label
                continue
            if current.right is not None:
                stack.append(current.right)
            if current.left is not None:
                stack.append(current.left)
def loadDataSet(fileName):
    """Load whitespace-separated numeric rows from a text file.

    Every non-blank line is split on whitespace. All fields except the last
    are parsed as floats into one feature row, and the last field is parsed
    as a float target. Blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A ``(xArr, yArr)`` tuple: ``xArr`` is a list of feature rows (lists
        of floats) and ``yArr`` is the list of float targets, in file order.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    xArr = []
    yArr = []
    # The context manager closes the file even if parsing fails part-way.
    with open(fileName) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                continue
            xArr.append([float(value) for value in fields[:-1]])
            yArr.append(float(fields[-1]))
    return xArr, yArr

if __name__ == "__main__":
    # Raw string: keeps the backslashes of the Windows path literal, so
    # "\u" is not read as a unicode escape on Python 3.
    train_x, train_y = loadDataSet(r'D:\untitled\Hierarchical.txt')
    features = np.array(train_x)

    # k is a threshold on the total number of tree nodes; 27 corresponds
    # to 14 leaves + 13 merged nodes for the 14-point sample file.
    model = Hierarchical(k=27)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(features.shape[0])

    model.train(features)
    print(np.array(model.labels))
    print(train_y)


我把停止条件设定为还剩两类时结束(即把上面的 k 设为 26 = 14 个叶结点 + 12 个合并结点),结果为:

('In loop ', 0, 'The single linkage is: ', 1.0)
('In loop ', 1, 'The single linkage is: ', 3.605551275463989)
('In loop ', 2, 'The single linkage is: ', 3.605551275463989)
('In loop ', 3, 'The single linkage is: ', 5.0990195135927845)
('In loop ', 4, 'The single linkage is: ', 5.385164807134504)
('In loop ', 5, 'The single linkage is: ', 5.916079783099616)
('In loop ', 6, 'The single linkage is: ', 6.4031242374328485)
('In loop ', 8, 'The single linkage is: ', 7.810249675906654)
('In loop ', 9, 'The single linkage is: ', 8.54400374531753)
('In loop ', 13, 'The single linkage is: ', 8.831760866327848)
('In loop ', 15, 'The single linkage is: ', 9.486832980505138)
('In loop ', 18, 'The single linkage is: ', 10.392304845413264)

与真实结果比较:

[0 1 0 0 0 1 0 0 0 0 0 0 0 0]
[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]

发现误差还真是蛮大的,14个点的特征捕捉不准确,真就分不出来嗷。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值