hierarchical clustering （层次聚类，single/complete linkage）

最新推荐文章于 2022-03-12 11:34:16 发布

forest小拳拳

最新推荐文章于 2022-03-12 11:34:16 发布

阅读量1.7k

点赞数 1

分类专栏：机器/深度学习机器学习数据挖掘

本文链接：https://blog.csdn.net/qq_33097439/article/details/89598132

版权

机器学习同时被 3 个专栏收录

6 篇文章 0 订阅

订阅专栏

数据挖掘

6 篇文章 0 订阅

订阅专栏

机器/深度学习

5 篇文章 0 订阅

订阅专栏

看网上似乎没有只用numpy实现single/complete linkage层次聚类的轮子，于是根据作业需求造了一个。

虽然都是层次聚类，但是基于single/complete linkage的和average linkage的着实不太一样。首先从直观角度来讲，后者每一次合并后都得重新算一次新的簇的中心结点是什么，复杂度非常高。而前者只考虑初始叶结点之间的距离作为最终所有簇间距的评估标准。

现在来仔细思考一下这个内容，如何去设计这个合并簇建立树的过程呢。首先计算两两叶结点之间的欧氏（Euclidean）距离，这个复杂度看似大，实际上只是 $O(n^{2})$ 。因为对于之后的每一次聚类，用到的linkage都会从这里面去挑选，并且一定是从小到大（或者从大到小）。

# Compute the distance between every pair of leaf nodes, keyed by their (i, j) leaf indices.
        for i in range(nodes_len-1):
            for j in range(i+1,nodes_len):
                d_key = (nodes[i].id[0],nodes[j].id[0])
                # print nodes[i].id[1]
                distance_list[d_key] = euler_distance(nodes[i].id[1],nodes[j].id[1])
                # Rank all pairs by ascending distance.
                # NOTE(review): this sort sits inside the inner loop, so the whole
                # dictionary is re-sorted after every insertion; only the last
                # result survives the loops.
                rank_list = sorted(distance_list.items(),key = lambda item:item[1])

而选择的标准是，如果这个linkage是两个簇间的，就被选中，否则就继续选下一个。而判断是否是一个簇，仅需判断叶结点的根节点是不是同一个结点即可。因此

# Walk the ranked pair list from the shortest distance upwards, merging clusters.
loop_times = 0
        # 91 = 14 * 13 / 2, the number of leaf pairs in the 14-point example.
        while loop_times < 91:
            # 14(leaf nodes) + 13(no-leaf nodes) = 27
            # self.k is compared against the total number of tree nodes, not
            # against the number of clusters.
            if len(nodes)>=self.k:
                break
            nodes_id1,nodes_id2 = rank_list[loop_times][0]
            nodes1,nodes2 = nodes[nodes_id1],nodes[nodes_id2]
            # Follow the father links up to the root of each leaf's cluster.
            nodeptr1 = nodes1
            nodeptr2 = nodes2
            while nodeptr1.father!=None:
                nodeptr1 = nodeptr1.father
            while nodeptr2.father!=None:
                nodeptr2 = nodeptr2.father
            # Same root: both leaves already belong to one cluster, so this
            # pair is skipped without merging.
            if nodeptr1==nodeptr2:
                loop_times+=1
                continue
            # Different roots: join them under a new parent node.

            new_node = ClusterNode(left=nodeptr1,right=nodeptr2,distance=rank_list[loop_times][1],count=nodeptr1.count+nodeptr2.count,id=newnode_id_num)
            newnode_id_num +=1
            nodeptr1.father = new_node
            nodeptr2.father = new_node
            # print new_node.distance
            print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
            loop_times +=1
            nodes.append(new_node)

当然代码中有一些特殊数值量如91之类的，是根据结点的数量计算出来的（14个结点两两配对，共 14×13/2 = 91 对距离），当然也可以让程序自行计算。完整代码，14个结点的建树过程如下：

# -*- coding: utf-8 -*
import numpy as np
import math

# Euclidean distance between two points (the name "euler" is kept for callers).
def euler_distance(a,b):
    """Return the Euclidean (L2) distance between *a* and *b*.

    Both arguments may be numpy arrays, lists or tuples of numbers with
    matching (or broadcastable) shapes.  They are converted to float arrays
    before subtracting, so integer inputs cannot wrap around: with uint8
    arrays, ``0 - 20`` would otherwise become 236 and its square would
    overflow as well.

    Returns a numpy floating-point scalar.
    """
    first = np.asarray(a, dtype=float)
    second = np.asarray(b, dtype=float)
    dist = np.sqrt(np.sum(np.square(first - second)))
    return dist

# A single node (leaf or merged cluster) of the clustering tree.
class ClusterNode(object):
    """Plain record holding one node of a binary cluster tree.

    Attributes:
        left, right: child nodes, or None when absent.
        distance: distance value stored on the node (defaults to -1).
        count: stored size value (defaults to 1).
        id: identifier supplied by the creator of the node.
        father: parent node, or None while the node has no parent.
    """
    def __init__(self,left=None,right=None,distance=-1,count=1,id=None,father=None):
        # Identification and bookkeeping values.
        self.id = id
        self.count = count
        self.distance = distance
        # Links to the neighbouring nodes in the tree.
        self.left, self.right = left, right
        self.father = father


class Hierarchical(object):
    """Agglomerative (bottom-up) clustering with single linkage.

    Every point starts as its own leaf node.  All pairwise leaf distances are
    ranked once, and the pairs are visited from the shortest upwards: whenever
    the two leaves of a pair sit under different roots, those roots are joined
    under a new parent node.

    ``k`` is the stop point, expressed as the TOTAL number of tree nodes
    (leaves plus merged nodes), not as a number of clusters.  Each merge adds
    one node and removes one cluster, so for ``n`` points ``k = 2 * n - c``
    leaves ``c`` clusters (n = 14: k = 27 gives 1 cluster, k = 26 gives 2).
    With the default ``k = 1`` no merge happens at all.

    After ``train``:
        nodes:  list of all tree nodes, leaves first, then merged nodes in
                creation order.
        labels: one cluster label per input point, numbered 0, 1, 2, ...
                starting from the most recently created root.
    """

    # define the stop point
    def __init__(self,k=1):
        assert k>0
        self.k = k
        self.labels = None

    def train(self,x):
        """Build the cluster tree for the points in *x* and label every point.

        Args:
            x: sequence of points (e.g. a 2-D numpy array, one row per point).

        Prints one line per merge and stores the result in ``self.nodes``
        and ``self.labels``.
        """
        # Leaf ids are (index, point) tuples; merged nodes get plain int ids.
        nodes = [ClusterNode(id=(index, point)) for index, point in enumerate(x)]
        nodes_len = len(nodes)
        # Merged-node ids continue right after the last leaf index so that
        # they never collide with a leaf index.
        newnode_id_num = nodes_len
        # -1 marks a point that has not been labelled yet.
        self.labels = [-1]*nodes_len

        # Distance of every unordered leaf pair, keyed by (i, j) with i < j.
        distance_list = {}
        for i in range(nodes_len-1):
            for j in range(i+1,nodes_len):
                distance_list[(i, j)] = euler_distance(nodes[i].id[1],nodes[j].id[1])
        # Rank the pairs once, shortest distance first.
        rank_list = sorted(distance_list.items(),key = lambda item:item[1])

        # Each iteration inspects one ranked pair and merges at most once.
        for loop_times, ((id1, id2), distance) in enumerate(rank_list):
            # Stop as soon as the tree holds k nodes.
            if len(nodes)>=self.k:
                break
            root1 = self._find_root(nodes[id1])
            root2 = self._find_root(nodes[id2])
            # Same root: both leaves are already in one cluster, nothing to do.
            if root1 is root2:
                continue
            new_node = ClusterNode(left=root1,right=root2,distance=distance,count=root1.count+root2.count,id=newnode_id_num)
            newnode_id_num += 1
            root1.father = new_node
            root2.father = new_node
            print ('In loop ',loop_times,'The single linkage is: ',new_node.distance)
            nodes.append(new_node)

        self.nodes = nodes
        self.Label()

    @staticmethod
    def _find_root(node):
        """Return the topmost ancestor of *node* (the root of its cluster)."""
        while node.father is not None:
            node = node.father
        return node

    def Label(self):
        """Assign one label per cluster to ``self.labels``.

        Nodes are visited from the newest to the oldest.  Only root nodes
        (nodes without a father) start a new label: the leaves below any other
        node have already been labelled through its root, which was created
        later and is therefore visited earlier.  This keeps the labels
        contiguous (0, 1, 2, ...).
        """
        label = 0
        for node in reversed(self.nodes):
            if node.father is not None:
                continue
            self.leaf_traversal(node,label)
            label += 1

    # traversal the leaf nodes to label
    def leaf_traversal(self,node,label):
        """Give *label* to every still-unlabelled leaf in the subtree of *node*.

        Uses an explicit stack instead of recursion so that deep, chain-like
        trees cannot exceed the interpreter's recursion limit.
        """
        pending = [node]
        while pending:
            current = pending.pop()
            if current.left is None and current.right is None:
                # Leaf: its id is (index, point); never overwrite a label.
                if self.labels[current.id[0]] == -1:
                    self.labels[current.id[0]] = label
                continue
            if current.right is not None:
                pending.append(current.right)
            if current.left is not None:
                pending.append(current.left)
def loadDataSet(fileName):
    """Load a whitespace-delimited numeric data file.

    Every non-blank line holds the feature values of one sample followed by
    its label in the last column.  Blank lines are skipped.

    Args:
        fileName: path of the text file to read.

    Returns:
        (xArr, yArr): ``xArr`` is a list with one list of float features per
        sample, ``yArr`` is the list of float labels.

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    xArr = []
    yArr = []
    # The context manager closes the file even when parsing fails.
    with open(fileName) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                continue
            values = [float(field) for field in fields]
            xArr.append(values[:-1])
            yArr.append(values[-1])

    return xArr, yArr

if __name__ =="__main__":
    # Raw string: the backslashes stay literal.  On Python 3 the sequence
    # "\u" in a normal string literal is a syntax error.
    train_x,train_y = loadDataSet(r'D:\untitled\Hierarchical.txt')
    # k is compared against the total number of tree nodes; 27 matches the
    # "14 leaf nodes + 13 non-leaf nodes" case, i.e. merging 14 points down
    # to a single cluster.
    Hierarchy = Hierarchical(k=27)
    samples = np.array(train_x)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(samples.shape[0])

    Hierarchy.train(samples)
    print(np.array(Hierarchy.labels))
    print(train_y)

我设定为最后到还剩两类结束（14个叶结点经过12次合并，即结点总数达到26时停止），结果为：

('In loop ', 0, 'The single linkage is: ', 1.0)
('In loop ', 1, 'The single linkage is: ', 3.605551275463989)
('In loop ', 2, 'The single linkage is: ', 3.605551275463989)
('In loop ', 3, 'The single linkage is: ', 5.0990195135927845)
('In loop ', 4, 'The single linkage is: ', 5.385164807134504)
('In loop ', 5, 'The single linkage is: ', 5.916079783099616)
('In loop ', 6, 'The single linkage is: ', 6.4031242374328485)
('In loop ', 8, 'The single linkage is: ', 7.810249675906654)
('In loop ', 9, 'The single linkage is: ', 8.54400374531753)
('In loop ', 13, 'The single linkage is: ', 8.831760866327848)
('In loop ', 15, 'The single linkage is: ', 9.486832980505138)
('In loop ', 18, 'The single linkage is: ', 10.392304845413264)

与真实结果比较：

[0 1 0 0 0 1 0 0 0 0 0 0 0 0]
[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]

发现误差还真是蛮大的，14个点的特征捕捉不准确，真就分不出来嗷。

forest小拳拳

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
hierarchal cluster （层次聚类，single/complete linkage）

看网上似乎没有层次聚类关于single/complete linkage只用numpy的轮子，于是根据作业需求造了一个。虽然都是层次聚类，但是基于single/complete linkage的和average linkage的着实不太一样。首先从直观角度来讲，后者每一次合并后都得重新算一次新的簇的中心结点是什么，复杂度非常高。而前者只考虑初始叶结点之间的距离作为最终所有簇间距的评估标准。...
复制链接

扫一扫