聚类(Clustering):hierarchical clustering 层次聚类及其应用

最新推荐文章于 2020-12-01 21:27:41 发布

Moelimoe

最新推荐文章于 2020-12-01 21:27:41 发布

阅读量758

点赞数

分类专栏： ML 文章标签： python3 深度学习 HierarchicalClustering

本文链接：https://blog.csdn.net/Moelimoe/article/details/98473896

版权

ML 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

聚类(Clustering):hierarchical clustering 层次聚类及其应用

clustering实现：

from numpy import *
import math
#numpy 是基于矩阵(matrix)计算的 package

#定义树形结构图的结点：当两者再往上被分为一类时，则将其连接起来
#用面向对象(class)的方法实现此次的代码:
class cluster_node:
    """One node of the hierarchical clustering tree.

    Attributes:
        vec: feature vector held by this node.
        left: left child node, or None.
        right: right child node, or None.
        distance: distance value stored for this node (0.0 by default).
        id: identifier used to tell nodes apart.
        count: node counter; only meant for weighted averages.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        """Store every argument unchanged on the new instance."""
        self.vec, self.left, self.right = vec, left, right
        self.distance, self.id, self.count = distance, id, count

#找出最近的两点的距离:
def L2dist(v1, v2):
    """Return the Euclidean (straight-line) distance between two vectors.

    ``v1`` and ``v2`` must support element-wise subtraction and power
    (e.g. numpy arrays of the same shape).
    """
    delta = v1 - v2
    squared_total = sum(delta ** 2)
    return math.sqrt(squared_total)

def L1dist(v1, v2):
    """Return the sum of absolute element-wise differences (L1 distance).

    ``v1`` and ``v2`` must support element-wise subtraction (e.g. numpy arrays).
    """
    gaps = abs(v1 - v2)
    return sum(gaps)

# def Chi2dist(v1,v2):
 
#     return sqrt(sum((v1-v2)**2))

#实现 HC的具体树状结构
def hcluster(features, distance=L2dist):
    """Build a bottom-up (agglomerative) hierarchical clustering tree.

    Every row of ``features`` starts as its own leaf cluster. The two closest
    clusters are merged repeatedly until a single cluster is left. The vector
    of a merged cluster is the plain (unweighted) average of the two vectors
    it was built from.

    Args:
        features: sequence of feature vectors (for example a 2-D numpy array),
            one vector per row.
        distance: callable ``distance(v1, v2)`` returning the distance between
            two vectors; defaults to ``L2dist``.

    Returns:
        The root ``cluster_node``. Leaves carry the ids ``0 .. len(features)-1``;
        merged nodes carry negative ids (-1, -2, ...) in merge order, and their
        ``distance`` attribute holds the distance between the two merged children.

    Raises:
        IndexError: if ``features`` is empty.
    """
    # Cache of already computed distances, keyed by (id_a, id_b).
    # Not to be confused with the ``distance`` callable.
    distances = {}
    currentid = -1  # id handed to the next merged cluster

    # Clusters are initially just the individual rows.
    clust = [cluster_node(array(row), id=i) for i, row in enumerate(features)]

    while len(clust) > 1:
        # Start from the first pair and look for anything closer.
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # Only visit unordered pairs with i < j: comparing a cluster with
        # itself always yields 0 and would merge the cluster with itself.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        first = clust[lowestpair[0]]
        second = clust[lowestpair[1]]

        # The merged cluster sits at the average of the two merged vectors.
        mergevec = (first.vec + second.vec) / 2.0

        newcluster = cluster_node(array(mergevec), left=first, right=second,
                                  distance=closest, id=currentid)

        # Cluster ids that were not in the original set are negative.
        currentid -= 1

        # lowestpair[0] < lowestpair[1]: delete the higher index first so the
        # lower index still points at the intended element.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]
    # (以上已经建立好树状图了)

def extract_cluster(clust, dist):
    """Collect the top-most sub-trees whose ``distance`` is below ``dist``.

    Args:
        clust: root node of a tree (needs ``distance``, ``left``, ``right``).
        dist: distance threshold.

    Returns:
        A list of nodes. A node with ``distance < dist`` is returned as a whole;
        otherwise the results of its existing children are concatenated, left
        before right. A node that fails the test and has no children
        contributes nothing.
    """
    if clust.distance < dist:
        # This whole sub-tree is one cluster.
        return [clust]

    found = []
    for child in (clust.left, clust.right):
        if child is not None:
            found += extract_cluster(child, dist=dist)
    return found

def get_cluster_elements(clust):
    """Return the ids of all leaf elements contained in a cluster sub-tree.

    Args:
        clust: a tree node (needs ``id``, ``left``, ``right``). A node with a
            non-negative id is treated as a leaf.

    Returns:
        A list of leaf ids, left branch before right branch. A leaf yields a
        one-element list, so results of sub-trees can always be concatenated
        (returning the bare id would make ``+`` add the ids arithmetically).
    """
    if clust.id >= 0:
        # Non-negative id means that this is a leaf.
        return [clust.id]

    elements = []
    for child in (clust.left, clust.right):
        if child is not None:
            elements.extend(get_cluster_elements(child))
    return elements

def printclust(clust, labels=None, n=0):
    """Print the tree under ``clust`` as indented text, one node per line.

    Args:
        clust: a tree node (needs ``id``, ``left``, ``right``).
        labels: optional sequence indexed by leaf id; when given, a leaf is
            printed as ``labels[clust.id]`` instead of its id.
        n: nesting depth of ``clust``; the line is indented by ``n`` spaces.
    """
    # Indent without a line break; the node itself follows on the same line.
    print(' ' * n, end='')
    if clust.id < 0:
        # Negative id means that this is a branch.
        print('-')
    elif labels is None:
        # Non-negative id means that this is an endpoint.
        print(clust.id)
    else:
        print(labels[clust.id])

    for child in (clust.left, clust.right):
        if child is not None:
            printclust(child, labels=labels, n=n + 1)

def getheight(clust):
    """Return the height of the tree under ``clust``.

    An endpoint (no children) has height 1; any other node has the sum of the
    heights of its two branches, so the result equals the number of endpoints.
    """
    if clust.left is not None or clust.right is not None:
        left_height = getheight(clust.left)
        right_height = getheight(clust.right)
        return left_height + right_height
    return 1

def getdepth(clust):
    """Return the accumulated merge distance from ``clust`` down to its deepest endpoint.

    An endpoint (no children) has depth 0; any other node has the greater of
    its two branches' depths plus its own ``distance``.
    """
    if clust.left is not None or clust.right is not None:
        left_depth = getdepth(clust.left)
        right_depth = getdepth(clust.right)
        # NOTE(review): spelled as a comparison rather than max(); this file
        # star-imports numpy, which may rebind max depending on the NumPy
        # version - confirm.
        deeper = right_depth if right_depth > left_depth else left_depth
        return deeper + clust.distance
    return 0

clustering代码应用:(借用链接：https://blog.csdn.net/weixin_41790863/article/details/81412564 )

from PIL import ImageDraw, Image
import numpy as np
import os
import sys

nodeList = []  # every node created so far: image nodes and merged cluster nodes (appended by node.__init__)
distance = {}  # pairwise distances between nodes, written by node.__init__; keys are (older_node, newer_node) tuples of node objects, values are floats

class node:
    """A node of the image clustering tree: one image, or a merge of two nodes.

    Creating a node has side effects on the module-level containers: the
    distance from every node already in ``nodeList`` to the new node is stored
    in ``distance`` under the key ``(existing_node, new_node)``, and the new
    node is appended to ``nodeList``.

    Attributes:
        id: length of ``nodeList`` at creation time (informational only).
        parent: the merged node this node was put into, or None.
        pos: ``(x, y, w, h)`` drawing data, or None until it is assigned.
        imgData: the opened PIL image for image nodes, None for merged nodes.
        left, right: child nodes of a merged node, None for image nodes.
        level: 0 for image nodes; higher child level + 1 for merged nodes.
        feature: mean R, G, B value of the image, or the average of the two
            children's features for merged nodes.
    """

    def __init__(self, data):
        """Create an image node or a merged node.

        Args:
            data: either a ``str`` holding the path of an image file (the node
                then represents that image), or a ``(leftNode, rightNode)``
                pair of existing nodes that become the children of this node.
        """
        self.id = len(nodeList)
        self.parent = None
        self.pos = None
        if isinstance(data, str):
            # Image (leaf) node.
            self.imgData = Image.open(data)
            self.left = None
            self.right = None
            self.level = 0

            # Work on an RGB copy so files stored as grayscale, RGBA or CMYK
            # also produce exactly three channels; imgData itself is untouched.
            pixels = np.array(self.imgData.convert('RGB')).reshape(-1, 3)
            self.feature = pixels.mean(axis=0)  # mean of each colour channel
        else:
            # Merged node.
            self.imgData = None
            self.left = data[0]
            self.right = data[1]
            self.left.parent = self
            self.right.parent = self

            self.level = max(self.left.level, self.right.level) + 1
            self.feature = (self.left.feature + self.right.feature) / 2

        # Record the distance between this node and every earlier node.
        for x in nodeList:
            distance[(x, self)] = np.sqrt(np.sum((x.feature - self.feature) ** 2))

        nodeList.append(self)

    def drawNode(self, img, draw, vLineLenght):
        """Draw this node onto the result picture; does nothing while ``pos`` is None.

        Args:
            img: PIL image that thumbnails are pasted into.
            draw: drawing object used for the red connector lines.
            vLineLenght: length of the vertical line drawn above an image node.
        """
        if self.pos is None:
            return
        x, y, w, h = self.pos
        mid_x = int(x + w / 2)
        red = (255, 0, 0)
        if self.left is None:
            # Image node: shrink the image in place, paste it, and draw the
            # vertical line that rises from its top centre.
            self.imgData.thumbnail((w, h))
            img.paste(self.imgData, (x, y))
            draw.line((mid_x, y - vLineLenght, mid_x, y), fill=red)
        else:
            # Merged node: a horizontal bar of width w plus a vertical line of
            # height h rising from the middle of the bar.
            draw.line((int(x), y, int(x + w), y), fill=red)
            draw.line((mid_x, y, mid_x, y - h), fill=red)

def loadImg(path):
    """Create an image node for every ``.jpg`` file found directly in ``path``.

    A file whose base name is ``hierarchicalResult`` (compared
    case-insensitively) is skipped, because that is the output file of an
    earlier run.

    Args:
        path: directory containing the images.

    Returns:
        The path of the result file, ``<path>/hierarchicalResult.jpg``, or
        None when the directory cannot be listed (a message is printed then).
    """
    try:
        files = os.listdir(path)
    except OSError:
        # Missing directory, not a directory, permission problems, ...
        print('未正确读取目录：' + path + ',图片目录，请根据自己存的地方改写,并保证没有hierarchicalResult.jpg,该文件为最后生成文件')
        return None

    for name in files:
        stem, ext = os.path.splitext(name)
        if ext.lower() == '.jpg' and stem.lower() != 'hierarchicalresult':
            node(os.path.join(path, name))
    return os.path.join(path, 'hierarchicalResult.jpg')

def getMinDistance():
    """Return the ``distance`` key of the closest pair of not-yet-merged nodes.

    Only pairs in which both nodes still have ``parent`` set to None are
    considered. On ties the pair that comes first in ``distance`` wins.

    Raises:
        IndexError: if no such pair exists.
    """
    candidates = [pair for pair in distance
                  if pair[0].parent is None and pair[1].parent is None]
    best = candidates[0]
    for pair in candidates[1:]:
        if distance[pair] < distance[best]:
            best = pair
    return best

def createTree():
    """Merge the closest unmerged nodes until at most one is left.

    Returns:
        The last entry of ``nodeList``, i.e. the node created last.
    """
    def unmerged_count():
        # Number of nodes that have not been put into a merged node yet.
        return len([n for n in nodeList if n.parent is None])

    while unmerged_count() > 1:
        first, second = getMinDistance()
        # The second node of the pair becomes the left child; the drawing
        # code relies on this ordering.
        node((second, first))
    return nodeList[-1]

def run():
    """Build the cluster tree, draw it and save the picture.

    The picture is saved to the module-level ``resultFile`` path. Image nodes
    are laid out left to right in post-order; every merged node is then drawn
    level by level above its children.
    """
    root = createTree()

    def post_order(current):
        """Return the nodes under ``current`` in left, right, root order."""
        ordered = []
        if current.left is not None:
            ordered += post_order(current.left)
        if current.right is not None:
            ordered += post_order(current.right)
        ordered.append(current)
        return ordered

    # Nodes without a left child are the images.
    leaves = [n for n in post_order(root) if n.left is None]

    thumbSize = 60  # thumbnails are scaled to fit a 60x60 cell
    thumbSpace = 20  # gap between thumbnails
    vLineLenght = 80  # vertical distance between two levels

    imgWidth = len(leaves) * (thumbSize + thumbSpace)
    imgHeight = (root.level + 1) * vLineLenght + thumbSize + thumbSpace * 2
    img = Image.new('RGB', (imgWidth, imgHeight), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Assign drawing data to every image and draw it.
    for index, leaf in enumerate(leaves):
        # A lone image has no parent; place it as if its parent were on level 1.
        parent_level = leaf.parent.level if leaf.parent is not None else 1
        x = index * (thumbSize + thumbSpace) + thumbSpace / 2
        y = imgHeight - thumbSize - thumbSpace / 2 - ((parent_level - 1) * vLineLenght)
        w = leaf.imgData.width
        h = leaf.imgData.height
        if w > h:
            h = h / w * thumbSize
            w = thumbSize
        else:
            w = w / h * thumbSize
            h = thumbSize
            x += (thumbSize - w) / 2  # centre a narrow thumbnail in its cell
        leaf.pos = (int(x), int(y), int(w), int(h))
        leaf.drawNode(img, draw, vLineLenght)

    # Assign drawing data to every merged node, lowest level first, because
    # each node's position is derived from its children's positions.
    for level in range(1, root.level + 1):
        for item in [n for n in nodeList if n.level == level]:
            x = item.left.pos[0] + (item.left.pos[2] / 2)
            w = item.right.pos[0] + (item.right.pos[2] / 2) - x
            y = item.left.pos[1] - (item.level - item.left.level) * vLineLenght
            parent_level = item.parent.level if item.parent is not None else item.level + 1
            h = (parent_level - item.level) * vLineLenght
            item.pos = (int(x), int(y), int(w), int(h))
            item.drawNode(img, draw, vLineLenght)
    img.save(resultFile)

# Load the images and get the path the result picture will be saved to;
# change the directory to wherever the images are stored.
resultFile = loadImg(r"G:\Pythonnotes\test\HierarchicalClusterDataset")
# loadImg returns the None object (not the string 'None') when the directory
# could not be read; only draw when loading succeeded.
if resultFile is not None:
    run()
    print("结构图生成成功，最终结构图存储于：" + resultFile)

Moelimoe

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
聚类(Clustering):hierarchical clustering 层次聚类及其应用

聚类(Clustering):hierarchical clustering 层次聚类及其应用clustering实现：from numpy import *import math#基于matrix计算的pakage#定义树形结构图的结点，当再往上两则分为一类时则将其连接起来#用面向对象(class)的方法实现此次的代码:class cluster_node: def __...
复制链接

扫一扫