统计学习方法：K-means（hierarchy cluster）

最新推荐文章于 2023-11-15 09:30:44 发布

try_trying_try

最新推荐文章于 2023-11-15 09:30:44 发布

阅读量2.5k

点赞数

分类专栏：统计学习方法

本文链接：https://blog.csdn.net/u012114900/article/details/107719158

版权

统计学习方法专栏收录该内容

30 篇文章 2 订阅

订阅专栏

K-means（hierarchy cluster）

先用层次聚类（hierarchical clustering）找出初始的 k 个簇中心，再以它们为起点运行 K-means
参考资料：K-means

重点：熟练掌握算法的大纲要点，实现细节可以省略

import numpy as np
import collections
def calDist(x1, x2):
    """Return the squared Euclidean distance between two sample points.

    The element-wise differences are squared and summed; no square root is
    taken, so the result is the *squared* distance.
    """
    diff = x1 - x2
    return np.sum(diff * diff)

def cal_Cluster_distance(D, cluster1, cluster2):
    """Return the single-linkage (minimum) distance between two clusters.

    Args:
        D: Pairwise distance table indexed as ``D[i][j]``.
        cluster1: Iterable of sample indices in the first cluster.
        cluster2: Iterable of sample indices in the second cluster.

    Returns:
        The smallest ``D[i][j]`` over all ``i`` in ``cluster1`` and ``j`` in
        ``cluster2``, or ``float('inf')`` when either cluster is empty.
    """
    # Use +inf as the neutral element instead of a fixed cap such as 10000,
    # so distances of any magnitude are reported correctly.
    return min(
        (D[i][j] for i in cluster1 for j in cluster2),
        default=float('inf'),
    )
#_________________________________________________________________________
def find_miniDist_index(D, cluster):
    """Find the two distinct clusters that are closest to each other.

    Args:
        D: Pairwise sample distance table indexed as ``D[i][j]``.
        cluster: Dict mapping a cluster key to the list of sample indices
            belonging to that cluster.

    Returns:
        A tuple ``(key_i, key_j)`` of the closest pair of cluster keys, as
        measured by ``cal_Cluster_distance``. ``(-1, -1)`` is returned when
        no pair qualifies (for example fewer than two clusters).
    """
    # Start from +inf rather than a fixed cap so that a pair is always found
    # when one exists, regardless of the scale of the distances.
    best_distance = float('inf')
    best_pair = (-1, -1)
    for i in cluster:
        for j in cluster:
            if i == j:
                continue
            distance = cal_Cluster_distance(D, cluster[i], cluster[j])
            # Strict comparison keeps the first pair encountered on ties.
            if distance < best_distance:
                best_distance = distance
                best_pair = (i, j)
    return best_pair

def hierarchical_clustering(data, k):
    """Choose initial K-means seeds via agglomerative clustering.

    Steps:
      1. Start with every sample in its own cluster.
      2. Build the pairwise sample distance matrix with ``calDist``.
      3. Repeatedly merge the closest pair of clusters (as selected by
         ``find_miniDist_index``) until only ``k`` clusters remain.
      4. For each remaining cluster, pick the member sample closest to the
         cluster mean.

    Args:
        data: 2-D NumPy array with one sample per row.
        k: Number of clusters to stop at.

    Returns:
        List of row indices into ``data``, one per remaining cluster, to be
        used as initial centers for K-means.
    """
    n_samples = len(data)

    # Map cluster key -> list of member sample indices; one sample each at first.
    cluster = {i: [i] for i in range(n_samples)}

    # Symmetric distance matrix: only the upper triangle is computed and
    # mirrored into the lower one.
    D = [[0] * n_samples for _ in range(n_samples)]
    for i in range(n_samples):
        for j in range(i, n_samples):
            D[i][j] = D[j][i] = calDist(data[i], data[j])

    # Merge until k clusters are left.
    while len(cluster) > k:
        print(f'clustering *** the cluster num is {len(cluster)}')
        i, j = find_miniDist_index(D, cluster)
        # Fold cluster j into cluster i (extend adds its members one by one),
        # then drop j.
        cluster[i].extend(cluster[j])
        del cluster[j]

    print(f'hierarchical clustering：{cluster}')

    intial_start = []
    for members in cluster.values():
        center = data[members].mean(axis=0)
        # Pick the member nearest to the mean. The search must start from
        # "no candidate yet" and keep the smallest distance; seeding it with
        # a negative bound would never select any sample.
        nearest = min(members, key=lambda idx: calDist(center, data[idx]))
        intial_start.append(nearest)

    return intial_start


#_________________________________________________________________________
def k_means(start, data, k):
    """Cluster ``data`` into ``k`` groups with K-means, starting from given seeds.

    Each iteration assigns every sample to its nearest center (squared
    Euclidean distance, ties going to the lowest cluster label) and then
    moves each center to the mean of its assigned samples. Iteration stops
    as soon as no sample changes cluster.

    Args:
        start: Sequence of row indices into ``data``; the first ``k`` entries
            are used as the initial centers.
        data: 2-D NumPy array of shape ``(m, n)``, one sample per row.
        k: Number of clusters.

    Returns:
        Dict mapping each sample index ``0..m-1`` to its cluster label
        ``0..k-1``.
    """
    m, n = data.shape
    centers = np.array([data[start[i]] for i in range(k)], dtype=float)
    labels = np.full(m, -1)  # -1 means "not assigned yet"

    while True:
        # Squared distance from every sample to every center, shape (m, k).
        # Taking argmin (rather than comparing against a fixed upper bound)
        # guarantees every sample gets a valid label at any data scale.
        diffs = data[:, np.newaxis, :] - centers[np.newaxis, :, :]
        distances = np.sum(np.square(diffs), axis=2)
        new_labels = np.argmin(distances, axis=1)

        if np.array_equal(new_labels, labels):
            break
        labels = new_labels

        for c in range(k):
            members = data[labels == c]
            # An empty cluster keeps its previous center; dividing by a zero
            # count would turn the center into NaN and make it unreachable.
            if len(members):
                centers[c] = members.mean(axis=0)

    return dict(enumerate(labels.tolist()))

#_________________________________________________________________________
def loadData(filename):
    """Load numeric feature rows from a comma-separated text file.

    Every line is split on commas and its last field is discarded
    (presumably the class label in the iris file; confirm against the data).
    The remaining fields are parsed as floats. Lines that yield no feature
    fields, such as blank trailing lines, are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        NumPy array with one row per data line.

    Raises:
        ValueError: If a feature field is not a valid float literal.
    """
    data = []
    with open(filename) as file:
        for line in file:
            fields = line.strip().split(',')[:-1]
            if not fields:
                # Blank line (or a line with a single field): no features.
                continue
            # float() instead of eval(): file contents must never be
            # executed as code.
            data.append([float(value) for value in fields])
    return np.array(data)


if __name__ == "__main__":
    # Script entry point: load the samples, derive seeds with hierarchical
    # clustering, run K-means from those seeds, then print the assignment
    # and the number of samples per cluster label.
    n_clusters = 3
    samples = loadData('iris.txt')
    seeds = hierarchical_clustering(samples, n_clusters)
    assignment = k_means(seeds, samples, n_clusters)
    print(assignment)
    label_counts = collections.Counter(assignment.values())  # label -> count
    print(label_counts)

bug之一：注意 return 的缩进位置（return 若被缩进到循环体内，函数会在第一次迭代时就提前返回）

备注:
一、
# 将i，j两个簇合并到i，删除j
# 使用extend，将cluster[j]中的元素逐个添加到cluster[i]中
cluster[i].extend(cluster[j])

新收获命令

#.append()与.extend()的区别
lst=[1,2] >>>[1,2]
lst.append([3,4]) >>>[1, 2, [3, 4]]
lst.extend([3,4]) >>>[1, 2, 3, 4]

with open(filename) as file:
    for line in file.readlines():  # 按行读取