聚类算法（k_means与层次聚类）

最新推荐文章于 2024-07-02 10:45:45 发布

吃着油条唱歌

最新推荐文章于 2024-07-02 10:45:45 发布

阅读量1.2k

点赞数 1

分类专栏：统计学习笔记文章标签：算法 python 聚类

本文链接：https://blog.csdn.net/weixin_42559124/article/details/106691485

版权

统计学习笔记专栏收录该内容

2 篇文章 0 订阅

订阅专栏

K-means聚类

算法思路如下：

首先输入 k 的值，即我们指定希望通过聚类得到 k 个分组；
从数据集中随机选取 k 个数据点作为初始质心；
对集合中每一个样本点，计算与每一个初始质心的距离，离哪个初始质心距离近，就属于那个类。
按距离对所有样本分完组之后，生成新的质心。
重复（2）（3）（4）直到新的质心和原质心相等，算法结束。

程序：

import os
import random
import numpy as np
import copy
import matplotlib.pyplot as plt
nor = 0 # Number of runs 运行次数
M = 10 # 最大可分多少类
# 主要读取的数组为字符类型，要转化为数字
error = 0
def distance(c0, c1, dtype = 'Euclidean'):
    c0 = np.array(c0)
    c1 = np.array(c1)
    switch ={
        "Euclidean": lambda x1, x2:np.linalg.norm(c0-c1), # 欧氏距离
        "Manhattan": lambda x1, x2:np.linalg.norm(c0-c1,ord=1), #曼哈顿距离
        "Chebyshev": lambda x1, x2:np.linalg.norm(c0-c1,ord=np.inf), #切比雪夫距离
    }
    return switch[dtype](c0,c1)
def get_data():
    point_data = []
    with open('E:/VSC_project/VSC py/Data mining/5/iris.txt', 'r', encoding='utf-8') as f:
        for line in f:
            temp = (line.split(","))[:-1]
            temp = [float(x) for x in temp]
            point_data.append(temp)
    if point_data[-1]==[]:
        del point_data[-1]
    f.close()
    return point_data
def SSE(c0, category):
    return np.sum([ np.sum([distance(c0[i], category[i][j]) for j in range(len(category[i]))])**2 for i in range(len(category))])
def k_means(point_data, k, error = 0.1):
    global nor
    c0_index = random.sample(range(len(point_data)), k)
    c0 = [point_data[i] for i in c0_index] # 初始质心
    c_error = error + 1 # 只要不满足退出while的情况即可
    new_c0 = copy.deepcopy(c0)
    while c_error > error:
        category = [[] for i in range(k)] # 存放每个类的点集
        c0 = copy.deepcopy(new_c0) # 深拷贝不然会产生浅拷贝的问题
        for i in range(len(point_data)):
            dmin = float('inf')
            for j in range(len(c0)):
                dis = distance(c0[j], point_data[i])
                if dis < dmin:
                    dmin = dis # 最小距离
                    ptype = j # 点的类型
            category[ptype].append(point_data[i]) # 不同的点进不同的类的集合中
        #new_c0 = np.zeros([k, len(point_data[0])])
        new_c0 = [ [ np.mean([ category[i][j][l] for j in range(len(category[i]))]) for l in range(len(category[i][0]))] for i in range(len(category))]
        '''
        for i in range(len(category)):
            for l in range(len(category[i][0])): # 坐标的第l个分量
                new_c0[i][l] =np.mean([category[i][j][l] for j in range(len(category[i]))])
        '''
        c_error = np.sum([distance(c0[i], new_c0[i]) for i in range(len(c0))])
        nor += 1
        #print("K值为%s, 运行了%s次, 质点的误差为%s, 期望误差为 %s" %(str(k), str(nor), str(c_error), str(error)))
    return SSE(new_c0, category), new_c0
def judgment(c0, point):
    dis = [ distance(i, point) for i in c0 ]
    return dis.index(min(dis))
def plot_3k(c0, point_data):
    plt.subplots()
    plt.title('K = 3 Classify images')
    x=[1,2,3,4]
    put_ = [1,1,1]
    for i in range(len(point_data)):
        l_type = judgment(c0, point_data[i]) # 判断这个点属于哪一类
        color = 'blue' if l_type==0 else 'red' if l_type==1 else 'green'
        y = point_data[i]
        if put_[l_type] == 1:
            plt.plot(x, c0[l_type], linewidth = '3', color = color, marker='|')
            plt.plot(x, y, linewidth = '1', label = 'Type '+str(l_type), color = color, linestyle=':', marker='|')
            put_[l_type] = 0
        else:
            plt.plot(x, y, linewidth = '1', color = color, linestyle=':', marker='|')
    plt.legend()
if __name__ == '__main__':
    point_data = get_data()
    SSE_list = []
    for k in range(2,M):
        sse_num, c0 = k_means(point_data, k, error)
        SSE_list.append(sse_num)
        if k ==3:
            c3 = c0 
    x = np.array(list(range(2, M)))
    y = np.array(SSE_list)
    plt.figure()
    plt.xlabel('K')
    plt.ylabel('SSE')
    plt.grid(linestyle='-.')
    plt.plot(x, y)
    plot_3k(c3, point_data)
    plt.show()
    #print(point_data)

SSE_K

簇的个数与SSE的关系时随着簇个数的增加SSE逐渐减小

其中线段组中最粗的一条线为该类对应的质心

层次聚类

算法

基本凝聚层次聚类算法
1:如果需要，计算邻近度矩阵
2:repeat
3:合并最接近的两个矮
4:更新邻近性矩阵，以反映新的簇与原来的簇之间的邻近性
5:until仅剩下一个族

单链或MIN

对于层次聚类的单链或MIN版本，两个族的邻近度定义为两个不同筷中任意两点之间的最短距离（最大相似度）。使用图的术语，如果我们从所有点作为单点簇开始，每次在点之间加上一条链，最短的链先加，则这些链将点合并成簇。单链技术擅长于处理非楠圆形状的簇，但对噪声和离群点很敏感。

全链或MAX

对于层次聚类的全链或MAX版本，两个筷的邻近度定义为两个不同族中任意两点之间的最长距离（最小相似度）。使用图的术语，如果我们从所有点作为单点簇开始，每次在点之间加上一条链，最短的链先加，则一组点直到其中所有的点都完全被连接（即形成团）才形成一个簇完全连接对噪声和离群点不太敏感，但是它可能使大的簇破裂，并且偏好球形。

程序

import os
import random
import numpy as np
import copy
import math
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy.matlib
import scipy
r = 1
num = 10
tree_index = num #由于从0开始所以每个点的下标为0到(num-1)
#cluster = [] #注意每次使用search函数是要初始化该结果集
# 维护的三个集合
cluster = [] # 1.簇集
c_num = [] # 2.簇所对应的编号
Z = [] # 3.对应的z
def sin(x):
    return math.sin(x)
def cos(x):
    return math.cos(x)
def uniform(x, y, num):
    return [random.uniform(x, y) for i in range(num)]
def get_data(r, number):
    pi = math.pi
    r_list = uniform(0, r, number)
    angle_list = uniform(0, 2*pi, number)
    point_set = [[r_list[i]*cos(angle_list[i]), r_list[i]*sin(angle_list[i])] for i in range(number)]
    return point_set
def plot(point_set):
    global r
    '''
    plt.xlim(xmax=r,xmin=-r)
    plt.ylim(ymax=r,ymin=-r)
    '''
    x = [point_set[i][0] for i in range(len(point_set))]
    y = [point_set[j][1] for j in range(len(point_set))]
    n = np.arange(len(point_set))
    fig, ax = plt.subplots()
    ax.scatter(x, y, c='r')
    #plt.plot(x, y, '.')
    for i,txt in enumerate(n):
        ax.annotate(txt,(x[i],y[i]))
    #plt.show()
def Dr(newlist): # Dimensionality reduction
	d = []
	for element in newlist:
		if not isinstance(element,list):
			d.append(element)
		else:
			d.extend(Dr(element))
	return d
def MIN(set_a_index, set_b_index, distance):
    if set_a_index == set_b_index:
        return float('inf')
    '''
    if type(set_a_index) == int:
        set_a_index = np.array([set_a_index])
    if type(set_b_index) == int:
        set_b_index = np.array([set_b_index])
    if type(set_a_index)==list:
        set_a_index = np.array(set_a_index)
    if type(set_b_index)==list:
        set_b_index = np.array(set_b_index)
    '''
    #set_a_index = set_a_index.flatten()
    #set_b_index = set_b_index.flatten()
    set_a_d = distance[set_a_index] #全部赋值为浅拷贝，切片类操作为深拷贝
    return np.min(set_a_d[:,set_b_index])
def MAX(set_a_index, set_b_index, distance):
    if set_a_index == set_b_index:
        return float('-inf')
    '''
    if type(set_a_index) == int:
        set_a_index = np.array([set_a_index])
    if type(set_b_index) == int:
        set_b_index = np.array([set_b_index])
    if type(set_a_index) == list:
        set_a_index = np.array(set_a_index)
    if type(set_b_index) == list:
        set_b_index = np.array(set_b_index)
    '''
    #set_a_index = set_a_index.flatten() #多维变一维
    #set_b_index = set_b_index.flatten()
    set_a_d = distance[set_a_index] #全部赋值为浅拷贝，切片类操作为深拷贝
    return np.max(set_a_d[:,set_b_index])
def point_distance(point_set): # 空间换时间
    point_set = np.array(point_set)
    point_len = len(point_set)
    return np.array([ [np.linalg.norm(point_set[i]-point_set[j]) for j in range(point_len)] for i in range(point_len) ])
def set_distance(point_set, fun, distance):
    new_point_set = [[Dr([point_set[i]])] for i in range(len(point_set))]
    '''
    for i in point_set:
        if type(i)==int:
            new_point_set.append([i])
        if type(i)==list:
            new_point_set.append((np.array(i)).flatten())
    '''
    dis = [ [fun(i, j, distance) for i in new_point_set] for j in new_point_set]
    if fun == MAX:
        temp = np.where(dis == np.max(np.array(dis)))
        return temp[0][0], temp[1][0] #返回下次操作的行列
    elif fun == MIN:
        temp = np.where(dis == np.min(np.array(dis)))
        return temp[0][0], temp[1][0] #返回下次操作的行列
def search_set(num, result):
    global cluster, c_num, tree_index
    if len(Dr(result)) == num: # 由于num>=2的所以result一定是个list
        cluster.append(result)
        c_num.append(tree_index)
        tree_index += 1
    if len(Dr([result[0]])) == num:
        cluster.append(result[0])
        c_num.append(tree_index)
        tree_index += 1
    elif len(Dr([result[0]])) > num:
        search_set(num, result[0])
    if len(Dr([result[1]])) == num:
        cluster.append(result[1])
        c_num.append(tree_index)
        tree_index += 1
    elif len(Dr([result[1]])) > num:
        search_set(num, result[1])
def build_z(result, fun, distance):
    global cluster, Z, c_num, cluster
    cluster = []
    for i in range(2, num + 1): # 从第2个节点的簇开始维护一直到num个节点的簇
        search_set(i, result)
        # 为新增加的簇集添加对应的Z
        for j in range(len(Z), len(cluster)): #只有z没有添加数据， cluster和c_num在search中被维护完毕
            temp_z = []
            #print('^o^')
            if type(cluster[j][0]) == int:
                temp_z.append(cluster[j][0]) #当字节点的类型为数字时，他节点编号就是簇编号
            else:
                temp_z.append(c_num[cluster.index(cluster[j][0])]) #这个子节点在已知簇集中的位置所对应的簇编号
            if type(cluster[j][1]) == int:
                temp_z.append(cluster[j][1]) #当字节点的类型为数字时，他节点编号就是簇编号
            else:
                temp_z.append(c_num[cluster.index(cluster[j][1])]) #这个子节点在已知簇集中的位置所对应的簇编号
            temp_z.append(fun(Dr([cluster[j][0]]), Dr([cluster[j][1]]), distance))
            temp_z.append(len(Dr([cluster[j]])))
            Z.append(temp_z)
def Hc(point_set_index, fun, distance): # Hierarchical clustering
    # 注意point_set_index为list类型
    while len(point_set_index)!=1:
        i, j = set_distance(point_set_index, fun, distance)
        #print(str(i)+" "+str(j))
        merge=[point_set_index[i], point_set_index[j]]
        if i<j:
            del point_set_index[i], point_set_index[j-1]
        if i>j:
            del point_set_index[j], point_index_set[i-1]
        point_set_index.append(merge)
    return point_set_index
def main(fun):
    global r, num
    #生成数据（1）
    point_set = get_data(r, num)
    plot(point_set)
    #生成数据（2）
    #points = scipy.randn(700,2)  
    #plot(points)
    #
    point_index_set = list(range(num))
    dis = point_distance(point_set)
    result = Hc(point_index_set, fun, dis)
    #plot_tree(result[0], MIN, dis)
    build_z(result[0], fun, dis)
    fig = plt.figure(figsize=(25, 10))
    arr_Z = np.array(Z)
    #他们组成的簇形成了新的簇，编号要对应上
    dn = dendrogram(arr_Z)
    plt.show()
if __name__ =='__main__':
    main(MIN)
    cluster =[] #1.簇
    c_num = [] # 2.簇所对应的编号
    Z = [] # 3.对应的z
    tree_index = num
    main(MAX)

结果

min对应的结果

max对应的图像

层次聚类中的result结果是以数组的形式表达构建出来的树，但是scipy中的dendrogram函数需要每个簇的信息，从而画出层次聚类的树图像，所以我编写了一个算法，来构建出Z值其中Z值得意思参考了“https://blog.csdn.net/Tan_HandSome/article/details/79371076”

build_z 算法

构建与维护的变量

cluster 簇集以多维list类型存储，从2个节点的开始构造
c_num cluster对应的簇的编号，与cluster一样在search_set里添加维护
Z dendrogram函数需要的变量，前两个变量是为了生成最终的Z而成立的

算法步骤

Step 1: 从2个点开始在search_set函数中寻找为2个点的簇，并将其编号与结构记录在c_num与cluster中
Step 2: 寻找3个点的簇集使用search_set 并且在已知的cluster中寻找3个点的簇集中已经被记录的簇集，将寻找到的部分记录到Z中，用Dr函数计算簇中的点数，MIN或MAX函数记录该簇的两个部分之间的距离
Step 3: 簇集个数（num）加一，按照Step2的方法继续寻找,if num < 总点数循环Step 3 否则算法结束

吃着油条唱歌

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
聚类算法（k_means与层次聚类）

K-means聚类算法思路如下：首先输入 k 的值，即我们指定希望通过聚类得到 k 个分组；从数据集中随机选取 k 个数据点作为初始质心；对集合中每一个样本点，计算与每一个初始质心的距离，离哪个初始质心距离近，就属于那个类。按距离对所有样本分完组之后，生成新的质心。重复（2）（3）（4）直到新的质心和原质心相等，算法结束。程序：import osimport randomimport numpy as npimport copyimport matplotlib.py
复制链接

扫一扫

专栏目录