深度学习花书 笔记4-Kmeans算法流程与实现

深度学习花书 笔记4-Kmeans算法流程与实现

1. K-means 算法流程

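(原文此处为算法流程图,图片缺失。K-means 的基本流程如下:)

1. 在样本各维度的取值范围内随机初始化 k 个聚类中心;
2. 计算每个样本点到各聚类中心的欧氏距离,并将其分配给距离最近的中心;
3. 以每个类别内所有样本点的均值作为该类别新的聚类中心;
4. 重复步骤 2、3,直到所有样本点的类别不再发生变化。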

2. K-means 编程实现

下面的代码给出了两个实现:Kmeans 为普通版(逐个样本点循环计算距离),Kmeans_ 为快速版(对所有样本点做向量化计算)。

'''
Created on Apr 11, 2020
Author: yali
'''
import os, sys
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import time

def LoadDataSet(fileName, delim='\t'):
    """Load a delimiter-separated numeric text file into a NumPy matrix.

    Each non-blank line becomes one row; every field on the line is parsed
    with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even if parsing raises.
    with open(fileName) as fr:
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            # Blank lines (e.g. a trailing newline) would otherwise hit float('').
            if line.strip()
        ]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    return np.asmatrix(rows)

def RandomClusterCenter(data, k):
    """Draw ``k`` random initial cluster centres inside the data's bounding box.

    For every column of ``data`` the centres' coordinates are sampled
    uniformly from ``[min, max)`` of that column using ``np.random.rand``.

    Args:
        data: 2-D array or matrix with one sample per row.
        k: Number of centres to generate.

    Returns:
        A ``numpy.matrix`` of shape ``(k, n)``, where ``n`` is the number of
        columns of ``data``.
    """
    _, n = data.shape
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    centers = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        min_value = np.min(data[:, j])
        range_value = float(np.max(data[:, j]) - min_value)
        # One (k, 1) draw per column, in column order, so that a seeded
        # np.random state yields the same centres as before.
        centers[:, j] = min_value + range_value * np.random.rand(k, 1)
    return centers

def ComputeDistance(data, centers):
    """Return the row-wise Euclidean distance between ``centers`` and ``data``.

    ``centers`` is stacked vertically once per row of ``data``; ``data`` is
    then subtracted from the stacked block and the Euclidean norm of every
    resulting row is returned.

    Args:
        data: 2-D array or matrix; its row count sets the stacking factor.
        centers: Array or matrix that is tiled and compared against ``data``.

    Returns:
        The square root of the per-row sum of squared differences (summed
        over ``axis=1``).
    """
    row_count, _ = data.shape
    diff = np.tile(centers, (row_count, 1)) - data
    squared_norms = np.multiply(diff, diff).sum(axis=1)
    return np.sqrt(squared_norms)

def Kmeans(data, k):
    """Cluster ``data`` into ``k`` groups with the plain (per-sample) K-means loop.

    Centres are initialised with ``RandomClusterCenter``. Each pass assigns
    every sample to its nearest centre (distance from ``ComputeDistance``)
    and then moves each centre to the mean of its assigned samples. The loop
    stops after the first pass in which no sample changes its label.

    Args:
        data: 2-D array or matrix with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centers, cluster)``:
        ``centers`` is a ``(k, n)`` matrix of cluster centres;
        ``cluster`` is an ``(m, 2)`` matrix whose first column is the label
        of each sample and whose second column is the squared distance from
        the sample to the centre it was assigned to.
    """
    m = data.shape[0]
    # Column 0: cluster label, column 1: squared distance to the assigned centre.
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    cluster = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always counts as a
    # change; starting at 0 would mistake "everything assigned to cluster 0"
    # for convergence.
    cluster[:, 0] = -1
    centers = RandomClusterCenter(data, k)  # random initial centres
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        for i in range(m):
            min_distance = np.inf
            min_cluster = -1
            for j in range(k):
                # Reduce the single-element result to a plain Python float.
                distance = ComputeDistance(centers[j, :], data[i, :]).item()
                if distance < min_distance:  # keep the nearest centre so far
                    min_distance = distance
                    min_cluster = j
            if cluster[i, 0] != min_cluster:
                # A label changed, so another pass is required.
                cluster_changed = True
            cluster[i, 0] = min_cluster
            cluster[i, 1] = min_distance ** 2
        for cent in range(k):  # recalculate centroids
            cluster_points = data[np.nonzero(cluster[:, 0].A == cent)[0]]
            # The mean of zero samples is NaN, which would make this centre
            # unreachable for good; keep the previous position instead.
            if cluster_points.shape[0] > 0:
                centers[cent, :] = np.mean(cluster_points, axis=0)
    return centers, cluster

def Kmeans_(data, k):
    """Cluster ``data`` into ``k`` groups with a vectorised K-means loop.

    Same procedure as ``Kmeans``, but the distances from one centre to all
    samples are computed in a single ``ComputeDistance`` call. The nearest
    centre of every sample is recomputed from scratch on each pass.

    Args:
        data: 2-D array or matrix with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centers, cluster)``:
        ``centers`` is a ``(k, n)`` matrix of cluster centres;
        ``cluster`` is an ``(m, 2)`` matrix whose first column is the label
        of each sample and whose second column is the squared distance from
        the sample to the centre it was assigned to.
    """
    m = data.shape[0]
    # Column 0: cluster label, column 1: squared distance to the assigned centre.
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    cluster = np.asmatrix(np.zeros((m, 2)))
    # Impossible initial label so the first pass always counts as a change.
    labels = np.full(m, -1)
    centers = RandomClusterCenter(data, k)  # random initial centres
    rows = np.arange(m)
    cluster_changed = True
    while cluster_changed:
        # Rebuild the (m, k) distance table on every pass. Carrying the
        # running minimum over from earlier passes would compare against
        # distances to centres that have since moved.
        distances = np.hstack([
            np.asarray(ComputeDistance(centers[j, :], data)).reshape(m, 1)
            for j in range(k)
        ])
        # argmin returns the first minimum, so ties go to the lowest index.
        new_labels = np.argmin(distances, axis=1)
        cluster_changed = not np.array_equal(new_labels, labels)
        labels = new_labels
        # Column slices of a matrix are (m, 1), so assign (m, 1) values.
        cluster[:, 0] = labels.reshape(m, 1)
        cluster[:, 1] = np.square(distances[rows, labels]).reshape(m, 1)
        for cent in range(k):
            cluster_points = data[labels == cent]
            # The mean of zero samples is NaN, which would make this centre
            # unreachable for good; keep the previous position instead.
            if cluster_points.shape[0] > 0:
                centers[cent, :] = np.mean(cluster_points, axis=0)
    return centers, cluster

def DisplayData(data):
    """Scatter-plot labelled 2-D samples on a new matplotlib figure.

    Column 0 and column 1 of ``data`` are used as the x and y coordinates and
    column 2 as the class label. Label 0 is drawn as green circles and label
    1 as red triangles; rows with any other label are not drawn.

    Args:
        data: 2-D array or matrix with at least three columns.
    """
    figure = plt.figure()
    ax = figure.add_subplot(111)
    points = np.asarray(data)
    labels = points[:, 2]
    # One scatter call per class instead of one per sample: the latter
    # creates a separate artist for every point.
    for label, marker, color in ((0, 'o', 'green'), (1, '^', 'red')):
        members = points[labels == label]
        if len(members):
            ax.scatter(members[:, 0], members[:, 1], marker=marker, s=80, c=color)

if __name__ == '__main__':
    # Columns 0-1 are handed to the clustering routines; column 2 is read by
    # DisplayData as the class label.
    data = LoadDataSet('data.txt')

    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # replacement for measuring elapsed time.
    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans_(data[:, 0:2], 2)  # fast (vectorised) version
    end_time = time.perf_counter()
    print("Kmeans_ Execution Time: {:.8f}s ".format(end_time - start_time))

    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans(data[:, 0:2], 2)  # plain (per-sample) version
    end_time = time.perf_counter()
    print("Kmeans Execution Time: {:.8f}s ".format(end_time - start_time))

    # Plot the samples, then mark the centres returned by the last run (Kmeans).
    DisplayData(data)
    plt.plot(centers[:, 0], centers[:, 1], '+', color='black', markersize=40)
    plt.show()

下面为测试结果:
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值