深度学习花书 笔记4-Kmeans算法流程与实现
1. K-means 算法流程
2. K-means 编程实现
Kmeans_ 为快速(向量化)版,Kmeans 为普通(逐样本)版
'''
Created on Apr 11, 2020
Author: yali
'''
import os, sys
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import time
def LoadDataSet(fileName, delim='\t'):
    """Load a delimiter-separated numeric text file into a NumPy matrix.

    Args:
        fileName: path to the data file; one sample per line.
        delim: field delimiter (default: tab).

    Returns:
        np.matrix of shape (num_samples, num_fields) with float entries.
    """
    # Use a context manager so the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        stringArr = [line.strip().split(delim) for line in fr]
    datArr = [list(map(float, line)) for line in stringArr]
    return np.mat(datArr)
def RandomClusterCenter(data, k):
    """Pick k random initial centroids inside the data's bounding box.

    Each coordinate is drawn uniformly from [column min, column max).

    Args:
        data: (m, n) matrix of samples, one per row.
        k: number of centroids to generate.

    Returns:
        (k, n) np.matrix of random centroids.
    """
    n = data.shape[1]
    centers = np.mat(np.zeros((k, n)))
    for col in range(n):
        low = np.min(data[:, col])
        span = float(np.max(data[:, col]) - low)
        # uniform draw within the observed range of this feature
        centers[:, col] = low + span * np.random.rand(k, 1)
    return centers
def ComputeDistance(data, centers):
    """Row-wise Euclidean distances between `data` (tiled) and `centers`.

    `data` is tiled along axis 0 as needed by `np.tile`; in this file it is
    always called with a single centroid row as `data` and the sample matrix
    as `centers`, yielding one distance per sample.

    Args:
        data: matrix whose rows are tiled/broadcast against `centers`.
        centers: matrix the tiled rows are subtracted from.

    Returns:
        Column matrix of Euclidean (L2) distances, one per row.
    """
    rows = data.shape[0]
    diff = np.tile(centers, (rows, 1)) - data
    # elementwise square, row sum, square root -> L2 norm per row
    return np.sqrt(np.multiply(diff, diff).sum(axis=1))
def Kmeans(data, k):
    """Plain k-means clustering (one sample reassigned at a time).

    Args:
        data: (m, n) matrix of samples, one per row.
        k: number of clusters.

    Returns:
        centers: (k, n) matrix of final cluster centroids.
        cluster: (m, 2) matrix; column 0 is the assigned cluster index,
                 column 1 is the squared distance to that centroid.
    """
    m, n = data.shape
    # column 0: cluster label, column 1: squared distance to its centroid
    cluster = np.mat(np.zeros((m, 2)))
    centers = RandomClusterCenter(data, k)  # random initial centroids
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        for i in range(m):
            min_distance = np.inf
            min_cluster = -1
            # assign sample i to its nearest centroid
            for j in range(k):
                distance = ComputeDistance(centers[j, :], data[i, :])
                if distance < min_distance:
                    min_distance = distance
                    min_cluster = j
            if cluster[i, 0] != min_cluster:
                # at least one label changed -> another pass is needed
                cluster_changed = True
            cluster[i, :] = min_cluster, min_distance ** 2
        for cent in range(k):
            members = data[np.nonzero(cluster[:, 0].A == cent)[0]]
            # BUG FIX: np.mean over an empty selection yields NaN and poisons
            # every later iteration; keep the old centroid if a cluster is empty.
            if members.shape[0] > 0:
                centers[cent, :] = np.mean(members, axis=0)
    return centers, cluster
def Kmeans_(data, k):
    """Vectorized ("fast") k-means: distances from each centroid to ALL
    samples are computed in one call per centroid.

    Args:
        data: (m, n) matrix of samples, one per row.
        k: number of clusters.

    Returns:
        centers: (k, n) matrix of final cluster centroids.
        cluster: (m, 2) matrix; column 0 is the assigned cluster index,
                 column 1 is the squared distance to that centroid.
    """
    m, n = data.shape
    cluster = np.mat(np.zeros((m, 2)))
    centers = RandomClusterCenter(data, k)  # random initial centroids
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        # BUG FIX: the running minima must be reset on EVERY pass. The
        # original initialized them once outside the while loop, so after the
        # centroids moved, samples could never be reassigned to a centroid
        # that is now farther than the stale recorded minimum.
        min_distance = np.mat(np.full((m, 1), np.inf))
        min_cluster = np.mat(np.full((m, 1), -1.0))
        for j in range(k):
            distance = ComputeDistance(centers[j, :], data)  # (m, 1)
            closer = np.where(distance < min_distance)[0]
            min_distance[closer] = distance[closer]
            min_cluster[closer] = j
        if np.sum(np.abs(cluster[:, 0] - min_cluster)) != 0:
            # some label changed -> another pass is needed
            cluster_changed = True
        cluster[:, 0] = min_cluster
        cluster[:, 1] = np.multiply(min_distance, min_distance)
        for cent in range(k):
            members = data[np.nonzero(cluster[:, 0].A == cent)[0]]
            # keep the old centroid if the cluster went empty (avoids NaN)
            if members.shape[0] > 0:
                centers[cent, :] = np.mean(members, axis=0)
    return centers, cluster
def DisplayData(data):
    """Scatter-plot labelled 2-D samples on a new figure.

    Class 0 is drawn as green circles, class 1 as red triangles; rows with
    any other label are skipped.

    Args:
        data: matrix whose columns are (x, y, class_label).
    """
    ax = plt.figure().add_subplot(111)
    for row in range(len(data)):
        label = data[row, 2]
        if label == 0:
            ax.scatter(data[row, 0], data[row, 1], marker='o', s=80, c='green')
        if label == 1:
            ax.scatter(data[row, 0], data[row, 1], marker='^', s=80, c='red')
if __name__ == '__main__':
    data = LoadDataSet('data.txt')
    # BUG FIX: time.clock() was removed in Python 3.8; time.perf_counter()
    # is the documented replacement for benchmarking wall-clock time.
    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans_(data[:, 0:2], 2)  # fast (vectorized) version
    end_time = time.perf_counter()
    print("Kmeans_ Execution Time: {:.8f}s ".format(end_time - start_time))
    start_time = time.perf_counter()
    centers, clusterAssment = Kmeans(data[:, 0:2], 2)  # plain version
    end_time = time.perf_counter()
    print("Kmeans Execution Time: {:.8f}s ".format(end_time - start_time))
    DisplayData(data)
    # mark the final centroids with large black '+' markers
    plt.plot(centers[:, 0], centers[:, 1], '+', color='black', markersize=40)
    plt.show()
下面为测试结果: