# K-means implementation in Python
import numpy as np
import random
import matplotlib.pyplot as plt
"""
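1. Pick any k of the n input vectors as the initial cluster centres.
2. For every vector, compute its distance to each of the k centre vectors chosen in step (1).
3. Each vector now has a distance to all k centres, some larger and some smaller; assign the vector to the cluster of the centre it is closest to.
4. Recompute the centre vector of every cluster.
5. Repeat steps (2)-(4) until the assignment of vectors to clusters barely changes. For example, if fewer than 1% of the vectors still switch cluster after an iteration, the clustering can be considered complete.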
"""
# Figure and axes are created once, when the module is loaded. show_cluster()
# draws on `ax`, and the __main__ loop clears it before every redraw.
fig, ax = plt.subplots()
def find_centroids(points: list,
                   k: int) -> list:
    """
    Randomly pick k distinct entries of ``points`` to use as initial centroids.

    :param points: candidate points; any iterable is accepted, for example a
        list of coordinate arrays or a 2-D ``np.ndarray`` whose rows are points
    :param k: number of centroids to pick
    :return: list of k points sampled without replacement
    :raises ValueError: if k is negative or larger than the number of points
        (raised by ``random.sample``)
    """
    # random.sample only accepts real sequences, so a NumPy array passed
    # directly would be rejected with TypeError. Materialising a list first
    # leaves list callers unchanged and makes array input work as well.
    return random.sample(list(points), k)
def calculate_distance(vecA: np.ndarray,
                       vecB: np.ndarray) -> float:
    """
    Return the Euclidean (L2) distance between two vectors.

    :param vecA: first vector
    :param vecB: second vector; it is subtracted from ``vecA`` element-wise
    :return: square root of the sum of the squared component differences
    """
    difference = vecA - vecB
    squared_total = np.sum(np.square(difference))
    return np.sqrt(squared_total)
def min_distance(points: np.ndarray,
                 centroid_list: list) -> dict:
    """
    Group every point under the index of its nearest centroid.

    Distances are computed with ``calculate_distance``. When two centroids are
    equally close, the one with the lower index wins. Only indices that
    receive at least one point appear in the result.

    :param points: iterable of points (for example the rows of a 2-D array)
    :param centroid_list: sequence of centroids
    :return: dict mapping centroid index -> list of the points assigned to it
    :raises ValueError: if there is a point to assign but no centroid at all
    """
    # Convert the centroids once instead of once per point.
    centroids = [np.array(centroid) for centroid in centroid_list]
    cluster_dict = {}
    for element in points:
        if not centroids:
            # Without this guard every point would silently be filed under a
            # cluster 0 that has no centroid.
            raise ValueError("centroid_list must contain at least one centroid")
        vecA = np.array(element)
        flag = 0
        minDis = float("inf")
        for i, vecB in enumerate(centroids):
            distance = calculate_distance(vecA, vecB)
            # Strict comparison keeps the first (lowest-index) centroid on ties.
            if distance < minDis:
                minDis = distance
                flag = i
        cluster_dict.setdefault(flag, []).append(element)
    return cluster_dict
def get_centroids(cluster_dict: dict) -> list:
    """
    Compute the centroid of every cluster as the per-coordinate mean of its points.

    Clusters are processed in ascending key order rather than dict insertion
    order. When the keys are exactly 0..k-1, the centroid of cluster ``i`` is
    therefore found at index ``i`` of the result. If some index is missing
    from ``cluster_dict``, later centroids shift down by one position.

    :param cluster_dict: mapping cluster index -> list of member points
    :return: list of centroid arrays ordered by ascending cluster key
    """
    return [
        np.mean(np.array(cluster_dict[key_]), axis=0)
        for key_ in sorted(cluster_dict)
    ]
def calculate_Var(cluster_dict: dict,
                  centroid_list: list) -> float:
    """
    Return the total point-to-centroid distance of a clustering.

    For each cluster, the distances (as given by ``calculate_distance``)
    between its centroid and every member are added up; the per-cluster totals
    are then summed. Distances are neither squared nor averaged.

    :param cluster_dict: mapping cluster index -> list of member points
    :param centroid_list: centroids, indexed by the keys of ``cluster_dict``
    :return: sum of all point-to-centroid distances
    """
    total = 0.0
    for label, members in cluster_dict.items():
        center = np.array(centroid_list[label])
        cluster_total = 0.0
        for member in members:
            cluster_total += calculate_distance(center, np.array(member))
        total += cluster_total
    return total
def show_cluster(centroid_list: list,
                 cluster_list: dict) -> None:
    """
    Draw a clustering on the module-level axes ``ax``.

    Each cluster gets its own colour: members are drawn as circles and the
    centroid as a larger diamond. Only the first two coordinates of every
    point and centroid are plotted. A legend is added for whichever of the
    two marker kinds was actually drawn.

    :param centroid_list: centroids, indexed by the keys of ``cluster_list``
    :param cluster_list: mapping cluster index -> list of member points
    :return: None
    """
    # Magenta replaces white as the sixth colour: white markers cannot be seen
    # on the default white axes background.
    colorMark = ['or', 'ob', 'og', 'ok', 'oy', 'om']
    centroidMark = ['dr', 'db', 'dg', 'dk', 'dy', 'dm']
    centroid_handle = None
    points_handle = None
    for key_ in cluster_list:
        # Cycle through the palette so that more than six clusters reuse
        # colours instead of raising IndexError.
        style = key_ % len(colorMark)
        centroid_handle, = ax.plot(centroid_list[key_][0],
                                   centroid_list[key_][1],
                                   centroidMark[style],
                                   markersize=12)
        members = np.asarray(cluster_list[key_])
        if members.size:
            # One call per cluster instead of one per point. The format string
            # names a marker but no line style, so no connecting line is drawn.
            points_handle, = ax.plot(members[:, 0], members[:, 1],
                                     colorMark[style])
    # Build the legend only from handles that exist, so an empty clustering or
    # a cluster without members cannot trigger a NameError.
    handles = []
    labels = []
    if centroid_handle is not None:
        handles.append(centroid_handle)
        labels.append('centroids')
    if points_handle is not None:
        handles.append(points_handle)
        labels.append('points')
    if handles:
        ax.legend(handles, labels)
# Demo configuration, read by the __main__ block below.
# Number of random sample points to generate.
num_of_points = 200
# Number of clusters (k) requested from find_centroids().
num_of_classes = 5
# Sample data: num_of_points rows of 2 coordinates, each np.random.rand value scaled by 20.
data = np.random.rand(num_of_points, 2)*20
# Convergence threshold: iteration stops once the cost changes by less than this.
epsilon = 0.00001
# Value passed to plt.pause() after each redraw.
pause = 1.0
if __name__ == '__main__':
    # Initial state: random centroids, the first assignment and its cost.
    centroidList = find_centroids(list(data), num_of_classes)
    clusterDict = min_distance(data, centroidList)
    newVar = calculate_Var(clusterDict, centroidList)

    # The cost is a sum of distances and therefore non-negative, so seeding the
    # previous cost with -epsilon guarantees the loop body runs at least once.
    oldVar = -epsilon
    while epsilon <= abs(newVar - oldVar):
        # Update step: move the centroids, then reassign every point.
        centroidList = get_centroids(clusterDict)
        clusterDict = min_distance(data, centroidList)
        oldVar, newVar = newVar, calculate_Var(clusterDict, centroidList)

        # Redraw the current clustering and give the GUI time to show it.
        ax.cla()
        show_cluster(centroidList, clusterDict)
        plt.pause(pause)