import numpy as np
import time
class KMeans:
    """Plain k-means clustering (Lloyd's algorithm) on NumPy arrays."""

    def __init__(self):
        return

    def kmeans(self, data, k, init_idxs=None):
        """Cluster the rows of ``data`` into ``k`` groups.

        Args:
            data: Array-like of shape (n, m); one sample per row. It is
                converted to a float array, so nested lists and integer
                arrays are accepted.
            k: Number of clusters, ``1 <= k <= n``.
            init_idxs: Optional sequence of ``k`` row indices whose samples
                are used as the initial centroids. When omitted, ``k``
                distinct rows are drawn with ``np.random.choice`` (seed the
                global NumPy RNG beforehand for reproducible results).

        Returns:
            A ``(clusters, nearest)`` tuple: ``clusters`` is a (k, m) float
            array of centroids and ``nearest`` is a length-n integer array
            giving the centroid index assigned to each sample.

        Raises:
            ValueError: If ``data`` is not a non-empty 2-D array, ``k`` is
                outside ``1..n``, or ``init_idxs`` does not hold ``k``
                indices.
        """
        data = np.asarray(data, dtype=float)
        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError("data must be a non-empty 2-D array of shape (n, m)")
        n = data.shape[0]
        if not 1 <= k <= n:
            raise ValueError(f"k must be between 1 and {n}, got {k}")

        if init_idxs is None:
            # Distinct rows, so no two clusters start on the same sample.
            init_idxs = np.random.choice(n, k, replace=False)
        else:
            init_idxs = np.asarray(init_idxs, dtype=int)
            if init_idxs.shape != (k,):
                raise ValueError(f"init_idxs must contain exactly {k} indices")

        # Fancy indexing copies, so updating centroids never mutates `data`.
        clusters = data[init_idxs]  # k * m

        # None marks "no assignment yet", which guarantees at least one
        # centroid update before the convergence check can succeed.
        nearest = None
        while True:
            # Broadcast (n, 1, m) against (1, k, m) to get all pairwise
            # differences without a Python loop.
            diff = data[:, np.newaxis, :] - clusters[np.newaxis, :, :]
            # Squared distances are enough: sqrt does not change the argmin.
            sq_distances = np.sum(diff ** 2, axis=2)  # n * k
            new_nearest = np.argmin(sq_distances, axis=1)

            if nearest is not None and np.array_equal(new_nearest, nearest):
                break

            nearest = new_nearest
            for i in range(k):
                members = data[nearest == i]
                # An empty cluster keeps its previous centroid; taking the
                # mean of zero rows would produce NaN.
                if len(members):
                    clusters[i] = members.mean(axis=0)
        return clusters, nearest
# Demo: cluster 30 two-dimensional samples into k groups and print the result.
# Each row is one sample with two float features.
data = np.array([
    [0.697, 0.460], [0.774, 0.376], [0.634, 0.264], [0.608, 0.318], [0.556, 0.215],
    [0.403, 0.237], [0.481, 0.149], [0.437, 0.211], [0.666, 0.091], [0.243, 0.267],
    [0.245, 0.057], [0.343, 0.099], [0.639, 0.161], [0.657, 0.198], [0.360, 0.370],
    [0.593, 0.042], [0.719, 0.103], [0.359, 0.188], [0.339, 0.241], [0.282, 0.257],
    [0.748, 0.232], [0.714, 0.346], [0.483, 0.312], [0.478, 0.437], [0.525, 0.369],
    [0.751, 0.489], [0.532, 0.472], [0.473, 0.376], [0.725, 0.445], [0.446, 0.459],
])

# Single source of truth for the cluster count: it drives both the printed
# header and the call below, so the two can no longer drift apart.
k = 3
print(f"当K={k}时:")
KM = KMeans()
print("聚类结果为:")
print(KM.kmeans(data, k))
# Tags: clustering, watermelon dataset
# First published 2023-12-12 14:10:55