import time
import numpy as np
from sklearn.datasets import make_blobs
def euclidean_distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    The inputs are broadcast against each other and the reduction runs over
    the last axis, so the result has the broadcast shape without its last
    dimension.

    Args:
        a: Array-like of coordinates; the last axis is the feature axis.
        b: Array-like broadcastable against ``a``.

    Returns:
        The distances along the last axis (a NumPy scalar for 1-D inputs).
    """
    # Coerce first so plain lists/tuples work, not only ndarrays.
    diff = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.sum(np.square(diff), axis=-1))
def init_centers(x, k, seed=None):
    """Pick ``k`` initial centers by sampling rows of ``x`` without replacement.

    Args:
        x: Sample matrix of shape ``[n, d]``; it is not modified.
        k: Number of centers to draw.
        seed: Optional seed. When given, a private ``RandomState`` is used so
            the draw is reproducible and the global NumPy random state is left
            untouched; when ``None`` the global state is used.

    Returns:
        The first ``k`` rows of a random row-permutation of ``x``.
    """
    # The ``np.random`` module exposes the same ``permutation`` function as a
    # RandomState instance, backed by the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed=seed)
    return rng.permutation(x)[:k]
def classify_all_parallel(x, centers):
    """Assign every sample to its nearest center using one broadcast operation.

    Args:
        x: Sample matrix of shape ``[n, d]``.
        centers: Center matrix of shape ``[k, d]``.

    Returns:
        Array of shape ``[n]`` holding, for each sample, the row index of the
        closest center.
    """
    # Insert a singleton axis ([n, 1, d]) so that subtracting ``centers``
    # broadcasts to [n, k, d]. Indexing is used instead of
    # ``reshape((n, 1, -1))`` because the latter cannot infer ``-1`` when
    # n == 0 and would raise on empty input.
    samples = x[:, np.newaxis, :]
    distance_matrix = euclidean_distance(samples, centers)  # [n, k]
    return np.argmin(distance_matrix, axis=-1)
def update_center(x, y, centers=None):
    """Recompute each cluster center as the mean of its assigned samples.

    Args:
        x: Sample matrix of shape ``[n, d]``.
        y: Integer cluster label for every sample, shape ``[n]``.
        centers: Optional previous centers of shape ``[k, d]``. When given,
            the result always has ``k`` rows and a cluster that currently has
            no samples keeps its previous position.

    Returns:
        Float array of centers. Without ``centers`` it has ``y.max() + 1``
        rows, and an empty cluster falls back to the mean of all samples
        instead of becoming NaN.

    Raises:
        ValueError: If ``y`` is empty and ``centers`` is not given, because
            the number of clusters cannot be determined.
    """
    y = np.asarray(y)
    if centers is not None:
        k = centers.shape[0]
    elif y.size == 0:
        raise ValueError("cannot infer the number of clusters from empty labels")
    else:
        k = int(y.max()) + 1

    updated = np.zeros(shape=(k, x.shape[-1]))
    overall_mean = None  # computed lazily; only needed for empty clusters
    for i in range(k):
        members = x[y == i]
        if members.shape[0] > 0:
            updated[i] = members.mean(axis=0)
        elif centers is not None:
            updated[i] = centers[i]
        else:
            # Taking the mean of an empty selection would yield NaN and
            # poison every later distance computation.
            if overall_mean is None:
                overall_mean = x.mean(axis=0)
            updated[i] = overall_mean
    return updated
def kmeans(x, k, max_iter=100, parallel=False, seed=None, cuda=False):
    """Cluster ``x`` into ``k`` groups by alternating assignment and update.

    Args:
        x: Sample matrix of shape ``[n, d]``.
        k: Number of clusters.
        max_iter: Maximum number of update/assign rounds.
        parallel: Accepted for interface compatibility; currently unused
            because the broadcast classifier is always selected.
        seed: Forwarded to ``init_centers`` for a reproducible start.
        cuda: Accepted for interface compatibility; currently unused.

    Returns:
        Tuple ``(labels, centers)``: the final label of every sample and the
        final center matrix.
    """
    classifier = classify_all_parallel

    # Random initial centers, then a first assignment of every sample.
    centers = init_centers(x, k, seed=seed)
    labels_new = classifier(x, centers)
    labels_old = labels_new

    for _ in range(max_iter):
        updated = update_center(x, labels_new)

        # A cluster that lost all of its samples can come back as a NaN row,
        # or be missing entirely when it is the highest-numbered one. Start
        # from the previous centers and overwrite only the rows that were
        # actually recomputed, so such clusters keep their last position.
        repaired = np.array(centers, dtype=float)
        usable = ~np.isnan(updated).any(axis=1)
        repaired[: updated.shape[0]][usable] = updated[usable]
        centers = repaired

        labels_new = classifier(x, centers)

        # Stop as soon as no sample changes cluster.
        if np.array_equal(labels_new, labels_old):
            print("finish!")
            break
        labels_old = labels_new

    return labels_new, centers
def kmeansall(x, k, max_iter, package, parallel, seed, cuda=False):
    """Run ``kmeans`` once, print the elapsed time and return it with the result.

    Args:
        x: Sample matrix passed through to ``kmeans``.
        k: Number of clusters.
        max_iter: Maximum number of iterations.
        package: Accepted for interface compatibility; not used here.
        parallel: Forwarded to ``kmeans``.
        seed: Forwarded to ``kmeans``.
        cuda: Forwarded to ``kmeans``.

    Returns:
        Tuple ``(labels, centers, cost)`` where ``cost`` is the elapsed time in
        seconds, rounded to two decimals.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    y, centers = kmeans(x, k, max_iter=max_iter, parallel=parallel, seed=seed, cuda=cuda)
    cost = round(time.perf_counter() - started, 2)
    print(f"耗时: {cost} 秒")
    return y, centers, cost
if __name__ == '__main__':
    # Benchmark configuration.
    k = 5
    n_features = 2
    n = 100000
    seed = 234
    max_iter = 1000

    # Synthetic blobs whose spread grows with the cluster index. The data
    # generator is seeded as well, so repeated runs time the same problem.
    x, y = make_blobs(
        n_samples=n,
        n_features=n_features,
        centers=k,
        cluster_std=range(1, k + 1),
        random_state=seed,
    )

    print(f"n_sample = {n}")
    print("----------------------")
    print("numpy speedup")
    print("********************")
    print("numpy normal")
    y1, c1, cost1 = kmeansall(x, k, max_iter, package="np", parallel=True, seed=seed, cuda=False)
# Hand-written k-means implementation in Python.
# (Web-page residue: "latest recommended article published 2024-04-29 18:16:07".)