Python 手动实现 K-Means

import time
import numpy as np
from sklearn.datasets import make_blobs
def euclidean_distance(a, b):
    """Return the Euclidean distance between a and b along the last axis.

    The operands are subtracted with normal NumPy broadcasting, so they may
    be single vectors or stacks of vectors with compatible shapes.
    """
    diff = a - b
    squared = np.square(diff)
    # Collapse the feature (last) axis, then take the root.
    return np.sqrt(np.sum(squared, axis=-1))

def init_centers(x, k, seed=None):
    """Pick k initial cluster centers by shuffling the rows of x.

    Args:
        x: Array of samples; it is permuted along its first axis.
        k: Number of rows to take from the permuted result.
        seed: Optional seed. When given, a dedicated ``RandomState`` seeded
            with it is used, so the selection is reproducible. When ``None``
            the module-level ``np.random`` generator is used.

    Returns:
        The first k rows of the permuted samples.
    """
    # The class name is case-sensitive: ``np.random.RandomState``; the
    # lowercase spelling raises AttributeError.
    rng = np.random if seed is None else np.random.RandomState(seed=seed)
    return rng.permutation(x)[:k]


def classify_all_parallel(x, centers):
    """Assign every sample to its nearest center in one vectorised step.

    Args:
        x: Samples, shape [n, d].
        centers: Cluster centers, shape [k, d].

    Returns:
        Array of shape [n]; entry i is the index of the center with the
        smallest ``euclidean_distance`` to sample i.
    """
    n = x.shape[0]
    # Insert a singleton axis so [n, 1, d] broadcasts against [k, d].
    # Note that this materialises an [n, k, d] intermediate.
    x = x.reshape((n, 1, -1))
    # distance_matrix has shape [n, k].
    distance_matrix = euclidean_distance(x, centers)
    return np.argmin(distance_matrix, axis=-1)

def update_center(x, y):
    """Recompute each cluster center as the mean of its assigned samples.

    Args:
        x: Samples; rows are selected with a boolean mask built from y.
        y: Integer cluster label per sample.

    Returns:
        Array of shape (y.max() + 1, x.shape[-1]) holding one center per
        label index. A label index that has no samples gets the mean of
        all samples instead of the NaN that averaging an empty selection
        would produce.
    """
    k = int(y.max()) + 1
    centers = np.zeros(shape=(k, x.shape[-1]))
    overall_mean = None
    for i in range(k):
        members = x[y == i]
        if members.shape[0] == 0:
            # An empty cluster would become NaN, and argmin treats NaN as the
            # minimum, so every sample would collapse into this cluster.
            if overall_mean is None:
                overall_mean = x.mean(axis=0)
            centers[i] = overall_mean
        else:
            centers[i] = members.mean(axis=0)
    return centers

def kmeans(x, k, max_iter=100, parallel=False, seed=None, cuda=False):
    """Cluster x into k groups by alternating center updates and assignment.

    Args:
        x: Samples passed to the initialisation, assignment and update steps.
        k: Number of initial centers to draw.
        max_iter: Upper bound on the number of update/assign rounds.
        parallel: Accepted but not consulted; the vectorised classifier is
            always used.
        seed: Forwarded to ``init_centers``.
        cuda: Accepted but not used in this function.

    Returns:
        Tuple ``(labels, centers)`` from the last round that ran.
    """
    assign = classify_all_parallel  # the only classifier wired up here

    # Random starting centers, then a first assignment of every sample.
    centers = init_centers(x, k, seed=seed)
    labels = assign(x, centers)

    for _ in range(max_iter):
        # Move the centers, then reassign every sample.
        centers = update_center(x, labels)
        previous, labels = labels, assign(x, centers)
        # Stop as soon as no sample changes its label.
        if np.array_equal(labels, previous):
            print("finish!")
            break
    return labels, centers


def kmeansall(x, k, max_iter, package, parallel, seed, cuda=False):
    """Run ``kmeans`` once, print how long it took, and return the results.

    Args:
        x, k, max_iter, parallel, seed, cuda: Forwarded to ``kmeans``.
        package: Accepted but not used in this function.

    Returns:
        Tuple ``(labels, centers, cost)`` where cost is the elapsed time in
        seconds rounded to two decimals.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    y, centers = kmeans(x, k, max_iter=max_iter, parallel=parallel, seed=seed, cuda=cuda)
    cost = round(time.perf_counter() - start, 2)
    print(f"耗时: {cost} 秒")
    return y, centers, cost

if __name__ == '__main__':
    # Benchmark settings: cluster count, feature dimension, sample count,
    # the seed handed to the k-means run, and the iteration cap.
    k, n_features, n = 5, 2, 100000
    seed, max_iter = 234, 1000

    # Synthetic blobs; cluster_std runs from 1 up to k.
    x, y = make_blobs(
        n_samples=n,
        n_features=n_features,
        centers=k,
        cluster_std=range(1, k + 1),
    )

    for banner in (f"n_sample = {n}", "----------------------", "numpy speedup"):
        print(banner)

    print("********************")
    print("numpy normal")
    y1, c1, cost1 = kmeansall(
        x, k, max_iter, package="np", parallel=True, seed=seed, cuda=False
    )

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

samoyan

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值