python实现K-means聚类

lebrongao

于 2024-10-05 14:45:13 发布

阅读量273

点赞数 5

文章标签： python kmeans 聚类

本文链接：https://blog.csdn.net/lebrongao/article/details/142714221

版权

概念

在算法开始时，需要随机选择K个数据点作为初始的聚类中心。

对于数据集中的每个数据点，计算其与每个聚类中心的距离，并将其分配给距离最近的聚类中心。

分配完成后更新聚类中心：每个聚类的新中心是该聚类内所有数据点的均值。

重复上述分配和更新步骤，直到满足终止条件（例如聚类中心不再变化，或达到最大迭代次数）。

代码实现

安装sklearn库

pip install -U scikit-learn

完整代码

用GPU运行

import math
import torch
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def kmeans(fx, n, metric='cosine'):
    """Cluster the rows of ``fx`` with scikit-learn K-means and average each cluster.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters passed to ``KMeans``.
        metric: ``'cosine'`` clusters L2-normalised rows; ``'euclidean'``
            clusters the rows unchanged.

    Returns:
        ``(fp, labels)``. ``fp`` is a float32 tensor on ``fx``'s device holding,
        for every label that occurs (in ascending label order), the mean of the
        *raw* rows of ``fx`` assigned to it. ``labels`` is the NumPy array of
        per-row cluster indices returned by ``KMeans.fit_predict``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric not in ('cosine', 'euclidean'):
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    device = fx.device
    if metric == 'cosine':
        # Clamp the norm so all-zero rows do not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    else:
        fn = fx
    fn = fn.detach().cpu().numpy()
    fx = fx.detach().cpu().numpy()

    # Fixed seed: results are reproducible and cluster indices agree with
    # kmeans1, which fits the same data with random_state=0.
    labels = KMeans(n_clusters=n, random_state=0).fit_predict(fn)

    cluster_means = []
    for li in np.unique(labels):
        print("li:", li)
        # Average the raw (un-normalised) features of this cluster.
        cluster_means.append(np.mean(fx[labels == li], axis=0))
    fp = torch.FloatTensor(np.stack(cluster_means)).to(device)
    return fp, labels

def kmeans1(fx, n, metric='cosine'):
    """Fit scikit-learn K-means on the rows of ``fx`` and return its centroids.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters passed to ``KMeans``.
        metric: ``'cosine'`` fits on L2-normalised rows; ``'euclidean'`` fits
            on the rows unchanged.

    Returns:
        The fitted model's ``cluster_centers_`` (NumPy array). The centroids
        live in the space the model was fitted in, i.e. the normalised
        features when ``metric='cosine'``, not the raw values of ``fx``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric not in ('cosine', 'euclidean'):
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    if metric == 'cosine':
        # Clamp the norm so all-zero rows do not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    else:
        fn = fx
    fn = fn.detach().cpu().numpy()

    # Only the centroids are returned, so fitting is enough; no separate
    # predict pass is needed.
    model = KMeans(n_clusters=n, random_state=0)
    model.fit(fn)
    return model.cluster_centers_
# Draw 20 random 3-D samples on the CPU, then move them to the selected device.
samples = torch.randn(20, 3)
fx = samples.to(device)
print("fx：", fx)

# Show the first two feature columns.
for caption, col in (("fx的第一列：", 0), ("fx的第二列：", 1)):
    print(caption, fx[:, col])

# Run both helpers on the same data with the cosine metric.
fp, labels = kmeans(fx, 3, metric='cosine')
centroids = kmeans1(fx, 3, metric='cosine')

for caption, value in (("labels:", labels), ("fp", fp), ("centroids", centroids)):
    print(caption, value)

运行结果

fx： tensor([[-1.2288, -0.2048, -0.8777],
        [ 1.3793,  1.1049, -0.5403],
        [-2.1222, -0.4840, -1.5972],
        [-0.8414, -1.3761, -1.7024],
        [-1.2135, -2.0154, -0.3673],
        [ 2.7906, -1.0475, -0.6564],
        [-0.8736, -0.6809,  0.9324],
        [ 1.2725, -0.0381,  0.5944],
        [ 0.8706,  1.0110, -0.1519],
        [-0.4159,  0.6281, -0.0885],
        [-0.3400,  0.8238, -1.4164],
        [ 0.0862,  0.3788,  0.1898],
        [-0.6671,  1.6845, -0.5524],
        [-2.3690,  0.8054,  0.3174],
        [-0.5590, -1.2533, -0.5782],
        [-1.1654, -0.7832,  2.4393],
        [ 0.9433,  1.0865, -1.5112],
        [-0.2004,  0.0918, -0.3463],
        [-0.6177,  1.4732,  0.8773],
        [ 0.8457, -1.6843, -1.6098]], device='cuda:0')
fx的第一列： tensor([-1.2288,  1.3793, -2.1222, -0.8414, -1.2135,  2.7906, -0.8736,  1.2725,
         0.8706, -0.4159, -0.3400,  0.0862, -0.6671, -2.3690, -0.5590, -1.1654,
         0.9433, -0.2004, -0.6177,  0.8457], device='cuda:0')
fx的第二列： tensor([-0.2048,  1.1049, -0.4840, -1.3761, -2.0154, -1.0475, -0.6809, -0.0381,
         1.0110,  0.6281,  0.8238,  0.3788,  1.6845,  0.8054, -1.2533, -0.7832,
         1.0865,  0.0918,  1.4732, -1.6843], device='cuda:0')
li: 0
li: 1
li: 2
labels: [2 0 2 2 2 0 1 0 0 1 2 1 1 1 2 1 0 2 1 2]
fp tensor([[ 1.4513,  0.4234, -0.4531],
        [-0.8604,  0.5008,  0.5879],
        [-0.7074, -0.7628, -1.0619]], device='cuda:0')
centroids [[-0.4281336   0.42465085  0.30694795]
 [ 0.73335594  0.30023545 -0.18434533]
 [-0.39830536 -0.31902665 -0.5969306 ]]

无GPU

import math
import torch
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def kmeans(fx, n, metric='cosine'):
    """Cluster the rows of ``fx`` with scikit-learn K-means and average each cluster.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters passed to ``KMeans``.
        metric: ``'cosine'`` clusters L2-normalised rows; ``'euclidean'``
            clusters the rows unchanged.

    Returns:
        ``(fp, labels)``. ``fp`` is a float32 tensor on ``fx``'s device holding,
        for every label that occurs (in ascending label order), the mean of the
        *raw* rows of ``fx`` assigned to it. ``labels`` is the NumPy array of
        per-row cluster indices returned by ``KMeans.fit_predict``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric not in ('cosine', 'euclidean'):
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    device = fx.device
    if metric == 'cosine':
        # Clamp the norm so all-zero rows do not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    else:
        fn = fx
    fn = fn.detach().cpu().numpy()
    fx = fx.detach().cpu().numpy()

    # Fixed seed: results are reproducible and cluster indices agree with
    # kmeans1, which fits the same data with random_state=0.
    labels = KMeans(n_clusters=n, random_state=0).fit_predict(fn)

    cluster_means = []
    for li in np.unique(labels):
        print("li:", li)
        # Average the raw (un-normalised) features of this cluster.
        cluster_means.append(np.mean(fx[labels == li], axis=0))
    fp = torch.FloatTensor(np.stack(cluster_means)).to(device)
    return fp, labels

def kmeans1(fx, n, metric='cosine'):
    """Fit scikit-learn K-means on the rows of ``fx`` and return its centroids.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters passed to ``KMeans``.
        metric: ``'cosine'`` fits on L2-normalised rows; ``'euclidean'`` fits
            on the rows unchanged.

    Returns:
        The fitted model's ``cluster_centers_`` (NumPy array). The centroids
        live in the space the model was fitted in, i.e. the normalised
        features when ``metric='cosine'``, not the raw values of ``fx``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric not in ('cosine', 'euclidean'):
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    if metric == 'cosine':
        # Clamp the norm so all-zero rows do not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    else:
        fn = fx
    fn = fn.detach().cpu().numpy()

    # Only the centroids are returned, so fitting is enough; no separate
    # predict pass is needed.
    model = KMeans(n_clusters=n, random_state=0)
    model.fit(fn)
    return model.cluster_centers_
# Draw 20 random 3-D samples (CPU tensor).
fx = torch.randn(20,3)
print("fx：",fx)
print("fx的第一列：",fx[:, 0])  # first column
print("fx的第二列：",fx[:, 1])  # second column
fp,labels = kmeans(fx, 3, 'cosine')
centroids = kmeans1(fx, 3, 'cosine')
print("labels:",labels)
print("fp",fp)
print("centroids",centroids)

# Visualise the result using the first two feature dimensions only.
# The red markers use fp (per-cluster means of the raw features) rather than
# kmeans1's centroids: with the cosine metric those centroids are fitted on
# L2-normalised features, so they are not in the same coordinates as the raw
# points drawn here.
points = fx.detach().cpu().numpy()
centers = fp.detach().cpu().numpy()
plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=300, alpha=0.5)
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

运行结果

fx： tensor([[-0.4175,  0.5362, -0.1313],
        [-0.3649,  0.6443, -0.9595],
        [-0.1892, -1.9962, -1.5457],
        [-0.4292,  1.0525, -0.5355],
        [-0.6457, -0.3774, -1.6356],
        [-0.2087, -2.3619, -0.1109],
        [-0.0064,  2.1141, -1.4321],
        [-0.3927, -0.4896,  0.0839],
        [-0.4398, -0.3246, -0.0715],
        [ 0.2503,  0.3189,  0.5989],
        [-0.9159, -0.6033, -0.6962],
        [-0.3305, -0.2540,  0.6537],
        [ 1.2764,  0.0415,  0.8948],
        [-0.5654, -0.0995, -0.4954],
        [-1.4912,  1.2908,  0.7805],
        [-1.1452,  0.3472, -0.2968],
        [ 0.0497,  0.7422,  2.5382],
        [-0.2174,  0.0231, -1.9122],
        [ 0.3302,  0.2542,  0.1803],
        [ 0.4918,  0.8304,  0.9369]])
fx的第一列： tensor([-0.4175, -0.3649, -0.1892, -0.4292, -0.6457, -0.2087, -0.0064, -0.3927,
        -0.4398,  0.2503, -0.9159, -0.3305,  1.2764, -0.5654, -1.4912, -1.1452,
         0.0497, -0.2174,  0.3302,  0.4918])
fx的第二列： tensor([ 0.5362,  0.6443, -1.9962,  1.0525, -0.3774, -2.3619,  2.1141, -0.4896,
        -0.3246,  0.3189, -0.6033, -0.2540,  0.0415, -0.0995,  1.2908,  0.3472,
         0.7422,  0.0231,  0.2542,  0.8304])
li: 0
li: 1
li: 2
labels: [2 2 1 2 1 1 2 1 1 0 1 0 0 1 2 2 0 1 0 0]
fp tensor([[ 0.3446,  0.3222,  0.9671],
        [-0.4469, -0.7787, -0.7980],
        [-0.6424,  0.9975, -0.4291]])
centroids [[ 0.30834118  0.26631516  0.7164023 ]
 [-0.4379066  -0.49237508 -0.46834427]
 [-0.48004213  0.6436714  -0.3068624 ]]

图像只取三维特征中的前两维作为坐标，相当于把数据投影到二维平面上，因此并不能准确反映聚类结果，仅作展示。