概念
在算法开始时,需要随机选择K个数据点作为初始的聚类中心。
对于数据集中的每个数据点,计算其与每个聚类中心的距离,并将其分配给距离最近的聚类中心。
新的聚类中心是该聚类内所有数据点的均值。
重复分配和更新步骤,直到满足终止条件(例如聚类中心不再变化,或达到最大迭代次数)。
代码实现
安装sklearn库
pip install -U scikit-learn
完整代码
用GPU运行(输入张量位于GPU上;sklearn的KMeans本身仍在CPU上计算,代码中会先把数据转到CPU并转换为numpy数组)
import math
import torch
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
def kmeans(fx, n, metric='cosine', random_state=None):
    """Cluster the rows of ``fx`` with scikit-learn K-means and average each cluster.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row (rows are normalised
            along ``dim=1`` and averaged along ``axis=0``).
        n: Number of clusters requested from ``KMeans``.
        metric: ``'cosine'`` L2-normalises every row before clustering;
            ``'euclidean'`` clusters the raw features.
        random_state: Forwarded to ``KMeans``. ``None`` (the default) keeps
            the unseeded behaviour; pass an int for reproducible labels.

    Returns:
        A tuple ``(fp, labels)``. ``fp`` is a float32 tensor on ``fx``'s
        device with one row per distinct label (in ascending label order),
        each row being the mean of the *un-normalised* samples assigned to
        that label. ``labels`` is the NumPy array returned by
        ``KMeans.fit_predict``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    device = fx.device
    if metric == 'cosine':
        # Clamp the norm so an all-zero row does not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    elif metric == 'euclidean':
        fn = fx
    else:
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    # scikit-learn works on CPU NumPy arrays, so leave the device/autograd here.
    fn = fn.detach().cpu().numpy()
    fx = fx.detach().cpu().numpy()

    labels = KMeans(n_clusters=n, random_state=random_state).fit_predict(fn)

    # Computed once and reused for both the trace output and the averaging.
    unique_labels = np.unique(labels)
    for li in unique_labels:
        print("li:", li)

    # Average the original (un-normalised) features of every cluster.
    fp = np.stack([np.mean(fx[labels == li], axis=0) for li in unique_labels])
    fp = torch.FloatTensor(fp).to(device)
    return fp, labels
def kmeans1(fx, n, metric='cosine', random_state=0):
    """Fit scikit-learn K-means on the rows of ``fx`` and return the fitted centres.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters requested from ``KMeans``.
        metric: ``'cosine'`` L2-normalises every row before clustering;
            ``'euclidean'`` clusters the raw features.
        random_state: Seed forwarded to ``KMeans``; defaults to ``0``, the
            value this function has always used.

    Returns:
        ``KMeans.cluster_centers_`` as a NumPy array. The centres live in the
        space that was clustered, i.e. for ``'cosine'`` they are centres of
        the normalised rows, not of the raw features.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric == 'cosine':
        # Clamp the norm so an all-zero row does not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    elif metric == 'euclidean':
        fn = fx
    else:
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    fn = fn.detach().cpu().numpy()

    # Named ``model`` so the sibling ``kmeans`` function is not shadowed; only
    # the centres are needed, so no separate predict pass is run.
    model = KMeans(n_clusters=n, random_state=random_state)
    model.fit(fn)
    return model.cluster_centers_
# Draw 20 random 3-D samples and move them to the selected device.
fx = torch.randn(20, 3).to(device)
print("fx:", fx)
print("fx的第一列:", fx[:, 0])  # first column
print("fx的第二列:", fx[:, 1])  # second column

# Cluster the same samples with both helpers, using cosine normalisation.
fp, labels = kmeans(fx, n=3, metric='cosine')
centroids = kmeans1(fx, n=3, metric='cosine')

# Report the assignments, the per-cluster raw means and the fitted centres.
for tag, value in (("labels:", labels), ("fp", fp), ("centroids", centroids)):
    print(tag, value)
运行结果
fx: tensor([[-1.2288, -0.2048, -0.8777],
[ 1.3793, 1.1049, -0.5403],
[-2.1222, -0.4840, -1.5972],
[-0.8414, -1.3761, -1.7024],
[-1.2135, -2.0154, -0.3673],
[ 2.7906, -1.0475, -0.6564],
[-0.8736, -0.6809, 0.9324],
[ 1.2725, -0.0381, 0.5944],
[ 0.8706, 1.0110, -0.1519],
[-0.4159, 0.6281, -0.0885],
[-0.3400, 0.8238, -1.4164],
[ 0.0862, 0.3788, 0.1898],
[-0.6671, 1.6845, -0.5524],
[-2.3690, 0.8054, 0.3174],
[-0.5590, -1.2533, -0.5782],
[-1.1654, -0.7832, 2.4393],
[ 0.9433, 1.0865, -1.5112],
[-0.2004, 0.0918, -0.3463],
[-0.6177, 1.4732, 0.8773],
[ 0.8457, -1.6843, -1.6098]], device='cuda:0')
fx的第一列: tensor([-1.2288, 1.3793, -2.1222, -0.8414, -1.2135, 2.7906, -0.8736, 1.2725,
0.8706, -0.4159, -0.3400, 0.0862, -0.6671, -2.3690, -0.5590, -1.1654,
0.9433, -0.2004, -0.6177, 0.8457], device='cuda:0')
fx的第二列: tensor([-0.2048, 1.1049, -0.4840, -1.3761, -2.0154, -1.0475, -0.6809, -0.0381,
1.0110, 0.6281, 0.8238, 0.3788, 1.6845, 0.8054, -1.2533, -0.7832,
1.0865, 0.0918, 1.4732, -1.6843], device='cuda:0')
li: 0
li: 1
li: 2
labels: [2 0 2 2 2 0 1 0 0 1 2 1 1 1 2 1 0 2 1 2]
fp tensor([[ 1.4513, 0.4234, -0.4531],
[-0.8604, 0.5008, 0.5879],
[-0.7074, -0.7628, -1.0619]], device='cuda:0')
centroids [[-0.4281336 0.42465085 0.30694795]
[ 0.73335594 0.30023545 -0.18434533]
[-0.39830536 -0.31902665 -0.5969306 ]]
无GPU
import math
import torch
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def kmeans(fx, n, metric='cosine', random_state=None):
    """Cluster the rows of ``fx`` with scikit-learn K-means and average each cluster.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row (rows are normalised
            along ``dim=1`` and averaged along ``axis=0``).
        n: Number of clusters requested from ``KMeans``.
        metric: ``'cosine'`` L2-normalises every row before clustering;
            ``'euclidean'`` clusters the raw features.
        random_state: Forwarded to ``KMeans``. ``None`` (the default) keeps
            the unseeded behaviour; pass an int for reproducible labels.

    Returns:
        A tuple ``(fp, labels)``. ``fp`` is a float32 tensor on ``fx``'s
        device with one row per distinct label (in ascending label order),
        each row being the mean of the *un-normalised* samples assigned to
        that label. ``labels`` is the NumPy array returned by
        ``KMeans.fit_predict``.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    device = fx.device
    if metric == 'cosine':
        # Clamp the norm so an all-zero row does not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    elif metric == 'euclidean':
        fn = fx
    else:
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    # scikit-learn works on CPU NumPy arrays, so leave the device/autograd here.
    fn = fn.detach().cpu().numpy()
    fx = fx.detach().cpu().numpy()

    labels = KMeans(n_clusters=n, random_state=random_state).fit_predict(fn)

    # Computed once and reused for both the trace output and the averaging.
    unique_labels = np.unique(labels)
    for li in unique_labels:
        print("li:", li)

    # Average the original (un-normalised) features of every cluster.
    fp = np.stack([np.mean(fx[labels == li], axis=0) for li in unique_labels])
    fp = torch.FloatTensor(fp).to(device)
    return fp, labels
def kmeans1(fx, n, metric='cosine', random_state=0):
    """Fit scikit-learn K-means on the rows of ``fx`` and return the fitted centres.

    Args:
        fx: 2-D ``torch.Tensor`` with one sample per row.
        n: Number of clusters requested from ``KMeans``.
        metric: ``'cosine'`` L2-normalises every row before clustering;
            ``'euclidean'`` clusters the raw features.
        random_state: Seed forwarded to ``KMeans``; defaults to ``0``, the
            value this function has always used.

    Returns:
        ``KMeans.cluster_centers_`` as a NumPy array. The centres live in the
        space that was clustered, i.e. for ``'cosine'`` they are centres of
        the normalised rows, not of the raw features.

    Raises:
        KeyError: If ``metric`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    if metric == 'cosine':
        # Clamp the norm so an all-zero row does not cause a division by zero.
        fn = fx / torch.clamp(torch.norm(fx, dim=1, keepdim=True), min=1e-20)
    elif metric == 'euclidean':
        fn = fx
    else:
        raise KeyError(
            f"unsupported metric {metric!r}; expected 'cosine' or 'euclidean'"
        )

    fn = fn.detach().cpu().numpy()

    # Named ``model`` so the sibling ``kmeans`` function is not shadowed; only
    # the centres are needed, so no separate predict pass is run.
    model = KMeans(n_clusters=n, random_state=random_state)
    model.fit(fn)
    return model.cluster_centers_
# Draw 20 random 3-D samples (kept on the CPU in this variant).
fx = torch.randn(20,3)
print("fx:",fx)
print("fx的第一列:",fx[:, 0])  # first column
print("fx的第二列:",fx[:, 1])  # second column

# Cluster the same samples with both helpers, using cosine normalisation.
fp,labels = kmeans(fx, 3, 'cosine')
centroids = kmeans1(fx, 3, 'cosine')
print("labels:",labels)
print("fp",fp)
print("centroids",centroids)

# Visualise the result using the first two feature dimensions.
# The red markers use ``fp`` (raw-space means of the clusters in ``labels``).
# ``centroids`` comes from a separate KMeans fit on L2-normalised rows, so it
# matches neither the raw coordinates plotted here nor the colouring.
points = fx.detach().cpu().numpy()
centers = fp.detach().cpu().numpy()
plt.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=300, alpha=0.5)
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
运行结果
fx: tensor([[-0.4175, 0.5362, -0.1313],
[-0.3649, 0.6443, -0.9595],
[-0.1892, -1.9962, -1.5457],
[-0.4292, 1.0525, -0.5355],
[-0.6457, -0.3774, -1.6356],
[-0.2087, -2.3619, -0.1109],
[-0.0064, 2.1141, -1.4321],
[-0.3927, -0.4896, 0.0839],
[-0.4398, -0.3246, -0.0715],
[ 0.2503, 0.3189, 0.5989],
[-0.9159, -0.6033, -0.6962],
[-0.3305, -0.2540, 0.6537],
[ 1.2764, 0.0415, 0.8948],
[-0.5654, -0.0995, -0.4954],
[-1.4912, 1.2908, 0.7805],
[-1.1452, 0.3472, -0.2968],
[ 0.0497, 0.7422, 2.5382],
[-0.2174, 0.0231, -1.9122],
[ 0.3302, 0.2542, 0.1803],
[ 0.4918, 0.8304, 0.9369]])
fx的第一列: tensor([-0.4175, -0.3649, -0.1892, -0.4292, -0.6457, -0.2087, -0.0064, -0.3927,
-0.4398, 0.2503, -0.9159, -0.3305, 1.2764, -0.5654, -1.4912, -1.1452,
0.0497, -0.2174, 0.3302, 0.4918])
fx的第二列: tensor([ 0.5362, 0.6443, -1.9962, 1.0525, -0.3774, -2.3619, 2.1141, -0.4896,
-0.3246, 0.3189, -0.6033, -0.2540, 0.0415, -0.0995, 1.2908, 0.3472,
0.7422, 0.0231, 0.2542, 0.8304])
li: 0
li: 1
li: 2
labels: [2 2 1 2 1 1 2 1 1 0 1 0 0 1 2 2 0 1 0 0]
fp tensor([[ 0.3446, 0.3222, 0.9671],
[-0.4469, -0.7787, -0.7980],
[-0.6424, 0.9975, -0.4291]])
centroids [[ 0.30834118 0.26631516 0.7164023 ]
[-0.4379066 -0.49237508 -0.46834427]
[-0.48004213 0.6436714 -0.3068624 ]]
图像仅用前两维特征作为坐标,而聚类是在全部三维特征上进行的,因此图中点与点之间的位置关系并不准确,仅作展示。