k-means聚类算法是一种经典的基于距离的聚类算法。其基本思想是:先指定需要划分的簇的个数k,并在数据集中选取k个样本作为初始聚类中心;接着计算每个样本到这k个聚类中心的距离,根据最近邻原则把样本划分到距离最近的聚类中心所在的簇;然后将每个簇内样本的均值作为新的聚类中心,并重复上述"划分—更新"过程,直到算法收敛(簇的划分不再变化)或者达到指定的迭代次数。算法流程如下:
算法流程图:
Python实现
import torch
import random
import numpy as np
from sklearn import datasets
def kmeans(data, N_way, iteration):
    """Cluster the rows of ``data`` into ``N_way`` groups with Lloyd's k-means.

    Args:
        data: 2-D tensor of shape (N, S): N samples with S features each.
            Non-floating-point input is converted to float32.
        N_way: Number of clusters; must satisfy ``1 <= N_way <= N``.
        iteration: Maximum number of assignment/update passes. At least one
            pass always runs; the loop stops early once the assignment of
            samples to clusters no longer changes.

    Returns:
        A tuple ``(C, G, P)``:
            C: (N_way, S) cluster centers after the last update.
            G: (N_way, N) hard assignment matrix from the last pass;
               ``G[k, i]`` is 1 if sample ``i`` belongs to cluster ``k``,
               otherwise 0.
            P: (N_way, N) squared distances from the last pass, normalised
               so each column sums to 1. A SMALLER value means the sample is
               closer to that center.

    Raises:
        ValueError: If ``N_way`` is not in ``[1, N]``.
    """
    if not data.is_floating_point():
        # Means and normalised distances need a floating-point dtype.
        data = data.float()

    N = data.shape[0]
    if not 1 <= N_way <= N:
        raise ValueError(
            "N_way must be between 1 and the number of samples (%d), got %r"
            % (N, N_way)
        )

    # Use N_way distinct, randomly chosen samples as the initial centers.
    C = data[random.sample(range(N), N_way), :]

    sample_idx = torch.arange(N, device=data.device)
    prev_assign = None

    for _ in range(max(1, iteration)):
        # Squared Euclidean distance between every center and every sample.
        diff = C.unsqueeze(1) - data.unsqueeze(0)  # (N_way, N, S)
        D = (diff * diff).sum(dim=2)               # (N_way, N)

        # Column-normalised distances. The clamp avoids 0/0 when a sample
        # coincides with every center.
        col_sum = D.sum(dim=0, keepdim=True)
        P = D / col_sum.clamp_min(torch.finfo(D.dtype).tiny)

        # Hard assignment: each sample goes to its nearest center.
        assign = D.argmin(dim=0)
        G = torch.zeros(N_way, N, dtype=data.dtype, device=data.device)
        G[assign, sample_idx] = 1

        # Update each center to the mean of its members. A cluster that lost
        # all members keeps its previous center instead of becoming NaN.
        counts = G.sum(dim=1, keepdim=True)  # samples per cluster, (N_way, 1)
        sums = torch.mm(G, data)
        C = torch.where(counts > 0, sums / counts.clamp_min(1), C)

        # Converged: the same assignment would reproduce the same centers.
        if prev_assign is not None and torch.equal(assign, prev_assign):
            break
        prev_assign = assign

    return C, G, P
if __name__ == '__main__':
    # Demo: cluster the iris data set into 3 groups and count how many samples
    # end up in the cluster whose majority label matches their true label.
    iris = datasets.load_iris()
    n_sample, _ = iris.data.shape
    data = torch.from_numpy(iris.data.astype(np.float32))
    C, G, P = kmeans(data, 3, 100)
    label = torch.as_tensor(iris.target, dtype=torch.long)

    # G is the hard assignment matrix, so the row holding the 1 is the cluster.
    # (P holds normalised distances, so its maximum would be the FARTHEST one.)
    cluster_of = torch.argmax(G, dim=0)

    # Cluster indices are arbitrary, so map each cluster to the most frequent
    # true label among its members before comparing with the ground truth.
    cluster_to_label = {}
    for k in range(G.shape[0]):
        members = label[cluster_of == k]
        if members.numel() > 0:
            cluster_to_label[k] = int(torch.bincount(members).argmax())

    count = 0
    for i in range(n_sample):
        pre = cluster_to_label[int(cluster_of[i])]
        print(pre)
        if pre == int(label[i]):
            count += 1
    print(count)