import numpy as np
import pandas as pd
from sklearn import datasets as DS
import matplotlib.pyplot as plt
def euclideanDist(A, B):
    """Return the Euclidean (L2) distance between vectors A and B.

    Uses np.sum rather than the Python builtin sum: the builtin iterates
    the array element-by-element at Python speed, while np.sum reduces in
    native code. The numeric result is identical for 1-D float inputs.
    """
    return np.sqrt(np.sum((A - B) ** 2))
def RandomCenters(dataSet, k):
    """Pick k distinct rows of dataSet, uniformly at random, as initial centers.

    Sampling is without replacement, so the k centers are guaranteed to be
    distinct rows of the data.
    """
    sample_count = dataSet.shape[0]
    chosen_rows = np.random.choice(sample_count, size=k, replace=False)
    return dataSet[chosen_rows]
def KMeans(dataSet, k):
    """Cluster the rows of dataSet into k groups with Lloyd's algorithm.

    Parameters
    ----------
    dataSet : ndarray of shape (n, m) -- one sample per row.
    k : int -- number of clusters; also the number of initial centers
        drawn by RandomCenters.

    Returns
    -------
    Centers : ndarray of shape (k, m) -- final cluster centers (float).
    DistMatrix : ndarray of shape (n, 2) -- column 0 is each sample's
        cluster index, column 1 its distance to that cluster's center.
    """
    n, m = dataSet.shape
    # Float copy: if dataSet is integer-typed, writing a mean back into an
    # int array would silently truncate the center coordinates.
    Centers = RandomCenters(dataSet, k).astype(float)
    DistMatrix = np.zeros((n, 2))
    # Start labels at -1 ("unassigned"). Initializing to 0 would make a
    # first-pass assignment to cluster 0 look like "no change"; if every
    # sample landed in cluster 0 on the first pass, the loop would exit
    # without ever updating the centers.
    DistMatrix[:, 0] = -1
    centerChanged = True
    while centerChanged:
        centerChanged = False
        # Assignment step: attach each sample to its nearest center.
        for i in range(n):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                dist = euclideanDist(dataSet[i, :], Centers[j, :])
                if dist < minDist:
                    minDist = dist
                    minIndex = j
            if DistMatrix[i, 0] != minIndex:
                centerChanged = True
            DistMatrix[i, 0] = minIndex  # cluster index
            DistMatrix[i, 1] = minDist   # distance to that center
        # Update step: recompute each center as the mean of its members.
        if centerChanged:
            for j in range(k):
                members = dataSet[DistMatrix[:, 0] == j]
                # Skip empty clusters: np.mean over zero rows yields NaN,
                # which would poison every later distance computation.
                if len(members) > 0:
                    Centers[j] = np.mean(members, axis=0)
    return Centers, DistMatrix
def PointSelection(DistMatrix, k, n):
    """For each of the k clusters, find the sample nearest its center.

    DistMatrix is the (n, 2) array produced by KMeans: column 0 holds the
    cluster index of each sample, column 1 its distance to that cluster's
    center. Returns a list of k sample indices (one per cluster, in
    cluster order); a cluster with no members yields -1.
    """
    representatives = []
    for cluster in range(k):
        best_dist = np.inf
        best_index = -1
        for sample in range(n):
            in_cluster = DistMatrix[sample, 0] == cluster
            if in_cluster and DistMatrix[sample, 1] < best_dist:
                best_dist = DistMatrix[sample, 1]
                best_index = sample
        representatives.append(best_index)
    return representatives
if __name__ == "__main__":
path = r"D:\dataset\clusterData\bolbs_1.csv"
Data = np.array(pd.read_csv(path, header=None))
X = Data[:, :2]
n = len(X)
k = 2
Center, DistMat = KMeans(X, k)
Points = PointSelection(DistMat,k,n)
plt.scatter(X[:,0],X[:,1], c=DistMat[:,0] )
CP = X[Points]
plt.scatter(CP[:,0],CP[:,1],marker="*",s=200)
plt.show()