#DeniuHe手工代码之KMeans
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist,squareform
def randomCenters(dataSet,k):
    """Pick ``k`` distinct rows of ``dataSet`` to use as initial cluster centers.

    Parameters
    ----------
    dataSet : numpy.ndarray
        Two-dimensional array with one sample per row.
    k : int
        Number of centers to draw.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(k, n_col)`` holding copies of the chosen rows.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``k`` is larger than the number
        of rows, because the rows are drawn without replacement.
    """
    n_row, n_col = dataSet.shape
    # Draw without replacement: a row picked twice would yield two identical
    # centers, and the second of them could never attract any sample.
    selected_index = np.random.choice(n_row, size=k, replace=False)
    centers = np.zeros((k, n_col))
    centers[:] = dataSet[selected_index]
    return centers
def euclideanDist(A,B):
    """Return the Euclidean distance between the points ``A`` and ``B``.

    Parameters
    ----------
    A, B : array-like
        Coordinates of the two points; NumPy arrays, lists or tuples of
        equal length.

    Returns
    -------
    numpy.float64
        Square root of the summed squared differences. The sum runs over
        the first axis, so one-dimensional inputs give a scalar.
    """
    # Convert to float before subtracting so plain sequences are accepted and
    # unsigned-integer arrays cannot wrap around on a negative difference.
    diff = np.asarray(A, dtype=float) - np.asarray(B, dtype=float)
    return np.sqrt(np.sum(diff ** 2, axis=0))
def K_Means(dataSet,k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Parameters
    ----------
    dataSet : numpy.ndarray
        Two-dimensional array with one sample per row.
    k : int
        Number of clusters.

    Returns
    -------
    centers : numpy.ndarray
        Array with one cluster center per row, as returned by
        ``randomCenters`` and then refined.
    distMatrix : numpy.ndarray
        Array of shape ``(n, 2)``. Column 0 holds the index of the center
        each sample is assigned to, column 1 the distance to that center.
    """
    n = dataSet.shape[0]
    centers = randomCenters(dataSet, k)
    distMatrix = np.zeros((n, 2))
    # Start from -1 ("not assigned yet"). 0 is a valid cluster index, so
    # zero-initialised labels would hide changes during the first pass.
    distMatrix[:, 0] = -1
    dists = np.empty((n, k))
    centerChanged = True
    while centerChanged:
        # Distance from every sample to every center, one column per center.
        for j in range(k):
            dists[:, j] = np.linalg.norm(dataSet - centers[j], axis=1)
        nearest = np.argmin(dists, axis=1)  # ties go to the lowest index
        # Keep iterating only while at least one sample switched cluster.
        centerChanged = bool(np.any(distMatrix[:, 0] != nearest))
        distMatrix[:, 0] = nearest
        distMatrix[:, 1] = dists[np.arange(n), nearest]
        if centerChanged:
            # Move every center to the mean of the samples assigned to it.
            for j in range(k):
                cluster = dataSet[nearest == j]
                # The mean of an empty cluster is NaN and would poison all
                # later distances, so an empty cluster keeps its old center.
                if len(cluster) > 0:
                    centers[j] = cluster.mean(axis=0)
    return centers, distMatrix
if __name__ == "__main__":
    # Demo: cluster the iris samples on their two petal measurements and
    # draw each cluster with its own colour and marker.
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    DataSet = X[:,2:4]  # columns 2 and 3: petal length, petal width
    RandomCents,DistMat = K_Means(DataSet,3)
    labels = DistMat[:, 0]
    styles = (('red', 'o'), ('green', '*'), ('blue', '+'))
    # One scatter call per cluster: cheaper than one call per sample, and
    # the label gives plt.legend an entry to display.
    for cluster_id, (color, marker) in enumerate(styles):
        members = DataSet[labels == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], c=color, marker=marker,
                    label='cluster %d' % cluster_id)
    # Column 0 (petal length) is on the x axis, column 1 (petal width) on y.
    plt.xlabel('petal length')
    plt.ylabel('petal width')
    plt.legend(loc=2)
    plt.show()
# Question: does the version above raise an exception?
# Version 2: the exception-free version
import numpy as np
import pandas as pd
from sklearn import datasets as DS
import matplotlib.pyplot as plt
def euclideanDist(A,B):
    """Compute the Euclidean distance between two points.

    Parameters
    ----------
    A, B : array-like
        The two points, given as NumPy arrays or plain sequences of the
        same length.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((A - B) ** 2))`` with the sum taken over the first axis;
        for one-dimensional points this is a single number.
    """
    # Work in float: lists and tuples become arrays, and unsigned-integer
    # inputs no longer wrap around when the difference is negative.
    delta = np.asarray(A, dtype=float) - np.asarray(B, dtype=float)
    return np.sqrt(np.sum(delta ** 2, axis=0))
def RandomCenters(dataSet,k):
    """Draw ``k`` distinct samples from ``dataSet`` as starting centers.

    Parameters
    ----------
    dataSet : numpy.ndarray
        Array with one sample per row.
    k : int
        How many rows to draw.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows, in the order they were drawn.
    """
    sample_count = dataSet.shape[0]
    # replace=False guarantees that no row index is drawn twice.
    chosen_rows = np.random.choice(np.arange(sample_count), size=k, replace=False)
    return dataSet[chosen_rows]
def KMeans(dataSet,k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Parameters
    ----------
    dataSet : numpy.ndarray
        Two-dimensional array with one sample per row.
    k : int
        Number of clusters.

    Returns
    -------
    Centers : numpy.ndarray
        Float array with one cluster center per row.
    DistMatrix : numpy.ndarray
        Array of shape ``(n, 2)``. Column 0 holds the index of the center
        each sample is assigned to, column 1 the distance to that center.
    """
    # Copy the initial centers as float so that writing a mean into them is
    # not truncated when the data has an integer dtype.
    Centers = np.array(RandomCenters(dataSet, k), dtype=float)
    n = dataSet.shape[0]
    DistMatrix = np.zeros((n, 2))
    # Start from -1 ("not assigned yet"). 0 is a valid cluster index, so
    # zero-initialised labels would hide changes during the first pass.
    DistMatrix[:, 0] = -1
    dists = np.empty((n, k))
    centerChanged = True
    while centerChanged:
        # Distance from every sample to every center, one column per center.
        for j in range(k):
            dists[:, j] = np.linalg.norm(dataSet - Centers[j], axis=1)
        nearest = np.argmin(dists, axis=1)  # ties go to the lowest index
        centerChanged = bool(np.any(DistMatrix[:, 0] != nearest))
        DistMatrix[:, 0] = nearest
        DistMatrix[:, 1] = dists[np.arange(n), nearest]
        if centerChanged:
            # Assignments changed, so the cluster centers must be refreshed.
            for j in range(k):
                members = dataSet[nearest == j]  # samples of cluster j
                # The mean of an empty cluster is NaN and would poison all
                # later distances, so an empty cluster keeps its old center.
                if len(members) > 0:
                    Centers[j] = members.mean(axis=0)
    return Centers, DistMatrix
if __name__ == "__main__":
    # Demo: load a CSV of points, cluster its first two columns and plot
    # the samples coloured by cluster index.
    Data = np.array(pd.read_csv(r'E:\data set\clusterData\bolbs_1.csv',header=None))
    X = Data[:,:2]
    k = 24
    Center,DistMat = KMeans(X,k)
    print(set(DistMat[:,0]))
    # A single scatter call draws every sample; looping over the samples
    # would redraw the full data set once per sample.
    plt.scatter(X[:,0],X[:,1],c=DistMat[:,0])
    plt.show()