这篇文章介绍如何用 Python 实现 k-means 聚类算法。
还是继续上一篇的例子:
import numpy as np
def kmeans(X, k, maxIt):
    """Cluster the rows of X into k groups with the k-means iteration.

    Each pass assigns every sample to its nearest centroid (updateLabels)
    and then recomputes the centroids (getCentroids), until shouldStop
    reports that the loop should end.

    Args:
        X: 2-D array of shape (numPoints, numDim), one sample per row.
        k: Number of clusters; must satisfy 1 <= k <= numPoints.
        maxIt: Iteration limit that is handed to shouldStop.

    Returns:
        Array of shape (numPoints, numDim + 1): the samples of X with the
        assigned cluster label (1..k) in the last column. The label column
        stays 0 if shouldStop ends the loop before the first pass.

    Raises:
        ValueError: If k is outside the range 1..numPoints.
    """
    numPoints, numDim = X.shape
    if not 1 <= k <= numPoints:
        raise ValueError(
            'k must be between 1 and the number of points (%d), got %r'
            % (numPoints, k)
        )

    # Working copy of X with one extra trailing column for the label.
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X

    # Pick k distinct rows as the initial centroids. Sampling with
    # replacement could select the same row twice, which leaves one
    # cluster empty and turns its centroid into NaN.
    seedRows = np.random.choice(numPoints, size=k, replace=False)
    centroids = dataSet[seedRows]  # fancy indexing -> independent copy
    centroids[:, -1] = range(1, k + 1)  # labels are 1..k

    iterations = 0
    oldCentroids = None
    while not shouldStop(oldCentroids, centroids, iterations, maxIt):
        print('iteration:\n', iterations)
        print('dataset:\n', dataSet)
        print('centroids:\n', centroids)
        # Snapshot so the next shouldStop call can detect convergence.
        oldCentroids = np.copy(centroids)
        iterations += 1
        updateLabels(dataSet, centroids)
        centroids = getCentroids(dataSet, k)
    return dataSet
def shouldStop(oldCentroids, centroids, iterations, maxIt):
    """Decide whether the k-means loop should terminate.

    Args:
        oldCentroids: Centroids from the previous pass, or None before the
            first pass.
        centroids: Centroids produced by the most recent pass.
        iterations: Number of passes completed so far.
        maxIt: Maximum number of passes allowed.

    Returns:
        True when the iteration budget is used up or the centroids did not
        change between the last two passes; False otherwise.
    """
    # iterations counts completed passes, so reaching maxIt means the
    # budget is exhausted (a strict ">" would allow maxIt + 1 passes).
    if iterations >= maxIt:
        return True
    # np.array_equal(None, centroids) is False, so the first pass always runs.
    return np.array_equal(oldCentroids, centroids)
def updateLabels(dataSet, Centroids):
    """Relabel every sample with the label of its nearest centroid, in place.

    Args:
        dataSet: 2-D array whose last column holds the label and whose
            other columns hold the sample's coordinates.
        Centroids: Centroid array passed through to
            getLabelFromClosestCentroid.

    Returns:
        None. The last column of dataSet is overwritten row by row.
    """
    sampleCount, _ = dataSet.shape  # unpacking requires a 2-D array
    # Each `sample` is a view into dataSet, so writing to it updates dataSet.
    for sample in dataSet[:sampleCount]:
        sample[-1] = getLabelFromClosestCentroid(sample[:-1], Centroids)
def getLabelFromClosestCentroid(dataSetRow, centroids):
    """Return the label of the centroid nearest to one sample.

    Args:
        dataSetRow: Coordinates of a single sample (no label column).
        centroids: 2-D array, one centroid per row; the last column holds
            the centroid's label, the other columns its coordinates.

    Returns:
        The last-column value of the centroid with the smallest
        np.linalg.norm distance to dataSetRow. On ties the earliest row wins.

    Side effect:
        Prints the winning distance as 'minDist: <value>'.
    """
    closestIdx = 0
    closestDist = np.linalg.norm(dataSetRow - centroids[closestIdx, :-1])
    for idx, centroid in enumerate(centroids[1:], start=1):
        candidateDist = np.linalg.norm(dataSetRow - centroid[:-1])
        # Strict "<" keeps the earliest centroid when distances tie.
        if candidateDist < closestDist:
            closestIdx, closestDist = idx, candidateDist
    print('minDist:', closestDist)
    return centroids[closestIdx, -1]
def getCentroids(dataSet, k):
    """Recompute the k centroids from the current label assignment.

    Args:
        dataSet: 2-D array whose last column holds each sample's label
            (1..k) and whose other columns hold its coordinates.
        k: Number of clusters.

    Returns:
        Array of shape (k, dataSet.shape[1]). Row i-1 holds the mean
        coordinates of the samples labelled i, with i in the last column.
        A cluster with no samples is reseeded with the coordinates of a
        randomly chosen sample, so the result never contains NaN from an
        empty mean.
    """
    numPoints = dataSet.shape[0]
    labels = dataSet[:, -1]
    result = np.zeros((k, dataSet.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataSet[labels == i, :-1]
        if oneCluster.shape[0] > 0:
            result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        elif numPoints > 0:
            # np.mean over an empty selection is NaN, and a NaN centroid
            # can never attract a sample again; reseed from a random sample.
            result[i - 1, :-1] = dataSet[np.random.randint(numPoints), :-1]
        result[i - 1, -1] = i
    return result
# Four 2-D sample points used as a tiny demo data set.
x1, x2, x3, x4 = (np.array(point) for point in ([1, 1], [2, 1], [4, 3], [5, 4]))
testX = np.vstack([x1, x2, x3, x4])  # stack vertically: one sample per row
# Cluster the samples into 2 groups with an iteration limit of 10.
result = kmeans(testX, k=2, maxIt=10)
print('final result:', result)
也就是用代码实现了上一篇文章中介绍的算法。