#coding=utf-8
from numpy import *
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Every non-blank line is stripped of surrounding whitespace, split on tab
    characters, and each field is converted with ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list containing one list of floats per non-blank line, in file
        order.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        OSError: If the file cannot be opened.
    """
    dataMat = []
    # The context manager closes the handle even when a conversion raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (for example trailing) line holds no sample and
                # would otherwise fail in float('').
                continue
            # Build a concrete list: on Python 3 a bare map() is a one-shot
            # iterator, which is not usable as a matrix row.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


# Compute the distance between two vectors (Euclidean distance is used).
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector; must support ``vecA - vecB`` (kMeans passes
            NumPy matrix rows).
        vecB: Second vector, shape-compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Reduce with the array's own .sum() so the result is a scalar whether
    # the bare name ``sum`` refers to the builtin or to NumPy's function.
    return sqrt(power(vecA - vecB, 2).sum())


# Randomly generate the initial centroids (Ng's course describes the
# initialisation as randomly picking K of the points).
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``dataSet``.

    For every column, ``k`` values are drawn with ``random.rand`` and scaled
    to lie between that column's minimum and maximum.

    Args:
        dataSet: 2-D NumPy array or matrix with one sample per row.
        k: Number of centroids to generate.

    Returns:
        A ``k`` x ``n`` NumPy matrix of centroids, where ``n`` is the number
        of columns of ``dataSet``.
    """
    n = shape(dataSet)[1]
    centroids = asmatrix(zeros((k, n)))
    # Work on a plain ndarray view so each column is 1-D and its min/max are
    # scalars, independent of whether min/max name the builtins or NumPy's.
    points = asarray(dataSet)
    for j in range(n):
        column = points[:, j]
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Points are repeatedly assigned to their nearest centroid and centroids
    are moved to the mean of their points, until no assignment changes. The
    current centroids are printed after every assignment pass.

    Args:
        dataSet: 2-D NumPy matrix (or array) with one sample per row.
        k: Number of clusters.
        distMeas: Callable ``(centroid_row, sample_row) -> distance``.
        createCent: Callable ``(dataSet, k) -> k x n matrix`` producing the
            initial centroids.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is whatever
        ``createCent`` returned, updated in place. ``clusterAssment`` is an
        ``m`` x 2 matrix whose first column is the index of the centroid
        assigned to each sample and whose second column is the squared
        distance to that centroid.
    """
    m = shape(dataSet)[0]
    # One row per data point: assigned centroid index and squared error.
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible index. With the zero initialisation a point
    # whose nearest centroid is 0 never counts as "changed", so the loop
    # could stop after a single pass without having converged.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assign each data point to its closest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Always refresh the row: the squared error changes whenever the
            # centroid moved, even if the assignment itself did not.
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Recalculate the centroids.
        for cent in range(k):
            # All the points currently assigned to this cluster.
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            if len(ptsInClust) > 0:
                # Move the centroid to the mean of its points.
                centroids[cent, :] = mean(ptsInClust, axis=0)
            # An empty cluster keeps its previous centroid; the mean of zero
            # rows is NaN and would make the centroid unreachable for good.
    return centroids, clusterAssment


def show(dataSet, k, centroids, clusterAssment):
    """Plot the samples and centroids using their first two columns.

    Each sample is drawn with a marker chosen by its assigned cluster index,
    and each centroid with a larger marker. Marker lists hold ten styles and
    are reused cyclically for higher indices.

    Args:
        dataSet: 2-D matrix/array of samples; columns 0 and 1 are plotted.
        k: Number of centroids to draw.
        centroids: Matrix of centroids; columns 0 and 1 are plotted.
        clusterAssment: Matrix whose first column holds each sample's
            cluster index.
    """
    # Imported lazily so the clustering functions work without matplotlib.
    from matplotlib import pyplot as plt

    numSamples = dataSet.shape[0]
    # NOTE(review): the last two entries of both marker lists were truncated
    # in the file at a '<' character; '<'/'p' markers are a reconstruction.
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex % len(mark)])
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i % len(mark)],
                 markersize=12)
    plt.show()


def main():
    """Cluster the samples of ``testSet.txt`` into four groups and plot them."""
    dataMat = asmatrix(loadDataSet('testSet.txt'))
    myCentroids, clustAssing = kMeans(dataMat, 4)
    print(myCentroids)
    show(dataMat, 4, myCentroids, clustAssing)


if __name__ == '__main__':
    main()