# coding=utf-8
"""K-means clustering of tab-separated numeric data using NumPy matrices."""
from numpy import *


def loadDataSet(fileName):
    """Read a tab-separated file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read. Each non-blank line holds
            tab-separated values that ``float`` can parse.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in column order.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even if a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no sample.
                continue
            # Build a real list: on Python 3 ``map`` is a lazy iterator.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors of equal shape."""
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Build ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D NumPy matrix/array with one sample per row.
        k: Number of centroids to generate.

    Returns:
        A ``k x n`` NumPy matrix (``n`` = number of columns of ``dataSet``);
        column ``j`` is drawn uniformly between the minimum and maximum of
        column ``j`` of ``dataSet``.
    """
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # Use the array methods so the result does not depend on whether
        # ``min``/``max`` are the builtins or the NumPy functions.
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D NumPy matrix/array with one sample per row.
        k: Number of clusters.
        distMeas: Callable ``(vecA, vecB) -> distance``.
        createCent: Callable ``(dataSet, k) -> k x n matrix`` of initial
            centroids.

    Returns:
        A tuple ``(centroids, clusterAssment)``: ``centroids`` is the
        ``k x n`` matrix of final centroids; ``clusterAssment`` is an
        ``m x 2`` matrix whose first column is each sample's cluster index
        and whose second column is its squared distance to that centroid.

    Side effects:
        Prints the current centroids once per iteration.
    """
    m = shape(dataSet)[0]
    # Column 0 stores the cluster index, column 1 the squared error.
    clusterAssment = mat(zeros((m, 2)))
    # Start from an impossible index so the first pass always registers the
    # assignment as a change, including for points that land in cluster 0.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)  # initial (by default random) centroids
    clusterChanged = True
    while clusterChanged:  # repeat while any assignment changes
        clusterChanged = False
        for i in range(m):  # find the closest centroid for every point
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        for cent in range(k):  # move each centroid to the mean of its members
            memberRows = nonzero(clusterAssment[:, 0].A == cent)[0]
            if memberRows.size == 0:
                # The mean of zero rows is NaN and would poison this centroid
                # for every later iteration; keep its previous position.
                continue
            centroids[cent, :] = mean(dataSet[memberRows], axis=0)
    return centroids, clusterAssment


if __name__ == "__main__":
    dataMat = mat(loadDataSet('testSet.txt'))
    myCentroids, clustAssing = kMeans(dataMat, 4)