''' 创建k个点作为初始的质心点(随机选择) 当任意一个点的簇分配结果发生改变时 对数据集中的每一个数据点 对每一个质心 计算质心与数据点的距离 将数据点分配到距离最近的簇 对每一个簇,计算簇中所有点的均值,并将均值作为质心 '''
def distEclud(vecA, vecB): #两个向量之间的距离 return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB) 欧几里得距离
def distSLC(vecA, vecB):#Spherical Law of Cosines 余弦距离(球面距离) a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180) b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * \ cos(pi * (vecB[0,0]-vecA[0,0]) /180) return arccos(a + b)*6371.0 #pi is imported with numpy def randCent(dataSet, k): #创建k各随机点作为初始中心点 n = shape(dataSet)[1] centroids = mat(zeros((k,n))) #create centroid mat for j in range(n):#create random cluster centers, within bounds of each dimension minJ = min(dataSet[:,j]) rangeJ = float(max(dataSet[:,j]) - minJ) centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1)) return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent): m = shape(dataSet)[0] clusterAssment = mat(zeros((m,2)))#create mat to assign data points
#to a centroid, also holds SE of each point centroids = createCent(dataSet, k) clusterChanged = True while clusterChanged: clusterChanged = False for i in range(m): #for each data point assign it to the closest centroid minDist = inf minIndex = -1 for j in range(k): distJI = distMeas(centroids[j,:],dataSet[i,:]) if distJI < minDist: minDist = distJI; minIndex = j if clusterAssment[i,0] != minIndex: clusterChanged = True clusterAssment[i,:] = minIndex,minDist**2 # print centroids for cent in range(k):#recalculate centroids ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]#get all the point in this cluster centroids[cent,:] = mean(ptsInClust, axis=0) #assign centroid to mean return centroids, clusterAssment