import numpy as np
from numpy import *
from pandas import *
def randomCenters(dataSet, k):
    """Draw ``k`` random initial cluster centers inside the data's bounds.

    Every coordinate of every center is sampled uniformly between the
    minimum and the maximum of the matching column of ``dataSet``.

    Args:
        dataSet: 2-D numeric array-like (matrix, ndarray or nested list)
            with one sample per row and one feature per column.
        k: Number of centers to generate.

    Returns:
        A ``k x n`` ``numpy.matrix`` of centers, where ``n`` is the number
        of columns of ``dataSet``.
    """
    # asmatrix keeps the 2-D column slices (and their printed form) that the
    # matrix-based callers in this file expect, and also accepts plain arrays.
    data = np.asmatrix(dataSet)
    n = data.shape[1]
    print("shape %s" % (data.shape,))
    centers = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = data[:, j]
        print("dataSet[:,j] %s" % (column,))
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # Scale the [0, 1) samples by the column's range and shift them by its
        # minimum. Adding the range instead of multiplying by it would place
        # every center at or beyond the column maximum.
        centers[:, j] = minJ + rangeJ * np.random.rand(k, 1)
        print("centers[:,j]  %s" % (centers[:, j],))
    return centers
def calcDist(A, B):
    """Return the Euclidean distance between ``A`` and ``B``.

    ``A`` and ``B`` must be NumPy arrays/matrices whose shapes allow
    ``A - B``. ``power``, ``sum`` and ``sqrt`` are the NumPy functions
    brought in by the ``from numpy import *`` at the top of the file, so the
    squares are summed over every element of the difference.
    """
    difference = A - B
    squaredTotal = sum(power(difference, 2))
    return sqrt(squaredTotal)
def kMeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Starting from the centers returned by ``randomCenters``, the function
    alternates between assigning every sample to its nearest center
    (distance measured by ``calcDist``) and moving each center to the mean
    of the samples assigned to it. It stops after the first pass in which no
    sample changes cluster.

    Args:
        dataSet: 2-D numeric matrix/ndarray with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centers, distMat)``. ``centers`` is the ``k x m`` matrix
        of final centers. ``distMat`` is an ``n x 2`` matrix whose column 0
        holds the index of the center each sample is assigned to and whose
        column 1 holds the distance from the sample to that center.
    """
    data = np.asarray(dataSet, dtype=float)
    n, m = data.shape
    # Initialize the assignment/distance matrix.
    distMat = np.asmatrix(np.zeros((n, 2)))
    print("distMat %s" % (distMat,))
    randomCents = randomCenters(dataSet, k)
    # Labels start at -1 ("unassigned") so that a first-pass assignment to
    # cluster 0 is still counted as a change.
    labels = np.full(n, -1, dtype=int)
    centerChanged = True
    while centerChanged:
        centerChanged = False
        for i in range(n):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                # For each point, compute its distance to every center.
                distMean = calcDist(data[i, :], randomCents[j, :])
                if distMean < minDist:
                    minDist = distMean
                    minIndex = j
            # Keep iterating only while some point moves to another cluster;
            # testing for equality here would loop forever once converged.
            if labels[i] != minIndex:
                centerChanged = True
            # Record the center this point belongs to and the distance to
            # that center (not to the last center that was tried).
            labels[i] = minIndex
            distMat[i, 0] = minIndex
            distMat[i, 1] = minDist
        print(randomCents)
        # Label every sample: the cluster index becomes column ``m``.
        dataNew = np.column_stack((data, labels))
        print("dataNew %s" % (dataNew,))
        # Wrap in a DataFrame so rows can be selected by their label column.
        dataUse = DataFrame(dataNew)
        print("dataUse %s" % (dataUse,))
        for i in range(k):
            dataMean = dataUse[dataUse[m] == i]
            if dataMean.empty:
                # The mean of zero rows is NaN, which would make this center
                # unreachable for good; keep its previous position instead.
                continue
            randomCents[i] = dataMean.iloc[:, :m].mean(axis=0).values
    return randomCents, distMat
# Demo data: two 2-D points near (1, 2) and two near (10, 11).
c1 = [1, 2]
c2 = [10, 11]
c3 = [2, 3]
c4 = [9, 10]
# np.asmatrix replaces the ``mat`` alias, which newer NumPy releases no
# longer export.
dataTest = np.asmatrix([c1, c2, c3, c4])

# Run the demo only when the file is executed as a script, so importing the
# module does not trigger a clustering run.
if __name__ == "__main__":
    print("dataTest %s" % (dataTest,))
    kMeans(dataTest, 2)