# kMean.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8
from numpy import *
# Load the data set.
def loadDataSet(fileName):
    """Load a tab-separated text file of numbers into a list of rows.

    Args:
        fileName: Path of the file to read. Every non-blank line holds the
            tab-separated numeric fields of one sample.

    Returns:
        A list containing one list of floats per non-blank line, in file
        order.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed by ``float``.
    """
    dataMat = []
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            # Build a real list: on Python 3 ``map`` returns a lazy iterator,
            # which cannot be used as a numeric row later on.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# Euclidean distance between two vectors.
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector; a numpy matrix row, an ndarray, or any
            array-like such as a list of numbers.
        vecB: Second vector, broadcast-compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a numpy scalar.
    """
    # asarray makes plain lists/tuples subtractable element-wise and is a
    # no-copy view for inputs that are already numpy arrays/matrices.
    diff = asarray(vecA) - asarray(vecB)
    # Use the array's own sum so the result does not depend on whether the
    # name ``sum`` currently refers to the builtin or to numpy's version.
    return sqrt(power(diff, 2).sum())
# Build a set of k random centroids for the given data set.
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D collection of samples (one row per sample, one column
            per feature); a numpy matrix or anything ``asmatrix`` accepts.
        k: Number of centroids to generate.

    Returns:
        A ``k`` x ``n`` numpy matrix (``n`` = number of columns of
        ``dataSet``). Column ``j`` of every centroid lies between the
        minimum and maximum of column ``j`` of the data.

    Raises:
        ValueError: If ``dataSet`` has no rows (min/max of an empty column).
    """
    # ``asmatrix`` replaces the ``mat`` alias, which newer numpy releases
    # no longer provide; it does not copy an input that is already a matrix.
    data = asmatrix(dataSet)
    n = data.shape[1]
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = data[:, j]
        # The .min()/.max() methods behave the same whether the bare names
        # ``min``/``max`` resolve to the builtins or to numpy's functions.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # random.rand(k, 1) draws k uniform samples from [0, 1).
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
# k-means clustering.
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D numpy matrix/array, one sample per row.
        k: Number of clusters.
        distMeas: Callable ``distMeas(centroidRow, sampleRow)`` returning the
            distance between a centroid and a sample.
        createCent: Callable ``createCent(dataSet, k)`` returning the initial
            ``k`` centroids as a matrix with one centroid per row.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is the matrix
        returned by ``createCent`` after being updated in place.
        ``clusterAssment`` is an ``m`` x 2 matrix (``m`` = number of rows of
        ``dataSet``): column 0 is the index of the cluster each sample was
        assigned to, column 1 the squared distance to that cluster's
        centroid.
    """
    m = shape(dataSet)[0]  # total number of samples
    # ``asmatrix`` replaces the ``mat`` alias, which newer numpy releases no
    # longer provide.
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible cluster index so the first pass always counts
    # as a change. With a zero-filled column, samples whose nearest centroid
    # is index 0 looked "unchanged", and the loop could stop with errors
    # measured against the initial random centroids.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Iterate: assign samples -> recompute centroids, until nothing moves.
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = inf
            minIndex = -1
            # Find the nearest centroid for sample i.
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Move every centroid to the mean of the samples assigned to it.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # The mean of zero rows is NaN; a NaN centroid can never win a
            # "distance < minDist" comparison again, so keep the previous
            # position when a cluster is empty.
            if shape(ptsInClust)[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# Bisecting k-means clustering.
def biKeans(dataSet, k, distMeas=distEclud):
    """Cluster ``dataSet`` by repeatedly splitting one cluster in two.

    Starts with a single cluster holding every sample and, until ``k``
    clusters exist, splits (with 2-means) the cluster whose split gives the
    lowest total sum of squared errors.

    NOTE(review): the function name looks like a typo of ``biKmeans``; it is
    kept unchanged so existing callers keep working.

    Args:
        dataSet: 2-D collection of samples, one per row; converted with
            ``asmatrix``.
        k: Desired number of clusters.
        distMeas: Callable ``distMeas(vecA, vecB)`` returning a distance.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is a matrix
        with one centroid per row. ``clusterAssment`` is an ``m`` x 2 matrix:
        column 0 is the cluster index of each sample, column 1 its squared
        error. Fewer than ``k`` centroids are returned when no remaining
        cluster can be split (every cluster has fewer than two samples).
    """
    # ``asmatrix`` replaces the ``mat`` alias, which newer numpy releases no
    # longer provide; it also lets plain nested lists be passed in.
    dataSet = asmatrix(dataSet)
    m = shape(dataSet)[0]
    # Column 0: cluster index, column 1: squared error.
    clusterAssment = asmatrix(zeros((m, 2)))
    # The initial single cluster is centred on the mean of all samples.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    cenList = [centroid0]  # all centroids found so far
    centroid0Mat = asmatrix(centroid0)  # hoisted: invariant across the loop
    for j in range(m):
        clusterAssment[j, 1] = distMeas(centroid0Mat, dataSet[j, :]) ** 2
    while len(cenList) < k:
        lowestSSE = inf
        # Reset each round so a round without any valid split cannot reuse
        # the previous round's result (or hit an unbound name).
        bestCentToSplit = None
        bestNewCents = None
        bestClusAss = None
        for i in range(len(cenList)):
            # Treat the samples of cluster i as a small data set of their own.
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            if shape(ptsInCurrCluster)[0] < 2:
                # Nothing to split: 2-means needs at least two samples.
                continue
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = splitClustAss[:, 1].sum()  # error of the split cluster
            # Error of all the samples outside cluster i.
            sseNotSplit = clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1].sum()
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClusAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No cluster could be split any further.
            break
        # Locate both halves BEFORE relabelling. Relabelling 0 first and then
        # searching for 1 would also catch the freshly relabelled samples
        # whenever bestCentToSplit == 1, merging both halves into one cluster.
        firstHalf = nonzero(bestClusAss[:, 0].A == 0)[0]
        secondHalf = nonzero(bestClusAss[:, 0].A == 1)[0]
        bestClusAss[firstHalf, 0] = bestCentToSplit  # keeps the split cluster's index
        bestClusAss[secondHalf, 0] = len(cenList)  # becomes the newly added cluster
        # Single-argument print() works on Python 2 and 3; the double space
        # reproduces what the original "print 'text: ', value" emitted.
        print('the bestCentToSplit is:  %s' % bestCentToSplit)
        print('the len of bestClustAss is:  %s' % len(bestClusAss))
        cenList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # replace split centroid
        cenList.append(bestNewCents[1, :].tolist()[0])  # add the new centroid
        # Write the new assignments back over the rows of the split cluster.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClusAss
    return asmatrix(cenList), clusterAssment
测试
>>> from kMean import *
>>> dataMat = mat(loadDataSet('testSet2.txt'))
>>> centList, myAssments = biKeans(dataMat, 3)
the bestCentToSplit is: 0
the len of bestClustAss is: 60
the bestCentToSplit is: 0
the len of bestClustAss is: 40
>>> centList
matrix([[-2.94737575, 3.3263781 ],
[-0.45965615, -2.7782156 ],
[ 2.93386365, 3.12782785]])
>>> myAssments
matrix([[ 2.00000000e+00, 1.45461050e-01],
[ 0.00000000e+00, 6.80213825e-01],
[ 1.00000000e+00, 1.02184582e+00],
[ 2.00000000e+00, 1.34548760e+00],
[ 0.00000000e+00, 1.35376464e+00],
[ 1.00000000e+00, 3.87167519e+00],
[ 2.00000000e+00, 8.37259951e-01],
[ 0.00000000e+00, 2.20116272e-01],
... ...