1、从文件加载数据集
import numpy as np
inf = 0x3f3f3f3f  # finite stand-in for infinity (decimal 1061109567)
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: path of the text file; each non-blank line holds
            tab-separated values that ``float`` can parse.

    Returns:
        A list with one ``list[float]`` per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines would otherwise hit float('').
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
2、计算两个向量的欧氏距离
def disEclud(vecA,vecB):
    """Return the Euclidean distance between vecA and vecB.

    Both arguments must support element-wise subtraction (NumPy arrays or
    matrices of compatible shape).
    """
    delta = vecA - vecB
    squaredSum = np.sum(np.square(delta))
    return np.sqrt(squaredSum)
3、构建一个包含 K 个随机质心的集合
def randCent(dataSet,k):
    """Build k random centroids inside the bounding box of dataSet.

    Args:
        dataSet: 2-D NumPy matrix/array, indexed as ``dataSet[:, j]``.
        k: number of centroids to generate.

    Returns:
        A ``k x n`` ``np.matrix`` (n = number of columns of dataSet) whose
        j-th column is drawn uniformly from [min, max) of dataSet's j-th column.
    """
    n = np.shape(dataSet)[1]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # Vectorized min/max instead of the builtins iterating row by row.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # One (k, 1) draw per column, so the random stream is consumed
        # column by column.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
在完成了以上的编程后,我们可以在主程序中先测试一下这些函数
def testBasicFunc():
    """Print column min/max, a randCent sample and one disEclud distance.

    Reads 'KMeans-testSet.txt' through loadDataSet; output goes to stdout.
    """
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    print('min(datMat[:,0])=',min(datMat[:,0]))
    print('min(datMat[:,1])=',min(datMat[:,1]))
    print('max(datMat[:,1])=',max(datMat[:,1]))
    print('max(datMat[:,0])=',max(datMat[:,0]))
    print('randCent(datMat,2) = ',randCent(datMat,2))
    print('disEclud(datMat[0],datMat[1])=',disEclud(datMat[0],datMat[1]))
4、K-Means 聚类算法(注意:此算法很容易在编写过程中出现错误,请细心察看)
def kmeans(dataSet, k, distMeas=disEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with iterative k-means.

    Args:
        dataSet: 2-D NumPy matrix/array with one sample per row (indexed as
            ``dataSet[i, :]``).
        k: number of clusters.
        distMeas: callable ``(vecA, vecB) -> distance``.
        createCent: callable ``(dataSet, k)`` returning the initial
            ``k x n`` centroid matrix.

    Returns:
        ``(centroids, clusterAssment)``: the final centroid matrix and an
        ``m x 2`` matrix whose column 0 is the cluster index of each sample
        and column 1 the squared distance to that cluster's centroid.

    Side effects:
        Prints the iteration number and the current centroids each pass.
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always counts as a
    # change, even when every sample is nearest to centroid 0.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    num = 1
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf  # true infinity; a finite sentinel fails on huge distances
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print('第%d次运行结果为:' % num)
        num += 1
        print(centroids)
        # Update step: move each centroid to the mean of ITS OWN members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInClust = dataSet[members]
            # An empty cluster keeps its centroid; the mean of nothing is NaN.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def testKMeans():
    """Run kmeans with k=4 on 'KMeans-testSet.txt' and print the centroids."""
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    myCentroids, _ = kmeans(datMat, 4)
    print('centroids=',myCentroids)
# Script entry point: run the plain k-means demo.
if "__main__" == __name__:
    testKMeans()
运行上述代码,得到结果如图所示
(注意:这只是其中一次可能的随机结果,每个人实际得到的随机结果各不相同,每一次运行也都会产生新的结果,请至少截图其中两次结果)
5、K-Means 聚类算法的缺陷 在 kMeans 的函数测试中,可能偶尔会陷入局部最小值(局部最优的结果,但不是全局最优的结果)。出现这个问题有很多原因,可能是 k 值取得不合适,可能是距离函数不合适,可能是最初随机选取的质心靠得太近,也可能是数据本身分布的问题。 为了解决这个问题,我们可以对生成的簇进行后处理,一种方法是将具有最大 SSE 值的簇划分成两个簇。具体实现时可以将最大簇包含的点过滤出来并在这些点上运行 K-均值算法,令 k 设为 2。 为了保持簇总数不变,可以将某两个簇进行合并。从上图中很明显就可以看出,应该将上图下部两个出错的簇质心进行合并。那么问题来了,我们可以很容易对二维数据上的聚类进行可视化,但是如果遇到 40 维的数据应该如何去做? 有两种可以量化的办法:合并最近的质心,或者合并两个使得 SSE 增幅最小的质心。第一种思路通过计算所有质心之间的距离,然后合并距离最近的两个点来实现。第二种方法需要合并两个簇然后计算总 SSE 值。必须在所有可能的两个簇上重复上述处理过程,直到找到合并最佳的两个簇为止。 因为上述后处理过程实在是有些繁琐,所以有人提出了另一个称之为二分 K-均值(bisecting K-Means)的算法。
6、二分 K-Means 聚类算法 该算法首先将所有点作为一个簇,然后将该簇一分为二。之后选择其中一个簇继续进行划分,选择哪一个簇进行划分取决于对其划分是否可以最大程度降低 SSE(误差平方和)的值。
上述基于 SSE 的划分过程不断重复,直到得到用户指定的簇数目为止。
7、二分 K-Means 聚类算法代码(注意:此算法很容易在编写过程中出现错误,请细心察看)
def biKMeans(dataSet , k,distMeas = disEclud):
    """Cluster dataSet into k groups with bisecting k-means.

    Starts with one cluster holding every sample, then repeatedly splits (via
    2-means) the cluster whose split yields the lowest total SSE, until k
    centroids exist.

    Args:
        dataSet: ``np.matrix`` with one sample per row. A matrix is needed
            because the column mean is read back as ``.tolist()[0]``.
        k: target number of clusters.
        distMeas: callable ``(vecA, vecB) -> distance``.

    Returns:
        ``(centroids, clusterAssment)``: a matrix built from the centroid list
        and an ``m x 2`` matrix whose column 0 is the cluster index of each
        sample and column 1 the squared distance to that cluster's centroid.

    Side effects:
        Prints progress for every candidate split (and whatever kmeans prints).
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroid = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSS = np.inf  # true infinity, so the first candidate always wins
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            if ptsInCurrCluster.shape[0] == 0:
                # Nothing to split; running kmeans on no samples would fail.
                continue
            centroidMat, splitClustAss = kmeans(ptsInCurrCluster, 2, distMeas)
            # SSE of the two halves comes from the trial split itself, not
            # from the global assignment table.
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit and notSplit:",sseSplit,sseNotSplit)
            if sseSplit + sseNotSplit < lowestSS:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSS = sseSplit + sseNotSplit
        # Relabel the winning split: half 1 gets a brand-new index first, then
        # half 0 inherits the index of the cluster that was split. Doing it in
        # this order avoids clobbering when bestCentToSplit == 1.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the best is :",bestCentToSplit)
        print("the len of is :",len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
def testBiKMeans():
    """Run biKMeans with k=3 on 'KMeans-testSet.txt' and print the centroids."""
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    centList, _ = biKMeans(datMat, 3)
    print('centList = ' , centList)
# Script entry point: run the bisecting k-means demo.
if "__main__" == __name__:
    testBiKMeans()
上述函数可以运行多次,聚类通常会收敛到全局最小值,而原始的 kMeans() 函数偶尔会陷入局部最小值。
完整代码
import numpy as np
inf = 0x3f3f3f3f  # finite stand-in for infinity (decimal 1061109567)
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: path of the text file; each non-blank line holds
            tab-separated values that ``float`` can parse.

    Returns:
        A list with one ``list[float]`` per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines would otherwise hit float('').
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def disEclud(vecA,vecB):
    """Return the Euclidean distance between vecA and vecB.

    Both arguments must support element-wise subtraction (NumPy arrays or
    matrices of compatible shape).
    """
    delta = vecA - vecB
    squaredSum = np.sum(np.square(delta))
    return np.sqrt(squaredSum)
def randCent(dataSet,k):
    """Build k random centroids inside the bounding box of dataSet.

    Args:
        dataSet: 2-D NumPy matrix/array, indexed as ``dataSet[:, j]``.
        k: number of centroids to generate.

    Returns:
        A ``k x n`` ``np.matrix`` (n = number of columns of dataSet) whose
        j-th column is drawn uniformly from [min, max) of dataSet's j-th column.
    """
    n = np.shape(dataSet)[1]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # Vectorized min/max instead of the builtins iterating row by row.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # One (k, 1) draw per column, so the random stream is consumed
        # column by column.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def testBasicFunc():
    """Print column min/max, a randCent sample and one disEclud distance.

    Reads 'KMeans-testSet.txt' through loadDataSet; output goes to stdout.
    """
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    print('min(datMat[:,0])=',min(datMat[:,0]))
    print('min(datMat[:,1])=',min(datMat[:,1]))
    print('max(datMat[:,1])=',max(datMat[:,1]))
    print('max(datMat[:,0])=',max(datMat[:,0]))
    print('randCent(datMat,2) = ',randCent(datMat,2))
    print('disEclud(datMat[0],datMat[1])=',disEclud(datMat[0],datMat[1]))
def kmeans(dataSet, k, distMeas=disEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with iterative k-means.

    Args:
        dataSet: 2-D NumPy matrix/array with one sample per row (indexed as
            ``dataSet[i, :]``).
        k: number of clusters.
        distMeas: callable ``(vecA, vecB) -> distance``.
        createCent: callable ``(dataSet, k)`` returning the initial
            ``k x n`` centroid matrix.

    Returns:
        ``(centroids, clusterAssment)``: the final centroid matrix and an
        ``m x 2`` matrix whose column 0 is the cluster index of each sample
        and column 1 the squared distance to that cluster's centroid.

    Side effects:
        Prints the iteration number and the current centroids each pass.
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always counts as a
    # change, even when every sample is nearest to centroid 0.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    num = 1
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf  # true infinity; a finite sentinel fails on huge distances
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print('第%d次运行结果为:' % num)
        num += 1
        print(centroids)
        # Update step: move each centroid to the mean of ITS OWN members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            ptsInClust = dataSet[members]
            # An empty cluster keeps its centroid; the mean of nothing is NaN.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def testKMeans():
    """Run kmeans with k=4 on 'KMeans-testSet.txt' and print the centroids."""
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    myCentroids, _ = kmeans(datMat, 4)
    print('centroids=',myCentroids)
def biKMeans(dataSet , k,distMeas = disEclud):
    """Cluster dataSet into k groups with bisecting k-means.

    Starts with one cluster holding every sample, then repeatedly splits (via
    2-means) the cluster whose split yields the lowest total SSE, until k
    centroids exist.

    Args:
        dataSet: ``np.matrix`` with one sample per row. A matrix is needed
            because the column mean is read back as ``.tolist()[0]``.
        k: target number of clusters.
        distMeas: callable ``(vecA, vecB) -> distance``.

    Returns:
        ``(centroids, clusterAssment)``: a matrix built from the centroid list
        and an ``m x 2`` matrix whose column 0 is the cluster index of each
        sample and column 1 the squared distance to that cluster's centroid.

    Side effects:
        Prints progress for every candidate split (and whatever kmeans prints).
    """
    m = np.shape(dataSet)[0]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroid = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSS = np.inf  # true infinity, so the first candidate always wins
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            if ptsInCurrCluster.shape[0] == 0:
                # Nothing to split; running kmeans on no samples would fail.
                continue
            centroidMat, splitClustAss = kmeans(ptsInCurrCluster, 2, distMeas)
            # SSE of the two halves comes from the trial split itself, not
            # from the global assignment table.
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit and notSplit:",sseSplit,sseNotSplit)
            if sseSplit + sseNotSplit < lowestSS:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSS = sseSplit + sseNotSplit
        # Relabel the winning split: half 1 gets a brand-new index first, then
        # half 0 inherits the index of the cluster that was split. Doing it in
        # this order avoids clobbering when bestCentToSplit == 1.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the best is :",bestCentToSplit)
        print("the len of is :",len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.asmatrix(centList), clusterAssment
def testBiKMeans():
    """Run biKMeans with k=3 on 'KMeans-testSet.txt' and print the centroids."""
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    datMat = np.asmatrix(loadDataSet('KMeans-testSet.txt'))
    centList, _ = biKMeans(datMat, 3)
    print('centList = ' , centList)
# Script entry point: run the helper checks, then both clustering demos.
if "__main__" == __name__:
    testBasicFunc()
    testKMeans()
    testBiKMeans()