K-均值算法可以实现收敛,但是存在一个问题是,K-均值算法会收敛到局部最优解而不是全局最优。
一种用于度量聚类效果的指标是SSE(Sum of Squared Error,误差平方和)。SSE值越小表示数据点越接近它们的质心,聚类效果也就越好。因为误差取了平方,因此更加重视那些远离中心的点。一种肯定可以降低SSE值的方法是增加簇的个数,但这违背了聚类的目标。聚类的目标是在簇数目不变的情况下提高簇的质量。
对K-均值进行改进的方案之一是对生成的簇进行后处理,一种方法是将具有最大SSE值的簇划分为两个簇。
二分K-均值算法
该算法首先将所有点作为一个簇,然后将该簇一分为二。之后选择其中一个簇进行划分,选择哪一个簇进行划分取决于其划分结果是否可以最大程度降低SSE的值。上述基于SSE的划分过程不断重复,直到得到用户指定的簇的数目为止。
二分K-均值算法的伪代码形式如下:
- 将所有点看成一个簇
- 当簇数目小于k时:
- 对每一个簇
- 计算总误差
- 在给定的簇上面进行K-均值划分(k=2)
- 计算将该簇一分为二之后的总误差
- 选择使得总误差最小的那个簇进行划分
代码:
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    :param fileName: path to a file whose lines are tab-separated numbers
    :return: (m, n) ``np.ndarray`` of floats, one row per input line
    """
    dataList = []
    with open(fileName) as fr:
        for line in fr.readlines():
            currentLine = line.strip().split('\t')
            # np.float was removed in NumPy 1.24; the builtin float is
            # what the old alias pointed at anyway.
            floatLine = [float(value) for value in currentLine]
            dataList.append(floatLine)
    return np.array(dataList)
def calcEcludDist(vecA, vecB):
    """Return the Euclidean distance between vectors *vecA* and *vecB*."""
    squaredDiff = (vecA - vecB) ** 2
    return np.sqrt(squaredDiff.sum())
def randCent(dataSet, k):
    """Generate k random centroids within the bounding box of the data.

    :param dataSet: (m, n) array of samples
    :param k: number of centroids to create
    :return: (k, n) array; column j is drawn uniformly from
             [min(dataSet[:, j]), max(dataSet[:, j])]
    """
    n = np.shape(dataSet)[1]  # n: number of features
    centroids = np.zeros((k, n))  # shape: (number of clusters, number of features)
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)
        # rand(k) already has the right shape for the column assignment;
        # the old rand(k, 1).T relied on implicit squeezing, and an unused
        # `mean` local has been dropped.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k)
    return centroids
def KMeans(dataSet, k, distMeans=calcEcludDist, createCent=randCent):
    """Standard k-means clustering.

    :param dataSet: (m, n) array of samples
    :param k: number of clusters
    :param distMeans: distance function taking two vectors
    :param createCent: function(dataSet, k) producing initial centroids
    :return: (centroids, clusterAssment); clusterAssment[i] holds
             (assigned cluster index, squared distance to that centroid)
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.zeros((m, 2))  # (cluster index, squared distance)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:  # iterate until no sample changes its cluster
        clusterChanged = False
        for i in range(m):
            # Assign sample i to its nearest centroid.
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Recompute each centroid as the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)[0]]
            # Guard against an empty cluster: np.mean of an empty array
            # yields NaN and would poison all later distance computations.
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def biKMeans(dataSet, k, distMeans=calcEcludDist):
    """Bisecting k-means: repeatedly split the cluster whose 2-way split
    gives the lowest total SSE, until k clusters exist.

    :param dataSet: (m, n) array of samples
    :param k: the number of expected classes
    :param distMeans: function that describes how to calculate distance
    :return: (centroidList, clusterAssment); clusterAssment[i] holds
             (cluster index, squared distance to that cluster's centroid)
    """
    m = np.shape(dataSet)[0]  # m: the number of samples
    clusterAssment = np.zeros((m, 2))  # (cluster index, squared distance to centroid)
    centroid0 = np.mean(dataSet, axis=0)  # start with the global mean as the single centroid
    centroidList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeans(centroid0, dataSet[j, :]) ** 2
    while len(centroidList) < k:  # keep splitting until we reach k clusters
        lowestSSE = np.inf
        # Per-iteration locals replace the original module-level `global` hack.
        bestCentToSplit = -1
        bestNewCents = None
        bestClustAss = None
        for i in range(len(centroidList)):
            # Candidate split: run 2-means on the members of cluster i.
            ptsInCurrentCluster = dataSet[np.nonzero(clusterAssment[:, 0] == i)[0], :]
            # Pass distMeans through (the original hard-coded calcEcludDist,
            # silently ignoring the caller's distance function).
            newCentroids, splitClustAss = KMeans(ptsInCurrentCluster, 2, distMeans)
            sseSplit = np.sum(splitClustAss[:, 1])  # SSE of the split cluster
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0] != i)[0], 1])  # SSE of the rest
            print("sse split and not split: ", sseSplit, sseNotSplit)
            if sseSplit + sseNotSplit < lowestSSE:  # best split so far
                bestCentToSplit = i
                bestNewCents = newCentroids
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the winning split: sub-cluster 1 becomes a brand-new cluster
        # index, sub-cluster 0 keeps the index of the cluster that was split.
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0], 0] = len(centroidList)
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0], 0] = bestCentToSplit
        print("the bestCentToSplit is: ", bestCentToSplit)
        print("the len of bestClustAss is: ", len(bestClustAss))
        print("new cluster assessment: \n", bestClustAss)
        centroidList[bestCentToSplit] = bestNewCents[0, :]
        centroidList.append(bestNewCents[1, :])
        clusterAssment[np.nonzero(clusterAssment[:, 0] == bestCentToSplit)[0], :] = bestClustAss
    return centroidList, clusterAssment
if __name__ == "__main__":
    dataList = loadDataSet('testSet2.txt')
    # The original bound the centroid list to a variable named `sseSplit`,
    # which was misleading — biKMeans returns (centroids, assignments).
    centroidList, clusterAssments = biKMeans(dataList, 3)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # One scatter per cluster, same blue/red/green order as before.
    for clusterIdx, color in enumerate(("blue", "red", "green")):
        members = np.nonzero(clusterAssments[:, 0] == clusterIdx)[0]
        ax.scatter(dataList[members, 0], dataList[members, 1], c=color)
    plt.show()
划分结果:
CONTACT INFORMATION
E-Mail: birdguan@seu.edu.cn
QQ: 46611253