K-Means概念
- K均值聚类算法(k-means clustering algorithm)是一种迭代求解的聚类分析算法,其步骤是,预将数据分为K组,则随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,误差平方和局部最小。
概述
- 此篇代码为Python生成指定数量的随机点,同时内定几个初始随机点,遍历计算各点到这些随机点的距离(坐标系下两点间距离公式),最终将各点归类,此即K-Means算法。最后通过Python的PLT库进行可视化。
参数
- K-Means的分类数K(小于随机点数目即可);
- 需要的随机点数目(尽可能多,最好超过K的20倍);
- 可视化坐标轴刻度(一般取随机点数的一半,不固定)。
运行过程提示
运行结果可视化
- 图中三个区域中心点为三个不同颜色,与其相邻且同色的点归为一簇。
代码
import math
import time
import random
import matplotlib.pyplot as plt
class KMeans:
def __init__(self, k, pointsAcount, dataRange):
self.k = int(k)
self.pointsAcount = int(pointsAcount)
self.dataRange = int(dataRange)
self.dataset = []
self.startPooints = []
def generateDataset(self):
Xs = []
Ys = []
a = 0
while a < self.pointsAcount:
Xs.append(random.randint(0, self.dataRange))
Ys.append(random.randint(0, self.dataRange))
a += 1
self.dataset.append(Xs)
self.dataset.append(Ys)
return self.dataset
def distanceFormula(self, point1, point2):
distance = math.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
return distance
def getMin(self, compareList):
minNum = compareList[0]
minIndex = 0
for i in range(len(compareList)):
if compareList[i] < minNum:
minNum = compareList[i]
minIndex = i
return minNum, minIndex
def randomStart(self):
newDataset = self.dataset.copy()
newDataset.append([])
a = 0
print()
while a < self.k:
positon = random.randint(0, self.pointsAcount - 1)
newDataset[2].append([newDataset[0][positon], newDataset[1][positon]])
a += 1
print('起始中心点 {} 为: ( {:.5f} , {:.5f} )'.format(str(a), newDataset[0][positon], newDataset[1][positon]))
return newDataset
def generateClusters(self, newDataset):
a = 0
clusterList = []
while a < self.k:
clusterList.append([])
a += 1
distanceDict = {}
startPointIndex = 0
for startPoint in newDataset[2]:
distanceDict.setdefault(str(startPointIndex), [])
for p in range(self.pointsAcount):
if [newDataset[0][p], newDataset[1][p]] in newDataset[2]:
pass
else:
anotherPoint = [newDataset[0][p], newDataset[1][p]]
distance = self.distanceFormula(startPoint, anotherPoint)
distanceDict[str(startPointIndex)].append(distance)
startPointIndex += 1
for p in range(self.pointsAcount-self.k):
compareList = []
for k in range(self.k):
compareList.append(distanceDict[str(k)][p])
minDistance, minIndex = self.getMin(compareList)
clusterList[minIndex].append(p)
newDataset[2].clear()
a = 0
for cluster in clusterList:
Xs = 0
Ys = 0
for p in cluster:
Xs += newDataset[0][p]
Ys += newDataset[1][p]
newStartPoint = [Xs/len(cluster), Ys/len(cluster)]
newDataset[2].append(newStartPoint)
a += 1
print('新的中心点 {} 为: ( {:.5f} , {:.5f} )'.format(str(a), Xs/len(cluster), Ys/len(cluster)))
return newDataset[2], clusterList
def randomColor(self):
colorArr = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
color = ''
for i in range(6):
color += colorArr[random.randint(0, 14)]
return '#' + color
def drawDataset(self, startPointList, clusterList):
plt.title('K-Means Clustering Results')
plt.xlabel('X')
plt.ylabel('Y')
for startPoint in startPointList:
startPointColor = self.randomColor()
plt.plot(startPoint[0], startPoint[1], color=startPointColor, marker='o')
for cluster in clusterList:
pointColor = self.randomColor()
for p in cluster:
plt.plot(self.dataset[0][p], self.dataset[1][p], color=pointColor, marker='.')
plt.show()
def runKMeans(self):
startTime = time.time()
self.generateDataset()
newDataset = self.randomStart()
lastStartPointList = []
print('\n开始第 1 次迭代...')
newStartPointList, clusterList = self.generateClusters(newDataset)
iteration = 2
while lastStartPointList != newStartPointList:
print('\n开始第 ' + str(iteration) + ' 次迭代...')
newDataset = self.dataset.copy()
newDataset.append(newStartPointList)
lastStartPointList = newStartPointList.copy()
newStartPointList, clusterList = self.generateClusters(newDataset)
iteration += 1
print()
k = 0
while k < self.k:
print('迭代完毕,最终中心点 {} 为: ( {:.5f} , {:.5f} )'.format(str(k+1), newStartPointList[k][0], newStartPointList[k][1]))
k += 1
endTime = time.time()
print('\n共耗费时间: ' + str(endTime-startTime))
self.drawDataset(newStartPointList, clusterList)
if __name__ == '__main__':
KMeans = KMeans(
k=input('请输入K值:'),
pointsAcount=input('请输入随机点个数:'),
dataRange=input('请输入坐标轴最大刻度:')
)
KMeans.runKMeans()