-
算法思想:
以空间中K个点为中心进行聚类,对最靠近他们的对象归类。通过迭代的方法,逐次更新各聚类中心的值,直至得到最好的聚类效果。 -
算法描述:
(1)适当选择k个类的初始中心;
(2)在第t次迭代中,对任意一个样本,求其到k个中心的距离,将该样本归到距离最短的中心所在的类;
(3)利用均值等方法更新该类的中心值;
(4)对于所有的k个聚类中心,如果利用上述(2)(3)的迭代法更新后,值保持不变,则迭代结束,否则继续迭代。 -
算法流程:
输入:k,data[n]
(1)选择k个中心点,例如c[0] = data[0]…c[k-1] = data[k-1];
(2)对于data[0]…data[n-1],分别与c[0]…c[k-1]比较,假定与c[i]差值最少,就标记为i;
(3)对于所有标记为i的点,重新计算c[i] = {所有标记为i的data[j]之和}/标记为i的个数;
(4)重复(2)(3),直到所有c[i]值的变化小于给定的阈值。
下面是代码实现:
@Author:xiaozhi
@Date: 2019-08-12
import random
import numpy
import matplotlib.pyplot as plt
class KMeans():
    """Plain-Python k-means clustering.

    Points are sequences of numbers (e.g. ``[x, y]``). Starting from k
    distinct data points chosen at random, every point is assigned to its
    nearest center (Euclidean distance) and each center is moved to the
    mean of its assigned points, until the centers stop moving or the
    iteration budget is used up.
    """

    def __init__(self, k=1, verbose=False):
        """
        :param k: number of clusters to form
        :param verbose: when True, print the intermediate centers, distance
            tables and center shifts while fitting
        """
        self.__k = k
        self.__verbose = verbose
        self.__data = []  # training points handed to fit()
        self.__pointCenter = []  # current centers; seeded by randomCenter()
        self.__result = [[] for _ in range(k)]  # one list of points per cluster

    def _log(self, value):
        """Print ``value`` only when the instance was created verbose."""
        if self.__verbose:
            print(value)

    def _assignToNearest(self, center):
        """Group the training points by the index of their nearest center."""
        result = [[] for _ in range(self.__k)]
        distanceTable = self.calPointCenterDistance(center, self.__data)
        for point, distances in zip(self.__data, distanceTable):
            # On a tie the lowest center index wins.
            result[distances.index(min(distances))].append(point)
        return result

    def fit(self, data, threshold, times=50000):
        """
        Train the model on ``data``.

        :param data: training points, each a sequence of numbers
        :param threshold: stop once the mean center movement between two
            consecutive iterations is not greater than this value
        :param times: maximum number of refinement iterations
        :return: tuple ``(centers, clusters)`` where ``clusters[i]`` holds the
            points assigned to ``centers[i]``
        :raises ValueError: if k < 1 or ``data`` has fewer than k distinct points
        """
        self.__data = data
        # randomCenter() starts from a clean slate, so a repeated fit() does
        # not reuse the centers of the previous run.
        self.randomCenter()
        self._log(self.__pointCenter)

        oldCenterPoint = self.__pointCenter
        # Rebuilt from scratch: appending to the previous result would mix
        # points from earlier fits into the clusters.
        self.__result = self._assignToNearest(oldCenterPoint)
        newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)
        shift = self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint)

        # ``times`` bounds the loop so a tiny or negative threshold cannot
        # make it run forever.
        while shift > threshold and times > 0:
            times -= 1
            oldCenterPoint = newCenterPoint
            self.__result = self._assignToNearest(oldCenterPoint)
            newCenterPoint = self.calNewPointCenter(self.__result, oldCenterPoint)
            shift = self.calCenterToCenterDistance(oldCenterPoint, newCenterPoint)
            self._log(shift)

        self.__pointCenter = newCenterPoint
        return newCenterPoint, self.__result

    def calCenterToCenterDistance(self, old, new):
        """
        Mean Euclidean distance between matching old and new centers.

        :param old: centers of the previous iteration
        :param new: freshly computed centers
        :return: sum of pairwise distances divided by ``len(old)``
        """
        total = sum(self.distance(point1, point2) for point1, point2 in zip(old, new))
        return total / len(old)

    def calPointCenterDistance(self, center, data):
        """
        Distance from every point to every center.

        :param center: list of centers
        :param data: list of points
        :return: one row per point; row[j] is the distance to ``center[j]``
        """
        centerDistance = [
            [self.distance(temp, point) for point in center] for temp in data
        ]
        self._log(centerDistance)
        return centerDistance

    def calNewPointCenter(self, result, oldCenter=None):
        """
        Compute the mean point of every cluster.

        :param result: list of clusters, each a list of points
        :param oldCenter: optional previous centers; an empty cluster keeps
            its entry from this list instead of failing
        :return: list with one center per cluster
        :raises ValueError: if a cluster is empty and ``oldCenter`` is not given
        """
        newCenterPoint = []
        for index, cluster in enumerate(result):
            if not cluster:
                # A cluster can lose all its points during refinement; a mean
                # is undefined then, so fall back to where the center was.
                if oldCenter is None:
                    raise ValueError(
                        "cluster {} is empty and no previous center was given".format(index)
                    )
                newCenterPoint.append(list(oldCenter[index]))
                continue
            # zip(*cluster) transposes the points into per-dimension columns.
            newCenterPoint.append([sum(axis) / len(axis) for axis in zip(*cluster)])
        self._log(newCenterPoint)
        return newCenterPoint

    def distance(self, pointer1, pointer2):
        """
        Euclidean distance between two points of any (equal) dimension.

        :param pointer1: first point
        :param pointer2: second point
        :return: the distance as a float
        """
        return sum((x1 - x2) ** 2 for x1, x2 in zip(pointer1, pointer2)) ** 0.5

    def randomCenter(self):
        """
        Pick k distinct points of the training data as initial centers.

        Any previously stored centers are discarded first.

        :raises ValueError: if k < 1 or the data holds fewer than k distinct
            points (k distinct centers could never be found)
        """
        if self.__k < 1:
            raise ValueError("k must be at least 1, got {}".format(self.__k))
        # De-duplicate by value while keeping order; points are lists, so
        # they are compared through a tuple key.
        seen = set()
        distinct = []
        for point in self.__data:
            key = tuple(point)
            if key not in seen:
                seen.add(key)
                distinct.append(point)
        if len(distinct) < self.__k:
            raise ValueError(
                "need at least {} distinct points, got {}".format(self.__k, len(distinct))
            )
        self.__pointCenter = random.sample(distinct, self.__k)
if __name__ == "__main__":
    # Demo: cluster 99 random integer points in [1, 100] x [1, 100] into
    # five groups and show them in a scatter plot, one colour per cluster.
    data = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(1, 100)]

    kmeans = KMeans(k=5)
    centerPoint, result = kmeans.fit(data, 0.0001)
    print(centerPoint)

    tempx = []
    tempy = []
    color = []
    # Walking the points directly (instead of transposing via temp[0]) means
    # an empty cluster simply contributes nothing rather than raising.
    for label, cluster in enumerate(result):
        for point in cluster:
            tempx.append(point[0])
            tempy.append(point[1])
            # Numeric labels are mapped onto the colormap by scatter().
            color.append(label)

    plt.title("KMeans Classification")
    plt.scatter(tempx, tempy, c=color, s=30)
    plt.show()