python----Kmeans 算法的简单实现

最新推荐文章于 2024-08-11 16:00:02 发布

米奇妙妙小黑屋

最新推荐文章于 2024-08-11 16:00:02 发布

阅读量372

点赞数

分类专栏：算法编程思想文章标签： Kmeans算法 python 人工智能

本文链接：https://blog.csdn.net/weixin_43576171/article/details/99231716

版权

算法同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

编程思想

7 篇文章 0 订阅

订阅专栏

Kmeans算法简介

（1）Kmeans算法是一种无监督聚类算法。
（2）算法的目标：对给定样本集，根据各个样本点与选中的k个簇类中心点之间的距离，从中选最短距离进行分类，让簇内的点距离尽可能近，让簇间的点距离尽可能远。
（3）算法的主要公式与思路：
计算点之间的距离公式（欧氏距离）：$d(p, q) = \sqrt{\sum_{i}(p_i - q_i)^2}$。
通过计算样本集中各样本点与各簇类中心点的距离，将每个样本点分到距离最短的簇类中。然后根据分类结果对分类进行优化：对每一类求出平均中心点（平均中心点的坐标值为该类样本点相应坐标值的平均值），以平均中心点作为新的簇类中心点，继续进行分类，直至新的簇类中心点与上一次的簇类中心点之间的距离足够小，则停止分类的优化。

import random
from math import sqrt
class Kind:
    """Data holder for one cluster produced by the k-means routine.

    Attributes (exposed as read/write properties):
        centerPoint: the cluster's current centroid, ``None`` until assigned.
        kindList: list of the points currently assigned to this cluster.
    """

    def __init__(self):
        # Route the initial values through the property setters so the
        # private storage is written in exactly one place.
        self.centerPoint = None
        self.kindList = []

    @property
    def centerPoint(self):
        """Current centroid of the cluster."""
        return self.__centerPoint

    @centerPoint.setter
    def centerPoint(self, newCenter):
        self.__centerPoint = newCenter

    @property
    def kindList(self):
        """Points assigned to this cluster (the live list, not a copy)."""
        return self.__kindList

    @kindList.setter
    def kindList(self, members):
        self.__kindList = members

class choiceKind:
    """K-means helper for two-dimensional points.

    Assigns points to the nearest centroid and repeatedly moves each
    centroid to the mean of its members until the centroids stop moving.
    Cluster objects only need a ``centerPoint`` attribute (an ``(x, y)``
    pair) and a ``kindList`` attribute (a list of points).
    """

    def choiceKind(self, centerList, pointList):
        """Assign every point to the cluster whose centroid is nearest.

        Args:
            centerList: list of cluster objects; each one's ``kindList`` is
                replaced by a fresh list and then refilled.
            pointList: iterable of ``(x, y)`` points to classify.

        Raises:
            ValueError: if there is a point to assign but ``centerList`` is
                empty (raised by ``min`` on the empty list).
        """
        # Reset every cluster that was actually passed in. The previous
        # hard-coded range(5) failed with IndexError for fewer than five
        # clusters and left stale members behind for more than five.
        for kind in centerList:
            kind.kindList = []
        for point in pointList:
            # min() returns the first cluster with the smallest distance,
            # i.e. ties go to the cluster that appears first in centerList.
            nearest = min(
                centerList,
                key=lambda kind: self.Distance(point, kind.centerPoint),
            )
            nearest.kindList.append(point)

    def Kmeans(self, centerList, pointList, tolerance=0.000000001):
        """Run k-means in place until the centroids stop moving.

        Args:
            centerList: list of cluster objects holding the initial
                centroids; updated in place with the final centroids and
                memberships.
            pointList: sequence of ``(x, y)`` points (iterated once per
                round, so it must be re-iterable).
            tolerance: iteration stops once the summed distance moved by
                all centroids in one round drops below this value.
        """
        # Iterative rather than recursive so that a slowly converging run
        # cannot exceed Python's recursion limit.
        while True:
            self.choiceKind(centerList, pointList)
            centerDistance = []
            for kind in centerList:
                members = kind.kindList
                if members:
                    size = len(members)
                    newCenter = (
                        sum(p[0] for p in members) / size,
                        sum(p[1] for p in members) / size,
                    )
                else:
                    # An empty cluster keeps its previous centroid.
                    newCenter = kind.centerPoint
                centerDistance.append(
                    self.Distance(newCenter, kind.centerPoint)
                )
                kind.centerPoint = newCenter
            if sum(centerDistance) < tolerance:
                return

    def Distance(self, point, center):
        """Return the Euclidean distance between two ``(x, y)`` points."""
        return sqrt((point[0] - center[0])**2 + (point[1] - center[1])**2)


# Generate 100 distinct random points; each point is an (x, y) tuple.
pointList = []
seenPoints = set()  # O(1) duplicate check instead of rescanning the list
while len(pointList) < 100:
    point = (random.randint(0, 120), random.randint(0, 120))
    if point not in seenPoints:
        seenPoints.add(point)
        pointList.append(point)

# Pick 5 distinct starting centroids. pointList holds no duplicates, so
# random.sample always yields exactly 5 different points; the previous
# retry-free loop silently produced fewer clusters whenever the same point
# was drawn twice.
centerList = []
for centerPoint in random.sample(pointList, 5):
    A = Kind()
    A.centerPoint = centerPoint
    centerList.append(A)

# Show the initial centroids, then cluster in place.
for kind in centerList:
    print(kind.centerPoint)
B = choiceKind()
B.Kmeans(centerList, pointList)

# Report the final centroid and members of every cluster.
for i, kind in enumerate(centerList, 1):
    print('第{0}类质心为：{1},'.format(i, kind.centerPoint))
    print('第{0}类中的点为：{1}'.format(i, kind.kindList))

随机生成100个点，通过kmeans算法将点进行分类。
情景：代码中将随机生成的100个点分成5类，以一个Kind类对象代表一类。
开始：随机选取5个点为五个类的中心点，用5个中心点，与100个点分别计算距离，每个点分类至中心点距离它最短的类中。
分类优化：根据上一分类中每个类中的点的x与y取平均值，得到新的质心，即新的中心点。根据新的中心点重新与100个点分别计算距离，每个点分类至中心点距离它最短的类中。
结束：当新的中心点与上一个中心点的距离足够小（代码中为各类质心移动距离之和小于 0.000000001）时，停止分类优化，否则继续分类优化。
该实现算法不够灵活，是不成熟的简单实现，只能针对二维点的分类，若需适应更多情况，应进行优化。
为了适应任意维度对代码进行修改：

import random
class Kind:
    """Data holder for one cluster produced by the k-means routine.

    Attributes (exposed as read/write properties):
        centerPoint: the cluster's current centroid, ``None`` until assigned.
        kindList: list of the points currently assigned to this cluster.
    """

    def __init__(self):
        # Route the initial values through the property setters so the
        # private storage is written in exactly one place.
        self.centerPoint = None
        self.kindList = []

    @property
    def centerPoint(self):
        """Current centroid of the cluster."""
        return self.__centerPoint

    @centerPoint.setter
    def centerPoint(self, newCenter):
        self.__centerPoint = newCenter

    @property
    def kindList(self):
        """Points assigned to this cluster (the live list, not a copy)."""
        return self.__kindList

    @kindList.setter
    def kindList(self, members):
        self.__kindList = members

class choiceKind:
    """K-means helper for points of any dimension.

    Assigns points to the nearest centroid and repeatedly moves each
    centroid to the mean of its members until the centroids stop moving.
    Cluster objects only need a ``centerPoint`` attribute (a coordinate
    sequence) and a ``kindList`` attribute (a list of points).
    """

    def choiceKind(self, centerList, pointList, Num=None):
        """Assign every point to the cluster whose centroid is nearest.

        Args:
            centerList: list of cluster objects; each one's ``kindList`` is
                replaced by a fresh list and then refilled.
            pointList: iterable of points to classify.
            Num: accepted for backward compatibility and ignored. Every
                cluster in ``centerList`` is reset, so a count that
                disagrees with ``len(centerList)`` can no longer raise
                IndexError or leave stale members behind.

        Raises:
            ValueError: if there is a point to assign but ``centerList`` is
                empty (raised by ``min`` on the empty list).
        """
        for kind in centerList:
            kind.kindList = []
        for point in pointList:
            # min() returns the first cluster with the smallest distance,
            # i.e. ties go to the cluster that appears first in centerList.
            nearest = min(
                centerList,
                key=lambda kind: self.Distance(point, kind.centerPoint),
            )
            nearest.kindList.append(point)

    def Kmeans(self, centerList, pointList, Num=None, tolerance=0.000000001):
        """Run k-means in place until the centroids stop moving.

        Args:
            centerList: list of cluster objects holding the initial
                centroids; updated in place with the final centroids and
                memberships.
            pointList: sequence of points (iterated once per round, so it
                must be re-iterable).
            Num: accepted for backward compatibility and ignored (see
                ``choiceKind``).
            tolerance: iteration stops once the summed distance moved by
                all centroids in one round drops below this value.
        """
        # Iterative rather than recursive so that a slowly converging run
        # cannot exceed Python's recursion limit.
        while True:
            self.choiceKind(centerList, pointList, Num)
            centerDistance = []
            for kind in centerList:
                members = kind.kindList
                if members:
                    size = len(members)
                    # zip(*members) groups the coordinates axis by axis, so
                    # this also works for a single-member cluster (which
                    # previously failed with TypeError on tuple assignment).
                    newCenter = tuple(
                        sum(axis) / size for axis in zip(*members)
                    )
                else:
                    # An empty cluster keeps its previous centroid instead
                    # of failing with IndexError on kindList[0].
                    newCenter = kind.centerPoint
                centerDistance.append(
                    self.Distance(newCenter, kind.centerPoint)
                )
                kind.centerPoint = newCenter
            if sum(centerDistance) < tolerance:
                return

    def Distance(self, point, center):
        """Return the Euclidean distance between two points.

        Coordinates are paired with ``zip``, so only the dimensions present
        in both arguments contribute.
        """
        return sum((x1 - x2)**2 for x1, x2 in zip(point, center))**0.5

    def kindNum(self, num, points=None):
        """Create ``num`` clusters seeded with distinct random points.

        Args:
            num: number of clusters to create.
            points: sequence to draw the initial centroids from. Defaults
                to the module-level ``pointList``, which is what this
                method always used before.

        Returns:
            A list of ``num`` ``Kind`` objects whose ``centerPoint`` values
            are pairwise different.

        Raises:
            ValueError: if ``points`` holds fewer than ``num`` distinct
                points. The previous version silently returned fewer
                clusters in that case and also whenever a random draw
                happened to repeat.
        """
        if points is None:
            points = pointList
        candidates = list(points)
        chosen = []
        # Walk a random permutation and keep the first `num` distinct
        # points; comparing by equality keeps unhashable points working.
        for candidate in random.sample(candidates, len(candidates)):
            if len(chosen) == num:
                break
            if candidate not in chosen:
                chosen.append(candidate)
        if len(chosen) < num:
            raise ValueError(
                'need at least {0} distinct points, found {1}'.format(
                    num, len(chosen)
                )
            )
        centerList = []
        for centerPoint in chosen:
            A = Kind()
            A.centerPoint = centerPoint
            centerList.append(A)
        return centerList


# Generate 1000 distinct random points; each point is an (x, y) tuple.
pointList = []
seenPoints = set()  # O(1) duplicate check instead of rescanning the list
while len(pointList) < 1000:
    point = (random.randint(0, 100), random.randint(0, 100))
    if point not in seenPoints:
        seenPoints.add(point)
        pointList.append(point)

# Ask for 6 starting clusters and show their initial centroids.
B = choiceKind()
centerList = B.kindNum(6)
for kind in centerList:
    print(kind.centerPoint)

# Pass the number of clusters actually created rather than a second
# literal 6, so the count can never disagree with centerList.
B.Kmeans(centerList, pointList, len(centerList))

# Report the final centroid and members of every cluster.
for i, kind in enumerate(centerList, 1):
    print('第{0}类质心为：{1},'.format(i, kind.centerPoint))
    print('第{0}类中的点为：{1}'.format(i, kind.kindList))

为适应任意维度，只需要更新两点：
1、求两点距离的方法Distance()
2、质心的更新，即在Kmeans方法中的质心更换的for循环
为将数据分为若干类，将原代码中选取中心点的for循环封装成choiceKind类中的kindNum方法，加入Num参数

画图检验一次分类结果：

一次分类结果
画图代码，在原基础代码中加入以下代码：

from matplotlib import pyplot as plt
# Flatten the clustered points into parallel x / y / colour-index lists so
# that a single scatter call can draw every cluster in its own colour.
color = []
listx, listy = [], []
col = 0  # stays 0 when centerList is empty
for col, kind in enumerate(centerList, start=1):
    members = kind.kindList
    listx.extend(point[0] for point in members)
    listy.extend(point[1] for point in members)
    # One colour index per point, numbered from 1 in cluster order.
    color.extend([col] * len(members))

plt.scatter(listx, listy, c=color, s=30)
plt.show()