Python K-means 聚类算法实现

（原文此处为 K-means 算法的流程图，图片在文本导出时丢失。）

import math
import random
import sys

import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics


# 初始化画板
def initPlot():
    """Configure the plotting theme: seaborn darkgrid with a CJK-capable font."""
    sns.set(
        style="darkgrid",
        palette="muted",
        color_codes=True,
        font='SimHei',  # SimHei so Chinese axis labels render correctly
    )
    # mpl.rcParams['font.family'] = 'SimHei'
    # With a CJK font active, the default Unicode minus sign renders as a box;
    # fall back to an ASCII hyphen on the axes.
    plt.rcParams['axes.unicode_minus'] = False


# 初始化聚类数据
def initData():
    """Load the clustering data set from ./data/sort.xlsx (sheet 'Sheet1').

    Returns:
        dict[int, list]: maps a dense 0-based row number to that row's
        cell values as a list (one feature vector per row).
    """
    df = pd.read_excel(r'./data/sort.xlsx', 'Sheet1')
    # enumerate() guarantees dense 0..n-1 keys regardless of the
    # DataFrame's own index; list(row) copies the row's values.
    return {i: list(row) for i, (_, row) in enumerate(df.iterrows())}


# 选择新的聚类中心
def selectCenter(data=None, k=2, cluster=None, center=None):
    if data is None or cluster is None:
        return
    if len(center) <= 0:
        i = 0
        while i < k:
            rand = random.randint(0, len(data) - 1)
            if rand not in center:
                center.append(data[rand])
                subCluster = [data[rand]]
                cluster.append(subCluster)
                i += 1
    else:
        cluster.clear()
        for c in center:
            src = []
            src.append(c)
            cluster.append(src)


# 计算各个数据对象至各个数据集的聚类中心的最短距离
def calDistance(data=None, cluster=None, center=None):
    if data is None or cluster is None or center is None:
        return
    for obj in data:
        if obj not in center:
            src = data[obj]
            index = 0
            c = -1  # 聚类中心下标
            minED = sys.maxsize
            for cc in center:
                ed = calEuclideanDistance(data1=src, data2=cc)
                if ed < minED:
                    minED = ed
                    c = index
                index += 1
            cluster[c].append(src)


# 计算欧氏距离
def calEuclideanDistance(data1=None, data2=None):
    if data1 is None or data2 is None or len(data1) != len(data2):
        return -1
    res = 0
    for i in range(len(data1)):
        res += (data1[i] - data2[i]) ** 2
    return res ** .5


# 计算每个簇的数据对象的特征均值
def calClusterAvgDistance(cluster=None, k=2, center=None, dim=2):
    if cluster is None or center is None:
        return
    else:
        center.clear()
    for c in cluster:
        newCenter = [0] * dim
        for obj in c:
            index = 0
            for attr in obj:
                newCenter[index] += attr
                index += 1
        index = 0
        lenC = len(c)
        for newAttr in newCenter:
            newCenter[index] = newAttr / lenC
            index += 1
        center.append(newCenter)


def calSSE(center=None, cluster=None):
    """Return the per-cluster sum of squared errors (SSE).

    For each cluster i, sums the squared Euclidean distance between every
    member and center[i].

    Bug fixed: the original nested `for attrc in center[index]: for attr
    in data:` summed (attr - attrc)**2 over the full cross product of
    attributes, pairing every feature with every center coordinate; the
    SSE must pair feature j with center coordinate j, which `zip` does.

    Args:
        center: list of center vectors, parallel to `cluster`.
        cluster: list of clusters (lists of feature vectors).

    Returns:
        list of per-cluster SSE values, or None if an argument is missing.
    """
    if center is None or cluster is None:
        return
    SSE = []
    for index, members in enumerate(cluster):
        res = 0
        for obj in members:
            for attr, attrc in zip(obj, center[index]):
                res += (attr - attrc) ** 2
        SSE.append(res)
    return SSE


# ---- Driver: run k-means for a fixed number of iterations ----
k = 2  # number of clusters
data = initData()  # {row index: feature vector}, loaded from Excel
center = []   # current cluster centers, parallel to `cluster`
cluster = []  # cluster[i] = points assigned to center[i]
iterators = 15  # NOTE(review): fixed iteration count, no convergence check —
                # the loop always runs all 15 passes even after assignments stop changing
for i in range(iterators):
    selectCenter(data=data, k=k, cluster=cluster, center=center)   # seed / reset clusters
    calDistance(data=data, cluster=cluster, center=center)         # assignment step
    calClusterAvgDistance(cluster=cluster, k=k, center=center, dim=len(data[0]))  # update step
    
# print()
# for i in cluster:
#     print(len(i))
#     print(cluster)

initPlot()
SSE = calSSE(center=center, cluster=cluster)  # per-cluster sum of squared errors
print(SSE)

# Scatter-plot each cluster in its own color, using the first two features as x/y.
for c in cluster:
    X = []
    Y = []
    for obj in c:
        X.append(obj[0])  # feature 0 -> x axis
        Y.append(obj[1])  # feature 1 -> y axis
    plt.scatter(X, Y, s=75, alpha=.5)

# plt.scatter(X1, Y1, s=75, alpha=.5)
# plt.scatter(X2, Y2, s=10, alpha=.5)
# plt.xlabel('二氧化硅(SiO2)')
# plt.ylabel('氧化铝(Al2O3)')

plt.show()

# Calinski-Harabasz (CH) index check: higher score = denser, better-separated clusters.
index = 0
y = []  # flat label vector: y[j] = cluster id of the j-th point
for i in cluster:
    for j in i:
        y.append(index)
    index += 1
src = []  # flat list of points, in the same order as y
for i in cluster:
    for j in i:
        src.append(j)
print(metrics.calinski_harabasz_score(src, y))

可以发现，当循环迭代到一定次数后，无论再如何增加迭代次数，聚类结果都不会再发生改变，此时聚类过程才算结束。

以下是 k=2 时的结果图，CH 指标为 123.60521036512122。

以下是 k=3 时的结果图，CH 指标为 97.93651104016449。

（结果图在文本导出时丢失。）

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值