以上是K-means算法的大概流程图
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import random
import sys
from sklearn import metrics
# Initialise the plotting canvas / theme
def initPlot():
    """Configure seaborn and matplotlib styling before any plots are drawn."""
    sns.set(style="darkgrid", palette="muted",
            color_codes=True, font='SimHei')  # SimHei so CJK labels render
    # With a CJK font active, make minus signs on the axes display correctly
    plt.rcParams['axes.unicode_minus'] = False
# Load the clustering data set
def initData():
    """Read ./data/sort.xlsx (Sheet1) and return {row_index: [attr, ...]}.

    Keys are consecutive integers starting at 0; each value is the list
    of that row's attribute values.
    """
    df = pd.read_excel(r'./data/sort.xlsx', 'Sheet1')
    return {i: list(row) for i, (_, row) in enumerate(df.iterrows())}
# Choose the cluster centers for the next iteration
def selectCenter(data=None, k=2, cluster=None, center=None):
    """Initialise or re-seed the cluster centers.

    On the first call (``center`` empty) pick ``k`` *distinct* random data
    objects as centers and seed each cluster with its own center.  On later
    calls, reset ``cluster`` so each cluster holds only its current center;
    ``calDistance`` then re-assigns the remaining objects.

    Both ``cluster`` and ``center`` are mutated in place.  ``data`` maps
    consecutive integer indices (0..n-1) to attribute lists.
    """
    if data is None or cluster is None:
        return
    if len(center) <= 0:
        # BUG FIX: the original tested ``rand not in center`` where ``rand``
        # is an int index but ``center`` holds data rows, so the duplicate
        # check never fired and two clusters could share the same center.
        # Sample k distinct indices instead.
        for idx in random.sample(range(len(data)), k):
            center.append(data[idx])
            cluster.append([data[idx]])
    else:
        cluster.clear()
        for c in center:
            cluster.append([c])
# Assign every non-center data object to the cluster with the nearest center
def calDistance(data=None, cluster=None, center=None):
    """Append each data object to the cluster whose center is closest.

    ``data`` maps an index to an attribute list; ``center[i]`` is the
    center of ``cluster[i]``.  Clusters are mutated in place.  Objects
    that *are* a current center are skipped — they already seed their
    own cluster via ``selectCenter``.
    """
    if data is None or cluster is None or center is None:
        return
    for key in data:
        src = data[key]
        # BUG FIX: the original tested ``obj not in center`` — an int key
        # against a list of rows — which was always true, so each center
        # point was appended a second time to its own cluster.
        if src in center:
            continue
        best = 0
        minED = sys.maxsize  # current shortest distance found
        for idx, cc in enumerate(center):
            ed = calEuclideanDistance(data1=src, data2=cc)
            if ed < minED:
                minED = ed
                best = idx
        cluster[best].append(src)
# Euclidean distance between two equal-length attribute vectors
def calEuclideanDistance(data1=None, data2=None):
    """Return the Euclidean distance between *data1* and *data2*.

    Returns -1 when either input is missing or the lengths differ
    (callers treat -1 as an error sentinel).
    """
    if data1 is None or data2 is None or len(data1) != len(data2):
        return -1
    squared = sum((a - b) ** 2 for a, b in zip(data1, data2))
    return squared ** 0.5
# Recompute every cluster's center as the attribute-wise mean of its members
def calClusterAvgDistance(cluster=None, k=2, center=None, dim=2):
    """Rebuild ``center`` (in place) as the per-attribute mean of each cluster.

    ``dim`` is the number of attributes per data object; each new center
    is a list of ``dim`` means.  ``center`` is cleared first, so on return
    ``center[i]`` is the mean of ``cluster[i]``.
    """
    if cluster is None or center is None:
        return
    center.clear()
    for members in cluster:
        sums = [0] * dim
        for obj in members:
            for pos, attr in enumerate(obj):
                sums[pos] += attr
        count = len(members)
        center.append([total / count for total in sums])
# Sum of squared errors for each cluster
def calSSE(center=None, cluster=None):
    """Return a list with the SSE of each cluster.

    The SSE of a cluster is the sum, over its member objects, of the
    squared Euclidean distance to the cluster's center.  Returns None
    when either argument is missing.
    """
    if center is None or cluster is None:
        return
    SSE = []
    for idx, members in enumerate(cluster):
        res = 0
        for obj in members:
            # BUG FIX: the original nested the two attribute loops
            # (``for attrc in center[idx]: for attr in obj:``), summing
            # squared differences over the full cross product of attributes.
            # Pair attributes positionally instead.
            for attr, attrc in zip(obj, center[idx]):
                res += (attr - attrc) ** 2
        SSE.append(res)
    return SSE
# ---- Main driver: run K-means, plot the clusters, report SSE and CH index ----
k = 2
data = initData()
center = []
cluster = []
iterators = 15  # fixed number of K-means iterations

# Alternate the three K-means steps: (re)seed clusters from the centers,
# assign every object to its nearest center, recompute centers as means.
for i in range(iterators):
    selectCenter(data=data, k=k, cluster=cluster, center=center)
    calDistance(data=data, cluster=cluster, center=center)
    calClusterAvgDistance(cluster=cluster, k=k, center=center, dim=len(data[0]))

initPlot()
SSE = calSSE(center=center, cluster=cluster)
print(SSE)

# Scatter-plot each cluster using the first two attributes
for c in cluster:
    X = [obj[0] for obj in c]
    Y = [obj[1] for obj in c]
    plt.scatter(X, Y, s=75, alpha=.5)
plt.show()

# Calinski-Harabasz index: flatten the clusters into samples plus labels
y = []
src = []
for index, members in enumerate(cluster):
    for obj in members:
        y.append(index)
        src.append(obj)
print(metrics.calinski_harabasz_score(src, y))
可以发现在循环迭代到一定次数时,不管如何增加迭代次数聚类结果并不会再发生改变,此时聚类过程才算是结束。
以下是k=2时的结果图,CH指标为123.60521036512122
以下是k=3时的结果图,CH指标为97.93651104016449