一次相对完整的K-means聚类流程

最新推荐文章于 2024-09-15 08:01:47 发布

microlite

最新推荐文章于 2024-09-15 08:01:47 发布

阅读量612

点赞数 19

文章标签： kmeans 聚类算法

本文链接：https://blog.csdn.net/microlite/article/details/140498373

版权

数据结构(第一题数据)如下:

no	x1	x2	x3
1	125	20	44
2	121	18	43
3	120	17	42
4	124	20	45
5	122	18	43
6	120	19	44
7	121	17	41
8	122	19	43
9	122	17	42
10	121	19	45

首先是导入的一些准备工作:

# 科学计算,启动!
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d as Axes3D
# Let matplotlib render Chinese text and the minus sign correctly.
# NOTE(review): sns.set() below re-applies seaborn's style and may override
# the font list set here -- confirm that SimHei is still in effect afterwards.
plt.rcParams.update({
    "axes.unicode_minus": False,
    "font.sans-serif": ["SimHei"],
})

# Plot theme
import seaborn as sns
sns.set(style="ticks", font="Kaiti", font_scale=1.4)

# Load the data and order the rows by the three measurements.
# The original index labels are kept, so after sorting they are no longer 0..n-1.
df = pd.read_csv("第一题数据.csv").sort_values(["x1", "x2", "x3"])
df.head()

运行结果:

# Visualize the raw data first: a 3-D scatter of the three measurements.
# `figsize` is the keyword that controls the canvas size; the previous
# `figure=(15, 10)` keyword did not apply the intended size.
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection="3d")
x1 = df.loc[:, "x1"]
x2 = df.loc[:, "x2"]
x3 = df.loc[:, "x3"]
ax.set_xlabel("肩宽/髋宽*100", size=12)
ax.set_ylabel("胸厚/胸围*100", size=12)
ax.set_zlabel("腿长/身长*100", size=12)
# Fix the camera so the saved image is reproducible.
ax.view_init(elev=20, azim=25)
ax.scatter3D(x1, x2, x3, s=20)
plt.savefig("游泳运动员体测数据散点图")

# Import only what is used instead of star-importing the whole module.
from sklearn.cluster import KMeans

# Elbow method: fit K-means for k = 1 .. kmax-1 and record the inertia of each fit.
kmax = 10
K = np.arange(1, kmax)
iner = []  # within-cluster sum of squared errors, one entry per k in K
df1 = df.iloc[:, 1:]  # every column after the first one is used as a feature
print(df1.head())
for ii in K:
    kmean = KMeans(n_clusters=ii, random_state=1)
    kmean.fit(df1)
    iner.append(kmean.inertia_)

# Plot how the within-cluster sum of squares changes with k.
plt.figure(figsize=(10, 6))
plt.plot(K, iner, "r-o")
plt.xlabel("聚类数目")
plt.ylabel("类内误差平方和")
plt.title("K-means聚类")
# K starts at 1, so the inertia for k clusters is stored at iner[k - 1].
elbow_k = 3
elbow_inertia = iner[elbow_k - 1]
plt.annotate(
    "转折点",
    xy=(elbow_k, elbow_inertia),
    xytext=(elbow_k + 1, elbow_inertia + 10),
    arrowprops=dict(facecolor='blue', shrink=0.1),
)
plt.savefig("游泳运动员体测数据聚类肘图")

从肘图可以看出,聚类数目达到3之后,类内误差平方和的下降明显变缓,即肘点为3,因此将数据聚成3类。

# Fit the final K-means model with 3 clusters; labels_ holds one cluster
# label per row of df1, in row order.
kmean = KMeans(random_state=1, n_clusters=3).fit(df1)
k_pre = kmean.labels_
cluster_sizes = np.unique(k_pre, return_counts=True)
print("每簇包含的样本数量:", cluster_sizes)
print("每个簇的聚类中心为:\n", kmean.cluster_centers_)

第一聚类:聚类中心(124.5,20,44.5),两个点

第二聚类:聚类中心(121,17,41.7),三个点

第三聚类:聚类中心(121.2,18.6,43.6),五个点

# Collect the athlete numbers that belong to each cluster.
# k_pre is ordered by row *position*, while df was re-ordered by sort_values
# and kept its original index labels. Looking rows up with df.loc[i] would
# therefore pair each label with the wrong athlete, so select by position
# with boolean masks instead.
athlete_no = df["no"].to_numpy()
lis1 = athlete_no[k_pre == 0].tolist()
lis2 = athlete_no[k_pre == 1].tolist()
lis3 = athlete_no[k_pre == 2].tolist()
print("第一聚类的运动员编号: ", lis1)
print("第二聚类的运动员编号: ", lis2)
print("第三聚类的运动员编号: ", lis3)

# 3-D view of the clustering result: members of each cluster share a colour
# and marker, and every point is linked to its cluster centre.
colors = ["red", "blue", "green"]
shapes = ["o", "s", "*"]
fig = plt.figure(figsize=(15, 6))
ax1 = fig.add_subplot(121, projection="3d")

# Work on a plain array so rows are addressed by position; integer indexing
# such as point[0] on a Series labelled x1/x2/x3 is deprecated in pandas.
points = df1.to_numpy()

# Draw the data points, one scatter call per cluster.
unique_labels = np.unique(k_pre)
for label in unique_labels:
    members = points[k_pre == label]
    ax1.scatter(members[:, 0], members[:, 1], members[:, 2],
                s=40, c=colors[label], marker=shapes[label], alpha=0.5)

# Draw each cluster centre and connect its members to it.
for label in unique_labels:
    cx, cy, cz = kmean.cluster_centers_[label]
    ax1.scatter(cx, cy, cz, c="gray", marker="o", s=20)  # cluster centre
    ax1.text(cx, cy, cz, f"簇{label+1}")  # 1-based cluster number
    for px, py, pz in points[k_pre == label]:
        ax1.plot([px, cx], [py, cy], [pz, cz], color=colors[label], linestyle='--')

ax1.set_xlabel("特征1", rotation=20, size=13)
ax1.set_ylabel("特征2", rotation=-20, size=13)
ax1.set_zlabel("特征3", rotation=90, size=13)
ax1.azim = 225
ax1.set_title("K-means聚为3簇")
# Save before showing: once an interactive window has been shown and closed,
# a later savefig can write an empty image.
fig.savefig("K-means聚类_带连线.png")
plt.show()