数据结构(第一题数据)如下:
no | x1 | x2 | x3 |
1 | 125 | 20 | 44 |
2 | 121 | 18 | 43 |
3 | 120 | 17 | 42 |
4 | 124 | 20 | 45 |
5 | 122 | 18 | 43 |
6 | 120 | 19 | 44 |
7 | 121 | 17 | 41 |
8 | 122 | 19 | 43 |
9 | 122 | 17 | 42 |
10 | 121 | 19 | 45 |
首先是导入的一些准备工作:
# 科学计算,启动!
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d as Axes3D
# Font configuration intended to make Chinese text and the minus sign
# display correctly in figures.
plt.rcParams.update({
    "axes.unicode_minus": False,
    "font.sans-serif": ["SimHei"],
})
# Plot theme
import seaborn as sns
sns.set(font="Kaiti", style="ticks", font_scale=1.4)
# Load the data and order the rows by x1, then x2, then x3.
# NOTE: sort_values keeps the original index labels, so after this line
# positional order and index labels no longer coincide.
df = pd.read_csv("第一题数据.csv").sort_values(by=["x1", "x2", "x3"])
df.head()
运行结果:
# Visualise the raw data first as a 3-D scatter plot.
# The figure size must be passed as `figsize`; any other keyword does not
# set the canvas size.
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection="3d")
x1 = df.loc[:, "x1"]
x2 = df.loc[:, "x2"]
x3 = df.loc[:, "x3"]
ax.set_xlabel("肩宽/髋宽*100", size=12)
ax.set_ylabel("胸厚/胸围*100", size=12)
ax.set_zlabel("腿长/身长*100", size=12)
ax.view_init(elev=20, azim=25)
ax.scatter3D(x1, x2, x3, s=20)
# Save through the figure object so the file always contains this figure.
fig.savefig("游泳运动员体测数据散点图")
from sklearn.cluster import *
# Elbow method: fit K-means once per candidate cluster count and record the
# within-cluster sum of squared errors (inertia) of each fit.
kmax = 10
K = np.arange(1, kmax)
df1 = df.iloc[:, 1:]  # every column after the first one
print(df1.head())
iner = []  # within-cluster sum of squared errors, one entry per value in K
for n_clusters in K:
    kmean = KMeans(n_clusters=n_clusters, random_state=1).fit(df1)
    iner.append(kmean.inertia_)
# Plot how the within-cluster error changes with the number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(K, iner, "r-o")
plt.xlabel("聚类数目")
plt.ylabel("类内误差平方和")
plt.title("K-means聚类")
# Mark the point at 3 clusters (iner[2]) with an arrow.
plt.annotate(
    "转折点",
    xy=(3, iner[2]),
    xytext=(4, iner[2] + 10),
    arrowprops={"facecolor": "blue", "shrink": 0.1},
)
plt.savefig("游泳运动员体测数据聚类肘图")
由肘图可见,类内误差平方和在聚类数目为3处出现明显转折(肘点),因此将数据聚成3类。
# Cluster the data into 3 groups with K-means.
kmean = KMeans(n_clusters=3, random_state=1)
k_pre = kmean.fit(df1).labels_  # one cluster label per row of df1
# Report the (labels, counts) pair and the fitted cluster centres.
print("每簇包含的样本数量:", np.unique(k_pre, return_counts=True))
print("每个簇的聚类中心为:\n", kmean.cluster_centers_)
第一聚类:聚类中心(124.5,20,44.5),两个点
第二聚类:聚类中心(121,17,41.7),三个点
第三聚类:聚类中心(121.2,18.6,43.6),五个点
# Collect the athlete numbers ("no" column) that fall into each cluster.
#
# k_pre holds one label per row of the data in its current (sorted) row
# order, i.e. it is positional. df was reordered with sort_values, which
# keeps the original index labels, so a label-based lookup such as
# df.loc[i, "no"] would return the number of a different athlete. The
# lookup therefore has to be positional: take the "no" column as an array
# in row order and mask it with the labels.
athlete_no = df["no"].to_numpy()
lis1 = athlete_no[k_pre == 0].tolist()
lis2 = athlete_no[k_pre == 1].tolist()
lis3 = athlete_no[k_pre == 2].tolist()
print("第一聚类的运动员编号: ", lis1)
print("第二聚类的运动员编号: ", lis2)
print("第三聚类的运动员编号: ", lis3)
# 3-D plot of the clustering result: samples coloured/shaped by cluster,
# cluster centres in gray, and a dashed line from each sample to its centre.
colors = ["red", "blue", "green"]
shapes = ["o", "s", "*"]
fig = plt.figure(figsize=(15, 6))
ax1 = fig.add_subplot(121, projection="3d")
# Draw the samples, one at a time so each gets its cluster's colour/marker.
for ii, y in enumerate(k_pre):
    ax1.scatter(df1.iloc[ii, 0], df1.iloc[ii, 1], df1.iloc[ii, 2],
                s=40, c=colors[y], marker=shapes[y], alpha=0.5)
# Draw the cluster centres and connect every sample to its own centre.
unique_labels = np.unique(k_pre)
for label in unique_labels:
    center = kmean.cluster_centers_[label]
    x, y, z = center
    ax1.scatter(x, y, z, c="gray", marker="o", s=20)  # cluster centre
    ax1.text(x, y, z, f"簇{label+1}")  # cluster number (1-based)
    # Rows of df1 whose label equals the current cluster (boolean mask is
    # positional, matching the order of k_pre).
    cluster_points = df1[k_pre == label]
    for _, point in cluster_points.iterrows():
        # Use .iloc for positional access: the Series is labelled by column
        # names, so plain point[0] would rely on deprecated integer fallback.
        px, py, pz = point.iloc[0], point.iloc[1], point.iloc[2]
        ax1.plot([px, x], [py, y], [pz, z],
                 color=colors[label], linestyle='--')
ax1.set_xlabel("特征1", rotation=20, size=13)
ax1.set_ylabel("特征2", rotation=-20, size=13)
ax1.set_zlabel("特征3", rotation=90, size=13)
ax1.azim = 225
ax1.set_title("K-means聚为3簇")
# Save BEFORE showing: once plt.show() returns the figure may already be
# closed, and saving afterwards would write an empty image.
fig.savefig("K-means聚类_带连线.png")
plt.show()