import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Tune to your machine. If KMeans is handed fewer data "chunks" than there
# are available threads, some MKL/OpenMP builds can leak memory — capping
# the OpenMP thread count avoids that.
import os
os.environ['OMP_NUM_THREADS'] = '2'
数据准备
# Load the 2-column sample data set used throughout this notebook.
data = pd.read_csv('data.csv')
data
0 | 1 | |
---|---|---|
0 | 1.658985 | 4.285136 |
1 | -3.453687 | 3.424321 |
2 | 4.838138 | -1.151539 |
3 | -5.379713 | -3.362104 |
4 | 0.972564 | 2.924086 |
... | ... | ... |
75 | -2.793241 | -2.149706 |
76 | 2.884105 | 3.043438 |
77 | -2.967647 | 2.848696 |
78 | 4.479332 | -1.764772 |
79 | -4.905566 | -2.911070 |
80 rows × 2 columns
可视化
# Quick look at the raw points before any clustering.
plt.scatter(x=data.iloc[:, 0], y=data.iloc[:, 1])
plt.show()
1. 自定义kmeans
def kmeans_(df, k, max_iterations=100):
    """Cluster the rows of *df* into *k* groups with Lloyd's k-means.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric samples, one row per observation. NOTE: a ``'label'``
        column is added to *df* in place (callers rely on this side effect).
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(df)``.
    max_iterations : int, optional
        Upper bound on Lloyd iterations (default 100).

    Returns
    -------
    pandas.DataFrame
        The same *df*, now carrying an integer ``'label'`` column.
    """
    data1 = df.values
    if not 1 <= k <= data1.shape[0]:
        raise ValueError("k must be between 1 and the number of samples")
    # Pick k distinct rows as the initial centroids.
    centroids = data1[np.random.choice(range(data1.shape[0]), size=k, replace=False)]
    # Assign once up front so `labels` is defined even if max_iterations <= 0.
    labels = np.argmin(
        np.linalg.norm(data1[:, np.newaxis] - centroids, axis=-1), axis=-1)
    for _ in range(max_iterations):
        # Euclidean distance of every sample to every centroid.
        distances = np.linalg.norm(
            x=data1[:, np.newaxis] - centroids, axis=-1, ord=2)
        # Index of the nearest centroid for each sample.
        labels = np.argmin(distances, axis=-1)
        # Recompute centroids; a cluster that lost all its members keeps its
        # previous centroid instead of producing a NaN mean.
        new_centroids = np.array([
            data1[labels == label].mean(axis=0) if np.any(labels == label)
            else centroids[label]
            for label in range(k)])
        # Converged once no centroid moved.
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
    # Attach the cluster assignment to the caller's DataFrame (in place).
    df['label'] = labels
    return df
# Run the custom k-means with 4 clusters (note: mutates `data` in place).
res = kmeans_(data, 4)
res
0 | 1 | label | |
---|---|---|---|
0 | 1.658985 | 4.285136 | 0 |
1 | -3.453687 | 3.424321 | 1 |
2 | 4.838138 | -1.151539 | 3 |
3 | -5.379713 | -3.362104 | 2 |
4 | 0.972564 | 2.924086 | 0 |
... | ... | ... | ... |
75 | -2.793241 | -2.149706 | 2 |
76 | 2.884105 | 3.043438 | 0 |
77 | -2.967647 | 2.848696 | 1 |
78 | 4.479332 | -1.764772 | 3 |
79 | -4.905566 | -2.911070 | 2 |
80 rows × 3 columns
# Color each point by its assigned cluster label.
plt.scatter(res['0'], res['1'], c=res['label'], cmap='viridis')
plt.colorbar()
plt.show()
2. 调库
使用欧式距离
KMeans(
    n_clusters=8,
    *,
    init='k-means++',
    n_init='warn',
    max_iter=300,
    tol=0.0001,
    verbose=0,
    random_state=None,
    copy_x=True,
    algorithm='lloyd',
)
- n_clusters:指定聚类簇的数量,默认为8。
- init:指定初始化簇中心的方法,默认为'k-means++'。可以选择'k-means++'、'random'或自定义一个ndarray作为初始簇中心。
- n_init:指定使用不同随机初始簇中心的次数,默认为'warn'。如果设置为一个整数值,则会使用该整数值作为随机初始化的次数。
- max_iter:指定最大迭代次数,默认为300。算法会在达到最大迭代次数或收敛时停止迭代。
- tol:指定收敛判定的容忍度,默认为0.0001。当簇中心的移动距离小于容忍度时,算法认为已经收敛。
- verbose:指定详细程度,默认为0,不输出信息。设为1时,每一次迭代都会输出一些信息。
- random_state:指定随机数生成器的种子,默认为None。当需要结果可重复时,可以设置一个整数值。
- copy_x:在训练前是否复制输入数据,默认为True。如果设置为False,则在内部直接操作输入数据,可能会影响原始数据。
- algorithm:指定算法用于计算簇中心,默认为'lloyd'。可以选择'lloyd'(Lloyd's算法)或'elkan'(Elkan's算法)。
from sklearn.cluster import KMeans
# Reload the data so the 'label' column added by the custom run is dropped.
data = pd.read_csv('data.csv')
n_clusters = 4
km = KMeans(n_clusters=n_clusters, n_init=10, init='k-means++')
km.fit(data.values)
# Cluster centers and per-sample cluster labels.
display(km.cluster_centers_, km.labels_)
a = np.array([[0, 0]])
km.predict(a)
# If predict() raises, upgrade threadpoolctl to match sklearn:
# pip install -U threadpoolctl
array([[-2.46154315, 2.78737555],
[ 2.80293085, -2.7315146 ],
[-3.38237045, -2.9473363 ],
[ 2.6265299 , 3.10868015]])
array([3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2])
array([0])
可视化
# Two panels: the raw scatter on the left, the fitted clusters on the right.
fig, (ax_raw, ax_fit) = plt.subplots(1, 2, figsize=(10, 4))
ax_raw.scatter(x=data.iloc[:, 0], y=data.iloc[:, 1])
for cluster_id in range(n_clusters):
    members = km.labels_ == cluster_id
    center = km.cluster_centers_[cluster_id]
    # Points belonging to this cluster.
    ax_fit.scatter(data.loc[members, '0'], data.loc[members, '1'], marker='.')
    # Its centroid.
    ax_fit.scatter(center[0], center[1], marker='o')
plt.show()
3. 聚类结果可视化
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Synthetic data: 500 samples, 2 features, drawn from 4 Gaussian blobs.
X, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)
print(X.shape, y.shape)

fig, (left, right) = plt.subplots(1, 2, figsize=(10, 4))
left.scatter(X[:, 0], X[:, 1], marker='o', s=8)
color = ['red', 'pink', 'orange', 'gray']
# Right panel: the same points, colored by their true blob label.
for blob in range(4):
    right.scatter(X[y == blob, 0], X[y == blob, 1], marker='o', s=8, c=color[blob])
plt.tight_layout()
plt.show()
(500, 2) (500,)
from sklearn.cluster import KMeans

# Deliberately over-cluster (k=5) the 4-blob data to see the effect.
n_clusters = 5
km1 = KMeans(n_clusters=n_clusters, n_init=10)
km1.fit(X)

fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(10, 4))
ax_left.scatter(x=X[:, 0], y=X[:, 1])
color = ['red', 'pink', 'orange', 'green', 'blue']
for cid in range(n_clusters):
    members = km1.labels_ == cid
    ctr = km1.cluster_centers_[cid]
    ax_right.scatter(X[members, 0], X[members, 1], marker='.', s=8, c=color[cid])  # cluster members
    ax_right.scatter(ctr[0], ctr[1], marker='o', c='black')  # cluster center
plt.show()
# plt.scatter(X[:,0],X[:,1],marker='.',s=15,c=km1.labels_)
# plt.show()
4. 评价指标
误差平方和 SSE
# Sum of squared distances of each sample to its assigned center (SSE).
km1.inertia_
818.1063044387333
kmeans.score() 方法返回的是 SSE 的相反数,即 kmeans.score() 返回的值越大越好。
# score() returns the negative SSE, so values closer to 0 are better.
km1.score(X)
-818.1063044387333
4.2 轮廓系数 silhouette_score
# 越大越好
from sklearn.metrics import silhouette_score  # mean silhouette over all samples
from sklearn.metrics import silhouette_samples  # one silhouette value per sample

# A higher mean silhouette means better-separated clusters.
n_clusters = [3, 4, 5]
for k_try in n_clusters:
    fitted = KMeans(n_clusters=k_try, random_state=0, n_init=10).fit(X)
    print(silhouette_score(X, fitted.labels_))
0.5882004012129721
0.6505186632729437
0.5746932321727457
4.3 CH指标
CH指标是数据集的分离度与紧密度的比值,以各类中心点与数据集的中心点的距离平方和来度量数据集的分离度,以类内各点与其类中心的距离平方和来度量数据的紧密度。聚类效果越好,类间差距应该越大,类内差距越小,即类自身越紧密,类间越分散,CH指标值越大聚类效果越好。
from sklearn import metrics
from sklearn.cluster import KMeans

# Fit ONCE and derive SSE, silhouette, and CH from the same model.
# (The original fitted twice with random initialisation: SSE came from the
# first fit while silhouette/CH used the second fit's labels, so the three
# numbers could describe two different clusterings.)
km = KMeans(n_clusters=4, n_init=10)
km.fit(X)
y1 = km.predict(X)
# SSE: within-cluster sum of squared distances.
SSE = km.inertia_
# Silhouette coefficient (higher is better).
sc = metrics.silhouette_score(X, y1)
# Calinski-Harabasz index: between-/within-cluster dispersion ratio
# (higher is better).
ch = metrics.calinski_harabasz_score(X, y1)
display(SSE, sc, ch)
908.3855684760616
0.6505186632729437
2704.4858735121097
5. 根据轮廓系数选择n_cluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
# Silhouette analysis: for each candidate k, draw the per-cluster silhouette
# profile (left panel) next to the clustered scatter (right panel).
for n_clusters in [2, 3, 4, 5, 6, 7]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Reserve 10 y-units of padding between consecutive cluster "blades".
    ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])

    clusterer = KMeans(n_clusters=n_clusters, random_state=10, n_init=10).fit(X)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # The sorted silhouette values of cluster i form one filled blade.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          alpha=0.7)
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical line marking the dataset-wide average silhouette.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1],
                marker='o',
                s=8,
                c=colors)
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c="red", alpha=1, s=200)
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    plt.show()
图像y轴为类别,该图像由一条条水平方向的线组成。
观察聚类结果时,每一个类别的大致形状相同时,即为最佳簇数。
簇数为4时,每一个“刀片”的形状类似。
6. 数据文件kmeans_.csv
0,1
1.658985,4.285136
-3.453687,3.424321
4.838138,-1.151539
-5.379713,-3.362104
0.972564,2.924086
-3.567919,1.531611
0.450614,-3.302219
-3.487105,-1.724432
2.668759,1.594842
-3.156485,3.191137
3.165506,-3.999838
-2.786837,-3.099354
4.208187,2.984927
-2.123337,2.943366
0.704199,-0.479481
-0.39237,-3.963704
2.831667,1.574018
-0.790153,3.343144
2.943496,-3.357075
-3.195883,-2.283926
2.336445,2.875106
-1.786345,2.554248
2.190101,-1.90602
-3.403367,-2.778288
1.778124,3.880832
-1.688346,2.230267
2.592976,-2.054368
-4.007257,-3.207066
2.257734,3.387564
-2.679011,0.785119
0.939512,-4.023563
-3.674424,-2.261084
2.046259,2.735279
-3.18947,1.780269
4.372646,-0.822248
-2.579316,-3.497576
1.889034,5.1904
-0.798747,2.185588
2.83652,-2.658556
-3.837877,-3.253815
2.096701,3.886007
-2.709034,2.923887
3.367037,-3.184789
-2.121479,-4.232586
2.329546,3.179764
-3.284816,3.273099
3.091414,-3.815232
-3.762093,-2.432191
3.542056,2.778832
-1.736822,4.241041
2.127073,-2.98368
-4.323818,-3.938116
3.792121,5.135768
-4.786473,3.358547
2.624081,-3.260715
-4.009299,-2.978115
2.493525,1.96371
-2.513661,2.642162
1.864375,-3.176309
-3.171184,-3.572452
2.89422,2.489128
-2.562539,2.884438
3.491078,-3.947487
-2.565729,-2.012114
3.332948,3.983102
-1.616805,3.573188
2.280615,-2.559444
-2.651229,-3.103198
2.321395,3.154987
-1.685703,2.939697
3.031012,-3.620252
-4.599622,-2.185829
4.196223,1.126677
-2.133863,3.093686
4.668892,-2.562705
-2.793241,-2.149706
2.884105,3.043438
-2.967647,2.848696
4.479332,-1.764772
-4.905566,-2.91107