K-means聚类
1.原理
2.sklearn代码实战
2.1生成数据集
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# 2.1 Build a toy dataset: 150 two-dimensional samples drawn from 3 Gaussian blobs.
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)
# Show the raw (unlabelled) points.
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolor='black', s=50)
plt.show()
2.2用kmeans聚类并可视化
# 2.2 Fit k-means (random init, 10 restarts) and visualise the three clusters.
model = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_pred = model.fit_predict(X)

# One scatter call per cluster; colour/marker/label match the original figure.
cluster_styles = [
    ('lightgreen', 's', 'cluster 1'),
    ('orange', 'o', 'cluster 2'),
    ('lightblue', 'v', 'cluster 3'),
]
for idx, (colour, mark, lab) in enumerate(cluster_styles):
    pts = X[y_pred == idx]
    plt.scatter(
        pts[:, 0], pts[:, 1],
        s=50, c=colour,
        marker=mark, edgecolor='black',
        label=lab,
    )

# Centroids drawn last so they sit on top of the points.
centers = model.cluster_centers_
plt.scatter(
    centers[:, 0], centers[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()
2.3对比不同初始k值对聚类的影响
计算inertia随着k变化的情况
# 2.3 Elbow method: plot inertia (within-cluster SSE) as k goes from 1 to 9.
distortions = []
k_range = range(1, 10)
for k in k_range:
    model = KMeans(
        n_clusters=k, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0,
    )
    model.fit(X)
    distortions.append(model.inertia_)

plt.plot(k_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
3.手写Kmeans++的代码实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import seaborn as sns
class KMeansCluster:
    """
    Prototype-based clustering: plain k-means with a deterministic
    farthest-point ("greedy k-means++") centre initialisation.
    """

    def __init__(self, data, k=3, max_epochs=100, tol=1e-3, dist_method="euclidean"):
        """
        :param data: sample matrix, shape (m, n_features)
        :param k: number of clusters
        :param max_epochs: maximum number of update iterations
        :param tol: stop when the total centre movement in one epoch < tol
        :param dist_method: distance metric name; only "euclidean" is implemented
        """
        self.X = data
        self.m = data.shape[0]          # number of samples
        self.k = k
        self.max_epochs = max_epochs
        self.tol = tol
        self.dist_method = dist_method
        self.distance_fun = self.distance_function()
        self.cluster_centers = dict()   # cluster index -> centre vector

    def distance_function(self):
        """
        Return the pairwise-distance callable for the configured metric.

        :return: a function (x, y) -> float
        :raises ValueError: if ``dist_method`` is not supported.
        """
        if self.dist_method == "euclidean":
            return lambda x, y: np.sqrt(((x - y) ** 2).sum())
        # Fail fast: the original silently returned None here, which only
        # blew up later with a cryptic "NoneType is not callable".
        raise ValueError("unsupported distance method: %r" % (self.dist_method,))

    def select_cluster_center(self):
        """
        Initialise the k centres with a greedy farthest-point scheme
        (deterministic variant of k-means++):

        1. pick one sample uniformly at random as the first centre;
        2. for every remaining sample x, compute D(x) = distance from x to
           its NEAREST already-chosen centre;
        3. take the sample with the largest D(x) as the next centre;
        4. repeat 2-3 until k centres have been chosen.
        """
        first = int(np.random.choice(self.m))
        self.cluster_centers[0] = self.X[first]
        chosen = [first]
        while len(self.cluster_centers) < self.k:
            best_j, best_dist = None, -1.0
            for j in range(self.m):
                if j in chosen:
                    continue
                # D(x): distance to the CLOSEST existing centre.  The original
                # maximised the distance to ANY centre, which can select a
                # point sitting right next to an already-chosen centre.
                d_min = min(
                    self.distance_fun(center, self.X[j])
                    for center in self.cluster_centers.values()
                )
                if d_min > best_dist:
                    best_j, best_dist = j, d_min
            chosen.append(best_j)
            self.cluster_centers[len(self.cluster_centers)] = self.X[best_j]
        print("k-means++算法,初始化簇中心向量为:")
        for key in self.cluster_centers.keys():
            print("簇" + str(key + 1), self.cluster_centers[key])
        print("-" * 100)

    def fit_kmeans(self):
        """
        Core k-means loop: assign every sample to its nearest centre, then
        recompute each centre as the mean of its members.  Stops when the
        summed centre movement in one epoch drops below ``tol`` or after
        ``max_epochs`` iterations.
        """
        for _ in range(self.max_epochs):
            cluster = {idx: [] for idx in range(self.k)}
            for j in range(self.m):
                best_cluster_idx, min_dist = None, np.inf  # np.infty was removed in NumPy 2.0
                for c_idx in self.cluster_centers.keys():
                    dist = self.distance_fun(self.cluster_centers[c_idx], self.X[j])
                    if dist < min_dist:
                        best_cluster_idx, min_dist = c_idx, dist
                cluster[best_cluster_idx].append(j)
            eps = 0.0
            for c_idx in self.cluster_centers.keys():
                members = cluster[c_idx]
                if not members:
                    # Empty cluster: keep its centre unchanged instead of
                    # taking np.mean of an empty slice (NaN centre).
                    continue
                # BUG FIX: the original read the module-global ``X`` here
                # instead of ``self.X``, silently coupling the class to
                # whatever the script last assigned to X.
                vec_center = np.mean(self.X[members], axis=0)
                eps += self.distance_fun(vec_center, self.cluster_centers[c_idx])
                self.cluster_centers[c_idx] = vec_center
            if eps < self.tol:
                break

    def predict(self, X):
        """
        Label each sample with the index of its nearest cluster centre.

        :param X: samples to classify, shape (n, n_features)
        :return: ndarray of n cluster indices
        """
        cluster_labels = []
        for i in range(X.shape[0]):
            best_j, min_dist = None, np.inf
            for idx in range(self.k):
                dist = self.distance_fun(self.cluster_centers[idx], X[i])
                if dist < min_dist:
                    min_dist, best_j = dist, idx
            cluster_labels.append(best_j)
        return np.asarray(cluster_labels)

    def plt_classify(self):
        """
        For 2-D data: plot the decision regions on a 50x50 grid, the samples
        coloured by predicted cluster, and the centres as black pentagons.
        """
        x1_min, x2_min = self.X.min(axis=0)
        x1_max, x2_max = self.X.max(axis=0)
        t1 = np.linspace(x1_min, x1_max, 50)
        t2 = np.linspace(x2_min, x2_max, 50)
        x1, x2 = np.meshgrid(t1, t2)
        x_show = np.stack((x1.flat, x2.flat), axis=1)
        cm_light = ListedColormap(["g", "r", "b", "m", "c"])
        cm_dark = ListedColormap(["g", "r", "b", "m", "c"])
        y_show_hat = self.predict(x_show)
        y_show_hat = y_show_hat.reshape(x1.shape)
        plt.figure(facecolor='w')
        plt.pcolormesh(x1, x2, y_show_hat, shading='auto', cmap=cm_light, alpha=0.3)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.predict(self.X).ravel(), s=20, cmap=cm_dark)
        for key in self.cluster_centers.keys():
            center = self.cluster_centers[key]
            plt.scatter(center[0], center[1], c="k", marker="p", s=100)
        plt.xlabel("X1", fontsize=11)
        plt.ylabel("X2", fontsize=11)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        # ``b=`` was removed in matplotlib 3.5; ``visible`` is the current name.
        plt.grid(visible=True, ls=':', color='#606060')
        plt.title('K-means classification boundary and Cluster Center Vec', fontsize=12)
        plt.show()
if __name__ == '__main__':
    # Load the RFM-style consumption dataset (rows -> samples, columns -> features).
    X = pd.read_csv("datasets/consumption_data.csv").values
    cluster_k = 3

    # Handwritten k-means: initialise centres, iterate, then label every sample.
    kmc = KMeansCluster(X, k=cluster_k, tol=1e-8)
    kmc.select_cluster_center()
    kmc.fit_kmeans()
    labels = kmc.predict(X)

    print("K均值算法收敛到簇中心向量:")
    for key in kmc.cluster_centers:
        print("簇" + str(key + 1), kmc.cluster_centers[key])

    # Per-feature density curves, one subplot per feature, one curve per cluster.
    title = ["R index", "F index", "M index"]
    plt.figure(figsize=(7, 10))
    for f in range(X.shape[1]):
        plt.subplot(311 + f)
        for c in range(cluster_k):
            sns.kdeplot(X[labels == c][:, f])
        plt.grid()
        plt.title(title[f])
    plt.show()

    # Same visualisation for scikit-learn's KMeans, as a reference.
    from sklearn.cluster import KMeans
    skm = KMeans(n_clusters=cluster_k).fit(X)
    print(skm.cluster_centers_)

    title = ["SR index", "SF index", "SM index"]
    plt.figure(figsize=(7, 10))
    for f in range(X.shape[1]):
        plt.subplot(311 + f)
        for c in range(cluster_k):
            sns.kdeplot(X[skm.labels_ == c][:, f])
        plt.grid()
        plt.title(title[f])
    plt.show()