K-means聚类
1.原理
2.sklearn代码实战
2.1生成数据集
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# 2.1 Build a toy dataset: 150 two-dimensional samples drawn from 3 Gaussian blobs.
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)
# Show the raw (unlabelled) points.
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolor='black', s=50)
plt.show()
2.2用kmeans聚类并可视化
# 2.2 Fit k-means (random init, 10 restarts) and visualise the three clusters.
model = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_pred = model.fit_predict(X)

# One scatter call per cluster; colour/marker/label match the original figure.
cluster_styles = [
    ('lightgreen', 's', 'cluster 1'),
    ('orange', 'o', 'cluster 2'),
    ('lightblue', 'v', 'cluster 3'),
]
for idx, (colour, mark, lab) in enumerate(cluster_styles):
    pts = X[y_pred == idx]
    plt.scatter(
        pts[:, 0], pts[:, 1],
        s=50, c=colour,
        marker=mark, edgecolor='black',
        label=lab,
    )

# Centroids drawn last so they sit on top of the points.
centers = model.cluster_centers_
plt.scatter(
    centers[:, 0], centers[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()
2.3对比不同初始k值对聚类的影响
计算inertia随着k变化的情况
# 2.3 Elbow method: plot inertia (within-cluster SSE) as k goes from 1 to 9.
distortions = []
k_range = range(1, 10)
for k in k_range:
    model = KMeans(
        n_clusters=k, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0,
    )
    model.fit(X)
    distortions.append(model.inertia_)

plt.plot(k_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
3.手写Kmeans++的代码实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import seaborn as sns
class KMeansCluster:
    """
    Prototype-based clustering: plain k-means with a deterministic
    farthest-point ("greedy k-means++") centre initialisation.
    """

    def __init__(self, data, k=3, max_epochs=100, tol=1e-3, dist_method="euclidean"):
        """
        :param data: sample matrix, shape (m, n_features)
        :param k: number of clusters
        :param max_epochs: maximum number of update iterations
        :param tol: stop when the total centre movement in one epoch < tol
        :param dist_method: distance metric name; only "euclidean" is implemented
        """
        self.X = data
        self.m = data.shape[0]          # number of samples
        self.k = k
        self.max_epochs = max_epochs
        self.tol = tol
        self.dist_method = dist_method
        self.distance_fun = self.distance_function()
        self.cluster_centers = dict()   # cluster index -> centre vector

    def distance_function(self):
        """
        Return the pairwise-distance callable for the configured metric.

        :return: a function (x, y) -> float
        :raises ValueError: if ``dist_method`` is not supported.
        """
        if self.dist_method == "euclidean":
            return lambda x, y: np.sqrt(((x - y) ** 2).sum())
        # Fail fast: the original silently returned None here, which only
        # blew up later with a cryptic "NoneType is not callable".
        raise ValueError("unsupported distance method: %r" % (self.dist_method,))

    def select_cluster_center(self):
        """
        Initialise the k centres with a greedy farthest-point scheme
        (deterministic variant of k-means++):

        1. pick one sample uniformly at random as the first centre;
        2. for every remaining sample x, compute D(x) = distance from x to
           its NEAREST already-chosen centre;
        3. take the sample with the largest D(x) as the next centre;
        4. repeat 2-3 until k centres have been chosen.
        """
        first = int(np.random.choice(self.m))
        self.cluster_centers[0] = self.X[first]
        chosen = [first]
        while len(self.cluster_centers) < self.k:
            best_j, best_dist = None, -1.0
            for j in range(self.m):
                if j in chosen:
                    continue
                # D(x): distance to the CLOSEST existing centre.  The original
                # maximised the distance to ANY centre, which can select a
                # point sitting right next to an already-chosen centre.
                d_min = min(
                    self.distance_fun(center, self.X[j])
                    for center in self.cluster_centers.values()
                )
                if d_min > best_dist:
                    best_j, best_dist = j, d_min
            chosen.append(best_j)
            self.cluster_centers[len(self.cluster_centers)] = self.X[best_j]
        print("k-means++算法,初始化簇中心向量为:")
        for key in self.cluster_centers.keys():
            print("簇" + str(key + 1), self.cluster_centers[key])
        print("-" * 100)

    def fit_kmeans(self):
        """
        Core k-means loop: assign every sample to its nearest centre, then
        recompute each centre as the mean of its members.  Stops when the
        summed centre movement in one epoch drops below ``tol`` or after
        ``max_epochs`` iterations.
        """
        for _ in range(self.max_epochs):
            cluster = {idx: [] for idx in range(self.k)}
            for j in range(self.m):
                best_cluster_idx, min_dist = None, np.inf  # np.infty was removed in NumPy 2.0
                for c_idx in self.cluster_centers.keys():
                    dist = self.distance_fun(self.cluster_centers[c_idx], self.X[j])
                    if dist < min_dist:
                        best_cluster_idx, min_dist = c_idx, dist
                cluster[best_cluster_idx].append(j)
            eps = 0.0
            for c_idx in self.cluster_centers.keys():
                members = cluster[c_idx]
                if not members:
                    # Empty cluster: keep its centre unchanged instead of
                    # taking np.mean of an empty slice (NaN centre).
                    continue
                # BUG FIX: the original read the module-global ``X`` here
                # instead of ``self.X``, silently coupling the class to
                # whatever the script last assigned to X.
                vec_center = np.mean(self.X[members], axis=0)
                eps += self.distance_fun(vec_center, self.cluster_centers[c_idx])
                self.cluster_centers[c_idx] = vec_center
            if eps < self.tol:
                break

    def predict(self, X):
        """
        Label each sample with the index of its nearest cluster centre.

        :param X: samples to classify, shape (n, n_features)
        :return: ndarray of n cluster indices
        """
        cluster_labels = []
        for i in range(X.shape[0]):
            best_j, min_dist = None, np.inf
            for idx in range(self.k):
                dist = self.distance_fun(self.cluster_centers[idx], X[i])
                if dist < min_dist:
                    min_dist, best_j = dist, idx
            cluster_labels.append(best_j)
        return np.asarray(cluster_labels)

    def plt_classify(self):
        """
        For 2-D data: plot the decision regions on a 50x50 grid, the samples
        coloured by predicted cluster, and the centres as black pentagons.
        """
        x1_min, x2_min = self.X.min(axis=0)
        x1_max, x2_max = self.X.max(axis=0)
        t1 = np.linspace(x1_min, x1_max, 50)
        t2 = np.linspace(x2_min, x2_max, 50)
        x1, x2 = np.meshgrid(t1, t2)
        x_show = np.stack((x1.flat, x2.flat), axis=1)
        cm_light = ListedColormap(["g", "r", "b", "m", "c"])
        cm_dark = ListedColormap(["g", "r", "b", "m", "c"])
        y_show_hat = self.predict(x_show)
        y_show_hat = y_show_hat.reshape(x1.shape)
        plt.figure(facecolor='w')
        plt.pcolormesh(x1, x2, y_show_hat, shading='auto', cmap=cm_light, alpha=0.3)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.predict(self.X).ravel(), s=20, cmap=cm_dark)
        for key in self.cluster_centers.keys():
            center = self.cluster_centers[key]
            plt.scatter(center[0], center[1], c="k", marker="p", s=100)
        plt.xlabel("X1", fontsize=11)
        plt.ylabel("X2", fontsize=11)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        # ``b=`` was removed in matplotlib 3.5; ``visible`` is the current name.
        plt.grid(visible=True, ls=':', color='#606060')
        plt.title('K-means classification boundary and Cluster Center Vec', fontsize=12)
        plt.show()
if __name__ == '__main__':
    # Load the RFM-style consumption dataset (rows -> samples, columns -> features).
    X = pd.read_csv("datasets/consumption_data.csv").values
    cluster_k = 3

    # Handwritten k-means: initialise centres, iterate, then label every sample.
    kmc = KMeansCluster(X, k=cluster_k, tol=1e-8)
    kmc.select_cluster_center()
    kmc.fit_kmeans()
    labels = kmc.predict(X)

    print("K均值算法收敛到簇中心向量:")
    for key in kmc.cluster_centers:
        print("簇" + str(key + 1), kmc.cluster_centers[key])

    # Per-feature density curves, one subplot per feature, one curve per cluster.
    title = ["R index", "F index", "M index"]
    plt.figure(figsize=(7, 10))
    for f in range(X.shape[1]):
        plt.subplot(311 + f)
        for c in range(cluster_k):
            sns.kdeplot(X[labels == c][:, f])
        plt.grid()
        plt.title(title[f])
    plt.show()

    # Same visualisation for scikit-learn's KMeans, as a reference.
    from sklearn.cluster import KMeans
    skm = KMeans(n_clusters=cluster_k).fit(X)
    print(skm.cluster_centers_)

    title = ["SR index", "SF index", "SM index"]
    plt.figure(figsize=(7, 10))
    for f in range(X.shape[1]):
        plt.subplot(311 + f)
        for c in range(cluster_k):
            sns.kdeplot(X[skm.labels_ == c][:, f])
        plt.grid()
        plt.title(title[f])
    plt.show()