机器学习基础算法27-聚类实战

最新推荐文章于 2024-01-11 22:08:31 发布

哎呦-_-不错

最新推荐文章于 2024-01-11 22:08:31 发布

阅读量199

点赞数

文章标签：聚类 机器学习 python

本BLOG上原创文章未经本人许可，不得用于商业用途，转载请注明出处。

本文链接：https://blog.csdn.net/weixin_46649052/article/details/108109413

版权

1.密度聚类

# 密度聚类
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    """Pad the interval ``[a, b]`` by 10% of its length on both ends.

    Returns the pair ``(lower, upper)`` where ``lower = a - 0.1 * (b - a)``
    and ``upper = b + 0.1 * (b - a)``.
    """
    margin = (b - a) * 0.1
    lower = a - margin
    upper = b + margin
    return lower, upper


if __name__ == "__main__":
    # Demo: run DBSCAN with several (eps, min_samples) settings on the same
    # synthetic data set and draw one subplot per setting.
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]

    # ds.make_blobs generates the data set:
    #   n_features   - number of features per sample
    #   n_samples    - number of samples (N, passed positionally)
    #   centers      - here the explicit coordinates of the blob centers
    #   cluster_std  - standard deviation of each blob (not the variance)
    #   random_state - seed, so the generated data is reproducible
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    print(data)
    # Preprocessing: standardize every feature.
    data = StandardScaler().fit_transform(data)
    # Parameter grid: (epsilon, min_samples)
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))

    # Use a font that can render the Chinese titles, and keep the minus sign
    # readable with that font.
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'DBSCAN聚类', fontsize=20)

    # The axis limits depend only on the data, so compute them once.
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)

    # Iterate over the parameter grid itself instead of a hard-coded range(6).
    for i, (eps, min_samples) in enumerate(params):
        model = DBSCAN(eps=eps, min_samples=min_samples)
        model.fit(data)
        # Cluster label of every sample; -1 marks noise.
        y_hat = model.labels_

        # Boolean mask that is True for the core samples.
        core_indices = np.zeros_like(y_hat, dtype=bool)
        core_indices[model.core_sample_indices_] = True

        # np.unique returns the distinct labels sorted in ascending order.
        y_unique = np.unique(y_hat)
        # The noise label is not a cluster, so do not count it.
        n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
        print(y_unique, '聚类簇的个数为：', n_clusters)

        plt.subplot(2, 3, i + 1)
        # One colour per distinct label, taken from the Spectral colormap.
        clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
        print(clrs)
        for k, clr in zip(y_unique, clrs):
            cur = (y_hat == k)
            if k == -1:
                # Noise points are drawn small and black.
                plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k')
                continue
            # Wrap the RGBA row in a list: a bare 1-D RGBA sequence is
            # ambiguous for scatter() and would be value-mapped when the
            # cluster happens to contain exactly four points.
            plt.scatter(data[cur, 0], data[cur, 1], s=30, c=[clr], edgecolors='k')
            # Core samples of the cluster are drawn again with a larger marker.
            core_points = data[cur & core_indices]
            plt.scatter(core_points[:, 0], core_points[:, 1], s=60, c=[clr], marker='o', edgecolors='k')
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        # Raw string: '\e' is not a valid escape sequence in a normal literal.
        plt.title(r'$\epsilon$ = %.1f  m = %d，聚类数目：%d' % (eps, min_samples, n_clusters), fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()

[-1  0  1  2  3] 聚类簇的个数为： 4
[[0.61960784 0.00392157 0.25882353 1.        ]
 [0.95686275 0.42745098 0.2627451  1.        ]
 [0.99607843 0.87843137 0.54509804 1.        ]
 [0.90196078 0.96078431 0.59607843 1.        ]
 [0.4        0.76078431 0.64705882 1.        ]]

[-1  0  1  2  3  4] 聚类簇的个数为： 5
[[0.61960784 0.00392157 0.25882353 1.        ]
 [0.90442138 0.3479431  0.28304498 1.        ]
 [0.9928489  0.71695502 0.4094579  1.        ]
 [0.99915417 0.97377932 0.70503652 1.        ]
 [0.81122645 0.92387543 0.61453287 1.        ]
 [0.4        0.76078431 0.64705882 1.        ]]

[-1  0] 聚类簇的个数为： 1
[[0.61960784 0.00392157 0.25882353 1.        ]
 [0.4        0.76078431 0.64705882 1.        ]]

[-1  0  1] 聚类簇的个数为： 2
[[0.61960784 0.00392157 0.25882353 1.        ]
 [0.99607843 0.87843137 0.54509804 1.        ]
 [0.4        0.76078431 0.64705882 1.        ]]

[-1  0  1  2  3] 聚类簇的个数为： 4
[[0.61960784 0.00392157 0.25882353 1.        ]
 [0.95686275 0.42745098 0.2627451  1.        ]
 [0.99607843 0.87843137 0.54509804 1.        ]
 [0.90196078 0.96078431 0.59607843 1.        ]
 [0.4        0.76078431 0.64705882 1.        ]]

在这里插入图片描述

2.K-Means图像矢量量化

from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


def restore_image(cb, cluster, shape):
    """Rebuild an image from a codebook and per-pixel cluster labels.

    Parameters
    ----------
    cb : array-like
        Codebook; ``cb[k]`` is the colour assigned to cluster ``k``.
    cluster : array-like of int
        Flat sequence of cluster indices, one per pixel in row-major order.
        Only the first ``row * col`` entries are used.
    shape : sequence
        Target image shape; only the first two entries (rows, columns) are
        read, so both ``(row, col)`` and ``(row, col, channels)`` work.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(row, col, 3)`` whose pixel ``(r, c)`` equals
        ``cb[cluster[r * col + c]]``.

    Raises
    ------
    IndexError
        If ``cluster`` holds fewer than ``row * col`` labels.
    """
    row, col = shape[0], shape[1]
    n_pixels = row * col
    image = np.empty((row, col, 3))
    if n_pixels == 0:
        # Nothing to fill; also avoids an ambiguous reshape on empty data.
        return image

    labels = np.asarray(cluster)
    if labels.shape[0] < n_pixels:
        raise IndexError(
            'cluster has %d labels but the image needs %d'
            % (labels.shape[0], n_pixels))

    # Look up every pixel's colour in one vectorized indexing operation
    # instead of a per-pixel Python loop; the assignment casts to float and
    # broadcasts exactly like the element-wise version did.
    codebook = np.asarray(cb)
    image[...] = codebook[labels[:n_pixels]].reshape(row, col, -1)
    return image


def show_scatter(a):
    """Plot how the colours in ``a`` are distributed over a 10x10x10 grid.

    ``a`` is printed, then binned with a 3-D histogram over the cube
    [0, 1]^3 (so it needs three columns). Two figures are drawn and shown:

    1. a 3-D scatter with one marker per bin, sized by the bin's count
       relative to the fullest bin;
    2. the relative counts of the non-empty bins, sorted from largest to
       smallest.
    """
    bins_per_axis = 10
    print('原始数据：\n', a)

    hist, _ = np.histogramdd(a, bins=[bins_per_axis] * 3, range=[(0, 1)] * 3)
    # Normalize so the fullest bin has weight 1.
    hist /= hist.max()

    # Index grids aligned with the histogram axes (axis 0, 1, 2).
    ticks = np.arange(bins_per_axis)
    axis0, axis1, axis2 = np.meshgrid(ticks, ticks, ticks, indexing='ij')

    # Figure 1: 3-D scatter, marker area proportional to the bin weight.
    fig3d = plt.figure(1, facecolor='w')
    axes3d = fig3d.add_subplot(111, projection='3d')
    axes3d.scatter(axis0, axis1, axis2, c='r', s=100*hist, marker='o', depthshade=True)
    axes3d.set_xlabel('红色分量')
    axes3d.set_ylabel('绿色分量')
    axes3d.set_zlabel('蓝色分量')
    plt.title('图像颜色三维频数分布', fontsize=20)

    # Figure 2: weights of the non-empty bins in descending order.
    plt.figure(2, facecolor='w')
    occupied = hist[hist > 0]
    ranked = np.sort(occupied)[::-1]
    positions = np.arange(len(ranked))
    plt.plot(positions, ranked, 'r-', positions, ranked, 'go', lw=2)
    plt.title('图像颜色频数分布', fontsize=18)
    plt.grid(True)

    plt.show()


if __name__ == '__main__':
    # Demo: colour quantization of an image with K-Means. The cluster centers
    # act as a colour codebook and every pixel is replaced by its nearest one.

    # Use a font that can render the Chinese titles, and keep the minus sign
    # readable with that font.
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    num_vq = 60  # number of colours (clusters) in the codebook
    # Other sample images noted by the author, presumably with a suggested
    # num_vq each: son.bmp(100)/flower2.png(200)/son.png(60)/lena.png(50)
    # The context manager closes the image file once the pixels are copied.
    with Image.open('flower2.png') as im:
        # Scale the pixel values by 1/255 (into 0-1 for 8-bit data).
        # np.float was removed from NumPy; the builtin float is the same dtype.
        image = np.array(im).astype(float) / 255
    # Keep only the first three channels (drops an alpha channel if present).
    image = image[:, :, :3]
    image_v = image.reshape((-1, 3))

    show_scatter(image_v)

    N = image_v.shape[0]    # total number of pixels in the image
    # Fit the cluster centers on a random sample of 1000 pixels (drawn with
    # replacement) rather than on every pixel.
    idx = np.random.randint(0, N, size=1000)
    image_sample = image_v[idx]
    model = KMeans(num_vq)
    model.fit(image_sample)
    c = model.predict(image_v)  # cluster index of every pixel
    print('聚类结果：\n', c)
    print('聚类中心：\n', model.cluster_centers_)

    plt.figure(figsize=(15, 8), facecolor='w')
    plt.subplot(121)
    plt.axis('off')
    plt.title(u'原始图片', fontsize=18)
    plt.imshow(image)

    plt.subplot(122)
    vq_image = restore_image(model.cluster_centers_, c, image.shape)
    plt.axis('off')
    plt.title(u'矢量量化后图片：%d色' % num_vq, fontsize=20)
    plt.imshow(vq_image)

    # pad must be passed by keyword; the positional form is no longer accepted
    # by current Matplotlib releases.
    plt.tight_layout(pad=1.2)
    plt.show()

原始数据：
 [[0.29411765 0.50588235 0.12941176]
 [0.30196078 0.51372549 0.1372549 ]
 [0.30588235 0.51764706 0.14117647]
 ...
 [0.09803922 0.35686275 0.01176471]
 [0.08627451 0.34117647 0.01176471]
 [0.07058824 0.3254902  0.        ]]
聚类结果：
 [23 23 23 ... 10 57 57]
聚类中心：
 [[0.14771242 0.01699346 0.44509804]
 [0.45139319 0.65634675 0.30443756]
 [0.22207698 0.44560639 0.09600581]
 [0.71279178 0.55406162 0.99010271]
 [0.3874183  0.24771242 0.76176471]
 [0.74154995 0.66610644 0.50289449]
 [0.02337255 0.21396078 0.00298039]
 [0.87098039 0.6927451  0.99647059]
 [0.38879552 0.61045752 0.24014939]
 [0.2611578  0.51951447 0.00737628]
 [0.11184314 0.3732549  0.00376471]
 [0.90653595 0.83594771 0.66601307]
 [0.439819   0.62594268 0.05339367]
 [0.70588235 0.84607843 0.46176471]
 [0.5054902  0.36862745 0.85411765]
 [0.52254902 0.50784314 0.32892157]
 [0.28872549 0.15392157 0.60294118]
 [0.01535948 0.07320261 0.0120915 ]
 [0.46117647 0.35058824 0.43058824]
 [0.3627451  0.61336898 0.00641711]
 [0.78431373 0.62913165 0.98804855]
 [0.62254902 0.78431373 0.10980392]
 [0.55098039 0.56862745 0.66470588]
 [0.27598039 0.50906863 0.12598039]
 [0.82712418 0.74183007 0.57418301]
 [0.96176471 0.92843137 0.98921569]
 [0.05709343 0.28512111 0.00253749]
 [0.34215686 0.27418301 0.62254902]
 [0.61568627 0.77039216 0.41705882]
 [0.44989107 0.32440087 0.78453159]
 [0.15869281 0.42474946 0.00522876]
 [0.56339869 0.41525054 0.92374728]
 [0.74901961 0.77703081 0.92156863]
 [0.77921569 0.89019608 0.52941176]
 [0.30292117 0.55910364 0.00792317]
 [0.15163399 0.1379085  0.02156863]
 [0.15042017 0.28459384 0.03361345]
 [0.56862745 0.63372549 0.24078431]
 [0.42009804 0.27696078 0.87303922]
 [0.23398693 0.09782135 0.53834423]
 [0.22352941 0.35098039 0.23333333]
 [0.91437908 0.78366013 0.99444444]
 [0.47058824 0.47607843 0.2054902 ]
 [0.32941176 0.1795207  0.71089325]
 [0.67736185 0.59215686 0.43707665]
 [0.17670127 0.37600923 0.03171857]
 [0.55238095 0.70644258 0.33697479]
 [0.5        0.65980392 0.17009804]
 [0.07712418 0.01568627 0.18562092]
 [0.31421569 0.35833333 0.08333333]
 [0.52091503 0.69803922 0.02875817]
 [0.58954248 0.53071895 0.81960784]
 [0.62229102 0.47389061 0.95190918]
 [0.29934641 0.18039216 0.36993464]
 [0.21597322 0.46791009 0.00966045]
 [0.59383754 0.5719888  0.43137255]
 [0.33411765 0.53427451 0.19701961]
 [0.0951634  0.33071895 0.00392157]
 [0.38966132 0.5885918  0.14153298]
 [0.48954248 0.41045752 0.62287582]]