【机器学习】聚类算法——K-Means(上)

from sklearn.cluster import KMeans

# Demonstrates three ways of obtaining cluster labels from a KMeans model.
# NOTE: `data` is not defined in this snippet; it must be provided beforehand.
k = 4
model = KMeans(k,random_state = 420)
# Option 1: call fit() only; the label of each training sample is then
# available from the labels_ attribute.
model.fit(data)
y_pred = model.labels_
# Cluster centroids found by the fitted model.
centers = model.cluster_centers_
# inertia_ of the fitted model, stored here under the name SSE.
SSE = model.inertia_

# Option 2: fit and predict in a single call; suitable for small data sets.
y_pred_2 = model.fit_predict(data)

# Option 3: fit first, then call predict() separately; intended for large
# data sets.
# NOTE(review): the model is fitted on the full `data` here; presumably the
# intent for large data is to fit on a subset and predict the rest — confirm.
model.fit(data)
y_pred_3 = model.predict(data)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples,silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

# Silhouette analysis for KMeans with 2, 3 and 4 clusters.
# For every cluster count one figure is drawn: the left axes show the sorted
# per-sample silhouette coefficients grouped by cluster, the right axes show
# the samples coloured by predicted cluster together with the centroids.

# Prepare the data: keep only the feature matrix returned by make_blobs()
# (element 0) and name its two columns.
data_x = pd.DataFrame(make_blobs()[0], columns=['x1', 'x2'])

for i in range(2, 5):
    # Figure set-up: two side-by-side axes.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Leave room for a 10-unit gap below, between and above the cluster bands.
    ax1.set_ylim(0, data_x.shape[0] + (i + 1) * 10)

    # Clustering. The 1-D label array is passed to the silhouette functions
    # directly instead of a single-column DataFrame.
    model = KMeans(i, random_state=0).fit(data_x)
    labels = model.labels_
    y_pred = pd.DataFrame(labels)
    silhouette_all = silhouette_score(data_x, labels)
    silhouette_one = pd.DataFrame(silhouette_samples(data_x, labels))
    data = pd.concat([data_x, y_pred, silhouette_one], axis=1)
    data.columns = ['x1', 'x2', 'y_pred', 'silh']

    # Left plot: one horizontal band per cluster.
    y_lower = 10
    for j in range(i):
        # Silhouette coefficients of cluster j, sorted in descending order.
        silh = data.loc[data['y_pred'] == j, 'silh'].sort_values(
            ascending=False)
        size_silh = silh.shape[0]
        color = cm.nipy_spectral(float(j) / i)
        ax1.fill_betweenx(np.arange(y_lower, y_lower + size_silh),
                          silh,
                          facecolor=color,
                          alpha=0.7)
        # Label the band with its cluster index, vertically centred.
        ax1.text(-0.05, y_lower + 0.5 * size_silh, str(j))
        # Start the next band 10 units above this one, matching the padding
        # reserved in set_ylim above.
        y_lower = y_lower + size_silh + 10

    # Everything below is independent of the cluster index, so it is drawn
    # once per figure instead of once per cluster.
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical line at the mean silhouette score over all samples.
    ax1.axvline(x=silhouette_all, color="red", linestyle="--")

    # Right plot: samples coloured by their predicted cluster.
    colors = cm.nipy_spectral(labels.astype(float) / i)
    ax2.scatter(data['x1'], data['x2'], c=colors)
    # Mark all cluster centroids in a single call.
    centers = model.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1], marker='x',
                c="red", alpha=1, s=200)
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("For n_clusters = %d  "
                  "The average silhouette_score is : %f" % (i, silhouette_all)),
                 fontsize=25, fontweight='bold')
    
#     plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
#                   "with n_clusters = %d" % n_clusters),
#                  fontsize=14, fontweight='bold')

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值