KMeans 算法复习


声明:本文内容非原创,是对学习内容的总结,版权归姜老师所有。

1.关于聚类

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

聚类的基本使用

# Generate a synthetic data set to experiment with: 150 two-feature samples
# drawn around 3 centers (cluster_std=1.5); random_state=2 fixes the draw.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=150, n_features=2, centers=3 ,cluster_std=1.5, random_state=2)
# Display the generated samples and their true labels.
X,y
(array([[  1.53194956,  -0.36022153],
        [ -1.56430585,  -9.59730336],
        [ -1.08878137,  -0.53673972],
        [  2.43442247,  -0.15599663],
        [  0.48019529, -12.99688015],
        [  0.57898032,  -2.06887799],
        [ -0.72443515,  -7.44202457],
        [ -1.85225071,  -3.98632318],
        [  2.54279316,  -1.7870558 ],
        [ -0.52731615, -10.74779592],
        [ -2.29661532, -11.6406339 ],
        [  0.92141506,  -9.98499137],
        [ -5.35083116,  -0.65431189],
        [ -0.88989127,   0.11369336],
        [ -0.59631184,  -2.29097658],
        [ -1.50195028,  -5.4011869 ],
        [ -0.76364841,  -4.6539681 ],
        [  2.19201955,   0.60036835],
        [  7.15628849,  -0.06187083],
        [  4.81890691,  -3.50331202],
        [  0.65087822,  -4.39797054],
        [ -3.24158992,  -4.41559955],
        [ -1.13496627,  -1.67121333],
        [  0.03606565,  -2.04003449],
        [ -2.01792323,  -2.58566719],
        [  1.355409  ,  -0.54741367],
        [ -3.12791644,  -4.06556581],
        [ -5.26927614,  -9.6186543 ],
        [ -0.02442698,  -1.33977954],
        [ -2.86703029, -10.84498679],
        [  2.29764685,  -2.92418801],
        [ -1.18679697,  -1.80057881],
        [ -0.73325486,  -1.93333585],
        [ -0.52577983, -11.34940749],
        [ -0.50461407,  -3.93251527],
        [  1.90846569,  -0.6583068 ],
        [ -2.06104996,  -0.17628645],
        [ -0.47151448, -10.37571491],
        [  1.26386427,  -0.46380574],
        [ -0.36309079,  -9.40951948],
        [ -0.53887254,  -0.6449586 ],
        [ -2.23212091,  -8.718881  ],
        [ -4.16374507,  -3.50826293],
        [ -0.70730261,  -8.6320622 ],
        [  1.18048503,  -0.15879893],
        [ -1.41098559,  -4.66354671],
        [ -1.90907668,  -9.67996871],
        [  2.03754653,  -0.24742774],
        [ -0.97378999,  -7.371431  ],
        [  2.99659881,  -0.83960814],
        [ -3.4119278 ,  -9.71171816],
        [  0.10647516,  -2.83784632],
        [  3.68213884,  -1.93707213],
        [ -1.88594036, -11.55825336],
        [  0.46218028,  -8.90235829],
        [ -0.04304745,  -7.60915598],
        [ -1.78833491,  -9.83575141],
        [ -0.95592795, -12.26939394],
        [ -1.32676236,  -4.41753005],
        [  0.22044687, -10.05311414],
        [ -4.0709444 ,  -4.40679626],
        [ -4.62768987,  -2.99134472],
        [ -4.36824992,  -2.89757148],
        [ -4.08223794,  -6.88469836],
        [  0.77102877,   0.95860323],
        [ -0.05463537,  -2.68689003],
        [  0.44653092,  -2.60752136],
        [ -0.49365731,  -8.378556  ],
        [  0.65278373,  -5.68997024],
        [ -3.32769271,  -1.54225156],
        [ -0.36011954,  -2.18001056],
        [  2.2374372 ,  -0.3476192 ],
        [  1.5880298 ,  -1.7654783 ],
        [ -2.31262163,  -4.92277723],
        [ -0.28638281,  -2.50409338],
        [ -0.62985746,  -7.56390652],
        [  1.30709149,  -4.99949807],
        [ -3.88704121,  -7.92023943],
        [  0.31190778,  -0.52199607],
        [ -0.70822817,  -2.35468348],
        [ -2.44971637,  -2.95465548],
        [ -1.71601202,  -3.85030346],
        [ -2.06618377,  -5.41830673],
        [ -1.46459731,  -2.39530216],
        [ -2.52380489,  -9.34991004],
        [ -1.83223015,  -2.56988374],
        [ -1.02782509,  -3.59652323],
        [ -2.51078608,  -3.92019727],
        [ -2.63990045,  -3.03337678],
        [ -1.73623162,  -5.60353306],
        [ -0.89524628, -10.96464394],
        [ -5.15424798,  -3.30552368],
        [ -3.3851438 ,  -4.1251994 ],
        [  2.67007966,  -1.70491528],
        [  2.12119683,  -2.78419362],
        [ -2.25997736,  -8.21779094],
        [  1.74015978,  -1.10379588],
        [  3.29089003,  -4.27232081],
        [ -1.54379575,  -5.85414392],
        [ -1.75036425,  -8.32495776],
        [ -1.33945732,  -8.99247021],
        [ -2.92821038,  -7.10474478],
        [ -1.00719928,  -1.93003946],
        [  0.29073017,  -3.17563261],
        [ -1.28008731,  -8.66794651],
        [  1.54082983,  -0.1324291 ],
        [ -1.84360609,  -9.59318151],
        [ -0.597949  ,  -0.40605237],
        [ -2.23658448, -11.26289379],
        [ -3.19324464,  -4.3727003 ],
        [ -1.21779287, -11.15836353],
        [ -2.86763721,  -4.67181627],
        [  0.82161761,  -2.04081344],
        [ -0.45292089,  -6.04316334],
        [ -0.709394  ,  -9.80717827],
        [ -4.93225332,  -9.31238561],
        [ -0.23742255, -12.53167518],
        [ -2.40190838,  -9.46793749],
        [  2.65696448,  -3.94092874],
        [  0.10261618,   0.4306987 ],
        [  0.77075118,  -7.65464691],
        [ -1.97310998,  -8.95514262],
        [ -1.23044866,  -0.02408431],
        [ -0.83889419,   1.41316281],
        [  1.89552328,  -1.28806291],
        [  3.74624864,  -0.63251734],
        [  0.50567512,  -2.13390391],
        [ -3.56899486,  -6.43169397],
        [ -1.30528349,  -4.3866171 ],
        [ -2.97980187,  -8.83183653],
        [ -1.85237668,  -9.38174185],
        [  3.60596784,  -1.96480346],
        [ -1.59595363,  -3.6022414 ],
        [ -1.32254393,  -3.49370015],
        [ -4.34058653,  -9.41209208],
        [ -0.05036661,  -4.47612317],
        [ -1.60404567,  -4.79404957],
        [ -2.52020719,  -2.82511188],
        [ -2.5972638 ,  -9.71612662],
        [ -0.82276679,  -3.89556977],
        [  5.91286766,   0.16273983],
        [ -1.85513922,  -5.54901873],
        [  0.7183647 ,   0.23622995],
        [ -1.6836874 ,  -6.13442518],
        [ -0.0856312 ,  -2.16867404],
        [ -1.3087977 ,  -7.71897353],
        [ -2.42206812,  -2.94401336],
        [  0.50787945,  -0.65781509],
        [ -3.18469257,  -3.55607882],
        [ -1.50676754,  -3.15467085]]),
 array([1, 0, 2, 1, 0, 1, 0, 2, 1, 0, 0, 0, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2,
        1, 2, 2, 1, 2, 0, 2, 0, 1, 1, 1, 0, 2, 1, 2, 0, 1, 0, 1, 0, 2, 0,
        1, 2, 0, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 2, 2, 2, 0, 1, 1,
        1, 0, 2, 2, 1, 1, 1, 2, 2, 0, 2, 0, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2,
        2, 0, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 2,
        0, 2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 2, 0, 0, 1,
        2, 2, 0, 2, 2, 2, 0, 2, 1, 2, 1, 0, 1, 0, 2, 1, 2, 2]))
sns.set()
X.shape
(150, 2)
y.shape
(150,)
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.Accent_r)
<matplotlib.collections.PathCollection at 0x1a3478924f0>


请添加图片描述

# Import KMeans.
from sklearn.cluster import KMeans
# 1. Instantiate the KMeans object.
# Key parameter: n_clusters (int, default 8) -- the number of initial
# centroids, and therefore the number of clusters the data is split into.
# How to choose k: it depends on what the business problem requires.
km = KMeans(n_clusters=3)
# sklearn offers two equivalent ways to cluster with KMeans: the combined
# km.fit_predict(X) used below, or the two separate steps
#   km.fit(X)
#   km.predict(X)
# 2. fit_predict: unsupervised learning, training and prediction in one call.
y_ = km.fit_predict(X)
y_
array([2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 2, 2, 2, 0, 0,
       0, 2, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 0, 2, 0, 1, 2, 1, 2, 1, 0, 1,
       2, 0, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 2, 0,
       2, 1, 0, 0, 0, 2, 2, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 2, 2, 1, 2, 2, 0, 1, 1, 1, 0, 0, 1, 2, 1, 2, 1, 0,
       1, 0, 2, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2,
       0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 2, 0, 2, 1, 0, 2, 0, 0])
# Fetch the final cluster centres of the fitted model (one row per cluster).
cluster_centers = km.cluster_centers_
cluster_centers
array([[-1.80632868, -3.67173199],
       [-1.62473796, -9.4792349 ],
       [ 1.58887503, -1.06495221]])
# Side-by-side comparison: ground-truth labels (left) vs. KMeans labels (right).
ax1, ax2 = plt.subplots(1, 2, figsize=(12, 4))[1]

# Left panel: samples coloured by the labels that make_blobs generated.
ax1.set_title('True')
ax1.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.cool)

# Right panel: samples coloured by predicted cluster, centroids as red stars.
ax2.set_title('Kmeans')
ax2.scatter(X[:, 0], X[:, 1], c=y_, cmap=plt.cm.rainbow_r)
ax2.scatter(*cluster_centers.T, color='red', marker='*', s=300)

plt.show()


请添加图片描述

球队综合实力聚类分析

data = pd.read_csv('AsiaZoo.txt',header=None)
data
        0              1    2    3
0       中国            50   50    9
1       日本            28    9    4
2       韩国            17   15    3
3       伊朗            25   40    5
4       沙特            28   40    2
5       伊拉克          50   50    1
6       卡塔尔          50   40    9
7       阿联酋          50   40    9
8       乌兹别克斯坦    40   40    5
9       泰国            50   50    9
10      越南            50   50    5
11      阿曼            50   50    9
12      巴林            40   40    9
13      朝鲜            40   32   17
14      印尼            50   50    9
data.columns = ['国家','2006年世界杯','2010年世界杯','2007年亚洲杯']
data
        国家            2006年世界杯   2010年世界杯   2007年亚洲杯
0       中国            50             50             9
1       日本            28             9              4
2       韩国            17             15             3
3       伊朗            25             40             5
4       沙特            28             40             2
5       伊拉克          50             50             1
6       卡塔尔          50             40             9
7       阿联酋          50             40             9
8       乌兹别克斯坦    40             40             5
9       泰国            50             50             9
10      越南            50             50             5
11      阿曼            50             50             9
12      巴林            40             40             9
13      朝鲜            40             32             17
14      印尼            50             50             9
# Needed on older Matplotlib releases to register the '3d' projection;
# harmless on newer ones.
from mpl_toolkits.mplot3d import Axes3D
# Plain white seaborn style for the 3-D figure.
sns.set_style(style='white')

# Use a font that contains Chinese glyphs for the axis labels, and keep the
# minus sign renderable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Draw the three features in 3-D, one axis per tournament.
plt.figure(figsize=(10,6))
ax = plt.subplot(projection='3d')

# No `c=` is supplied here (there are no cluster labels yet), so no `cmap` is
# passed either: Matplotlib ignores a colormap without colour data and warns.
ax.scatter3D(data['2006年世界杯'],data['2010年世界杯'],data['2007年亚洲杯'],s = 200)
ax.set_xlabel('2006年世界杯')
ax.set_ylabel('2010年世界杯')
ax.set_zlabel('2007年亚洲杯')
plt.show()


请添加图片描述

# Build the feature matrix X: keep every column except the first one
# ('国家', the country name), which is not a numeric feature.
X = data.iloc[:,1:]
X
        2006年世界杯   2010年世界杯   2007年亚洲杯
0       50             50             9
1       28             9              4
2       17             15             3
3       25             40             5
4       28             40             2
5       50             50             1
6       50             40             9
7       50             40             9
8       40             40             5
9       50             50             9
10      50             50             5
11      50             50             9
12      40             40             9
13      40             32             17
14      50             50             9
# 1. Instantiate KMeans with three clusters.
km = KMeans(n_clusters=3)
# 2. Fit on the team features and get one cluster label per team.
y_ = km.fit_predict(X)
y_
array([0, 1, 1, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0])
# Same 3-D view of the three features, now coloured by the KMeans cluster label.
plt.figure(figsize=(10, 6))
ax = plt.subplot(projection='3d')

# Label the axes first; each axis carries one tournament's column.
ax.set_xlabel('2006年世界杯')
ax.set_ylabel('2010年世界杯')
ax.set_zlabel('2007年亚洲杯')

ax.scatter3D(
    data['2006年世界杯'],
    data['2010年世界杯'],
    data['2007年亚洲杯'],
    s=200,
    c=y_,
    cmap=plt.cm.rainbow,
)
plt.show()

请添加图片描述

# Add the predicted cluster label as a new column of the original data set
# so the clustering result can be read next to each country.
data['簇类'] = y_
data
        国家            2006年世界杯   2010年世界杯   2007年亚洲杯   簇类
0       中国            50             50             9              0
1       日本            28             9              4              1
2       韩国            17             15             3              1
3       伊朗            25             40             5              2
4       沙特            28             40             2              2
5       伊拉克          50             50             1              0
6       卡塔尔          50             40             9              0
7       阿联酋          50             40             9              0
8       乌兹别克斯坦    40             40             5              2
9       泰国            50             50             9              0
10      越南            50             50             5              0
11      阿曼            50             50             9              0
12      巴林            40             40             9              2
13      朝鲜            40             32             17             2
14      印尼            50             50             9              0
data.groupby('簇类').groups
{0: [0, 5, 6, 7, 9, 10, 11, 14], 1: [1, 2], 2: [3, 4, 8, 12, 13]}
data.groupby('簇类').groups.items()
dict_items([(0, Int64Index([0, 5, 6, 7, 9, 10, 11, 14], dtype='int64')), (1, Int64Index([1, 2], dtype='int64')), (2, Int64Index([3, 4, 8, 12, 13], dtype='int64'))])
# Print the countries of each cluster (tier), one cluster per line.
# Only the row indexes of each group are needed, so iterate the values of
# the groups mapping; this also avoids binding `_`, which IPython uses for
# the last cell output.
for indexes in data.groupby('簇类').groups.values():
    countries = data.loc[indexes,'国家']
    # Two spaces between names and after the last one, then a newline.
    print(*countries, sep='  ', end='  \n')
中国  伊拉克  卡塔尔  阿联酋  泰国  越南  阿曼  印尼  
日本  韩国  
伊朗  沙特  乌兹别克斯坦  巴林  朝鲜  

2.kmeans中常见的错误

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

a. k值选择不合理

# Case: an unsuitable choice of k.
# The data is generated around 3 centers; later cells cluster it with other k values.
X ,y = make_blobs(n_samples=150, n_features=2, centers=3, random_state=2, cluster_std=2)
sns.set()
# 将画图函数进行封装 画图时传入X y即可快速画图
def show_scatter(X,y):
    """Draw a 2-D scatter plot of the samples and display it.

    X : 2-D array; column 0 goes on the x-axis, column 1 on the y-axis.
    y : per-sample values used to colour the points (rainbow colormap).
    """
    xs, ys = X[:, 0], X[:, 1]
    plt.scatter(xs, ys, c=y, cmap=plt.cm.rainbow)
    plt.show()
# 用这个封装的函数进行绘制
show_scatter(X,y)


请添加图片描述

# Instantiate KMeans with k=2 (the data was generated with 3 centers) and
# get one cluster label per sample.
km = KMeans(n_clusters=2)
y_ = km.fit_predict(X)
# 封装一个函数 展现y 和y_ 的对比 【 true 和 predict】
def show_predict(X,y,y_):
    """Plot the true labels and the predicted clusters side by side.

    X  : 2-D array; column 0 goes on the x-axis, column 1 on the y-axis.
    y  : per-sample colour values for the left panel, titled 'True'.
    y_ : per-sample colour values for the right panel, titled
         'Kmeans prediction'.
    """
    # (subplot position, colour values, colormap, title) for each panel;
    # 121 is the left half of a 1x2 layout, 122 the right half.
    panels = (
        (121, y, plt.cm.rainbow, 'True'),
        (122, y_, plt.cm.cool, 'Kmeans prediction'),
    )

    # Wide canvas so the two panels sit next to each other.
    plt.figure(figsize=(12, 4))
    for position, colours, cmap, title in panels:
        plt.subplot(position)
        plt.scatter(X[:, 0], X[:, 1], c=colours, s=100, cmap=cmap)
        plt.title(title)

    plt.show()

# 调用函数进行图像绘制
show_predict(X,y,y_)

请添加图片描述


km = KMeans(n_clusters=7)
y_ = km.fit_predict(X)
show_predict(X,y,y_)


请添加图片描述

b. 数据存在偏差 (必然存在)

# Simulate skewed data (an extreme case): apply a fixed 2x2 linear
# transformation to every sample.
trans = [[0.6,-0.6],[-0.4,0.8]]
X1 = np.dot(X, trans)
X1
array([[  1.04655426,  -1.06619879],
       [  2.85894079,  -6.71330586],
       [ -0.7186756 ,   0.88485489],
       [  1.65961265,  -1.57033723],
       [  6.30764932, -11.97512202],
       [  1.19546232,  -2.12639029],
       [  2.38135532,  -5.08623838],
       [  0.51032678,  -2.18392533],
       [  2.61620742,  -3.39683023],
       [  4.30212858,  -8.77008969],
       [  3.3628695 ,  -8.30701087],
       [  5.05428445,  -9.11541647],
       [ -4.06561027,   4.1690844 ],
       [ -1.14367301,   1.37678308],
       [  0.61089302,  -1.38030672],
       [  1.5451611 ,  -3.97335364],
       [  1.7372859 ,  -3.76696174],
       [  1.06229566,  -0.56962559],
       [  5.38690504,  -5.2474292 ],
       [  5.35243507,  -7.04839453],
       [  2.49516333,  -4.668274  ],
       [ -0.3721972 ,  -1.53034875],
       [ -0.38778277,  -0.33105738],
       [  0.98295923,  -1.61853714],
       [ -0.36922777,  -0.55735425],
       [  1.00515763,  -1.12463797],
       [ -0.46794307,  -1.24791822],
       [ -0.09364828,  -3.77210397],
       [  0.56109582,  -0.82320444],
       [  2.4821924 ,  -7.00198864],
       [  3.02656089,  -4.41365421],
       [ -0.3602524 ,  -0.42758268],
       [  0.07338504,  -0.93202387],
       [  4.62421714,  -9.41303776],
       [  1.55973854,  -3.20463953],
       [  1.50674598,  -1.68536932],
       [ -1.68873222,   2.04715326],
       [  4.14832672,  -8.41784462],
       [  0.88733094,  -0.96222038],
       [  3.71976143,  -7.4739751 ],
       [ -0.45824364,   0.28673935],
       [  1.85619681,  -5.24206996],
       [ -1.59383419,   0.1752011 ],
       [  3.0297481 ,  -6.36931789],
       [  0.65795725,  -0.5701764 ],
       [  1.22452476,  -3.25930919],
       [  2.62721231,  -6.52566558],
       [  1.39087515,  -1.35036299],
       [  2.14422155,  -4.8114547 ],
       [  2.47394653,  -2.74926392],
       [  1.44186445,  -5.35725076],
       [  1.46478647,  -2.52586403],
       [  3.60769267,  -4.46832419],
       [  3.64747318,  -8.54767826],
       [  4.10949232,  -7.59322003],
       [  3.01560224,  -5.80962205],
       [  2.80688984,  -6.78842721],
       [  4.77075809, -10.05023814],
       [  0.92348261,  -3.10702502],
       [  4.52984205,  -8.62730621],
       [ -1.04037586,  -0.857475  ],
       [ -2.24067973,   1.09773636],
       [ -2.08314016,   0.99020918],
       [ -0.60212755,  -1.80551487],
       [ -0.26555557,   0.94928424],
       [  1.01817619,  -2.27871059],
       [  1.37677927,  -2.59498371],
       [  3.06546103,  -6.26982751],
       [  3.42296609,  -6.00517641],
       [ -1.97353169,   1.60343801],
       [  0.50345313,  -1.49365181],
       [  1.60422314,  -1.61714642],
       [  1.84088873,  -2.61000354],
       [  0.6414722 ,  -2.81451291],
       [  0.97249854,  -1.85557453],
       [  2.52202118,  -5.29190794],
       [  3.57816048,  -5.79211897],
       [  0.1063184 ,  -3.06624939],
       [  0.15680059,  -0.26272488],
       [  0.31812512,  -1.40148269],
       [ -0.51786853,  -0.60550725],
       [  0.54677387,  -2.14782857],
       [  1.10290489,  -3.540228  ],
       [ -0.02809505,  -0.79695896],
       [  1.95939845,  -5.68182042],
       [ -0.22909115,  -0.68907304],
       [  0.96197396,  -2.42767921],
       [ -0.05176867,  -1.58656273],
       [ -0.6280311 ,  -0.53732937],
       [  0.59138935,  -2.3157436 ],
       [  4.12343675,  -8.70705014],
       [ -2.49436411,   1.18385863],
       [ -0.64192038,  -1.10574549],
       [  2.67422834,  -3.41104287],
       [  2.8107372 ,  -4.12316684],
       [  1.56666362,  -4.68528874],
       [  1.60969543,  -2.02591295],
       [  4.54015959,  -6.64625707],
       [  1.7532618 ,  -4.42303141],
       [  2.03150974,  -5.2072905 ],
       [  2.71624194,  -6.24802934],
       [  0.43845259,  -2.96345309],
       [  0.08968327,  -0.66659717],
       [  1.55513133,  -3.07632843],
       [  2.59065863,  -5.94936671],
       [  0.93216919,  -0.83032442],
       [  2.63330227,  -6.48546903],
       [ -0.63292147,   0.58883382],
       [  3.20943279,  -7.95211276],
       [ -0.35640058,  -1.52326577],
       [  3.9687166 ,  -8.6556471 ],
       [  0.06361389,  -2.10280876],
       [  1.37460439,  -2.29056461],
       [  1.85250741,  -3.81133115],
       [  3.65480355,  -7.62110191],
       [  0.01262668,  -3.71503562],
       [  5.48544573, -10.90480912],
       [  2.11986363,  -5.90523358],
       [  3.85627671,  -5.78563176],
       [ -0.51873656,   0.92091616],
       [  3.69090297,  -6.50918461],
       [  2.18941176,  -5.70129111],
       [ -1.34263749,   1.50226615],
       [ -1.79592572,   2.72208617],
       [  1.83226198,  -2.34675524],
       [  2.96321796,  -3.12808692],
       [  1.17149864,  -2.13710711],
       [  0.44112921,  -3.41892551],
       [  1.16139064,  -3.04847928],
       [  1.318295  ,  -4.7644111 ],
       [  2.51351799,  -6.25291692],
       [  3.56154592,  -4.43696748],
       [  0.51052083,  -1.97927577],
       [  0.67135991,  -2.08222618],
       [  0.53913689,  -4.29472262],
       [  2.21306072,  -4.14788593],
       [  1.13967822,  -3.24406418],
       [ -0.6433511 ,  -0.41093476],
       [  2.09594684,  -6.01368432],
       [  1.28551209,  -2.91070882],
       [  4.27237602,  -4.01310782],
       [  1.34145359,  -3.84848977],
       [  0.07757892,   0.220884  ],
       [  0.91656719,  -2.9240639 ],
       [  0.71699766,  -1.70115019],
       [  2.0615714 ,  -4.91416057],
       [ -0.50142572,  -0.61627426],
       [  0.38601474,  -0.56437584],
       [ -0.78509038,  -0.65904451],
       [  0.3431654 ,  -1.57321605]])
show_scatter(X1,y)

请添加图片描述

show_scatter(X,y)

请添加图片描述

# Instantiate a fresh 3-cluster model.
km2 =KMeans(n_clusters=3)
# Fit and predict on the skewed data X1.
y2_ = km2.fit_predict(X1)
# NOTE(review): the labels were learned on X1 but are drawn at the original X
# coordinates; if the skewed space is what should be shown, X1 was probably
# intended here -- confirm.
show_predict(X , y ,y2_)

请添加图片描述

c. 标准偏差不相同cluster_std

单位不同,数值的量级也不同:同一笔钱以"元"计是 8000,以"万元"计是 0.8。

# Case: clusters with different spread. cluster_std is given as a list, one
# standard deviation per cluster (1, 2 and 4).
X , y = make_blobs(n_samples=150, n_features=2, random_state=2, cluster_std=[1,2,4])
show_scatter(X,y)

请添加图片描述

km3 = KMeans(n_clusters=3)
y3_ = km3.fit_predict(X)
show_predict(X,y,y3_)

请添加图片描述

from sklearn.preprocessing import StandardScaler
# Standardise the features, then check the per-column standard deviation.
# NOTE(review): ss_X is not used by any later cell visible here; clustering
# on ss_X may have been the intended follow-up -- confirm.
ss_X  = StandardScaler().fit_transform(X)
ss_X.std(axis=0)
array([1., 1.])

d. 样本数量不同

X, y = make_blobs(n_samples=1500, n_features=2, centers=3, random_state=5)
# Deliberately keep 100 / 35 / 15 samples of classes 0 / 1 / 2 so that the
# clusters have very different sizes.
# Features of the unbalanced data set:
XA, XB, XC = X[y == 0][:100], X[y == 1][:35], X[y == 2][:15]
XX = np.concatenate((XA, XB, XC))

# Matching labels, in the same order as the rows of XX.
y = np.repeat([0, 1, 2], [100, 35, 15])
XX.shape
(150, 2)
y.shape
(150,)
show_scatter(XX,y)

请添加图片描述

km4 = KMeans(n_clusters=3)
y4_ = km4.fit_predict(XX)
show_predict(XX, y, y4_)


请添加图片描述

e. 使用轮廓系数来判断聚类的效果

X ,y = make_blobs(n_samples=150, n_features=2, centers=3, random_state=3,cluster_std=3)
show_scatter(X,y)


请添加图片描述

from sklearn.metrics import silhouette_score
# Cluster with k=3 and keep the predicted labels.
kmeans = KMeans(n_clusters=3)
y_ = kmeans.fit_predict(X)
# Score the clustering with the silhouette coefficient.
silhouette_score(X,y_)
0.39860514202079084
# 定义一个函数进行绘图,以发现不同的轮廓系数的图像和得分
def show_clusters_edge(kmeans, X):
    """Fit ``kmeans`` on ``X`` and plot the cluster regions with the score.

    A 200 x 200 grid covering the bounding box of ``X`` is labelled by the
    fitted model and drawn as the background; the samples are drawn on top,
    coloured by their own predicted label. The silhouette score of the
    sample labels is shown in the title.

    kmeans : estimator providing ``fit`` and ``predict``; it is refitted here.
    X      : 2-D array whose first two columns are the plotted features.
    """
    # Background grid spanning the min/max of each plotted feature.
    col0, col1 = X[:, 0], X[:, 1]
    grid_x, grid_y = np.meshgrid(
        np.linspace(col0.min(), col0.max(), 200),
        np.linspace(col1.min(), col1.max(), 200),
    )
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]

    # Train on the samples, then label both the samples and the grid.
    kmeans.fit(X)
    sample_labels = kmeans.predict(X)
    grid_labels = kmeans.predict(grid_points)

    # Grid first so the samples are drawn over it.
    plt.scatter(grid_points[:, 0], grid_points[:, 1], c=grid_labels, s=100, cmap=plt.cm.Accent)
    plt.scatter(col0, col1, c=sample_labels, s=100, cmap=plt.cm.cool)

    score = silhouette_score(X, sample_labels)
    plt.title('silhouette_score:%.4f' % score)
    plt.show()
kmeans = KMeans(n_clusters=3)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述

kmeans = KMeans(n_clusters=4)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述

kmeans = KMeans(n_clusters=5)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述

kmeans = KMeans(n_clusters=6)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述

kmeans = KMeans(n_clusters=7)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述

kmeans = KMeans(n_clusters=2)
show_clusters_edge(kmeans,X)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:240: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0.0, flags=flags)
D:\software\anaconda\lib\site-packages\matplotlib\backends\backend_agg.py:203: RuntimeWarning: Glyph 65306 missing from current font.
  font.set_text(s, 0, flags=flags)

请添加图片描述


  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值