0.k-means算法
1.概念
k-means 属于无监督学习的聚类算法,适用于簇内相似性较高、簇间相似性较低的数据。k 个初始聚簇中心的选择会影响最终的聚类结果。
2.实现过程:
(1) 选择初始的 k 个聚簇中心。
(2) 对聚簇中心之外的每个点,依次计算它到每个聚簇中心向量的距离,选择距离最近的聚簇中心,并将该点加入对应的簇。
(3) 更新聚簇中心:取簇内所有点的向量的平均值。
(4) 重复步骤 (2)、(3),直到达到最大迭代次数,或者聚簇中心不再发生改变。
1. 导入数据集
import numpy as np
import pandas as pd
# Load the iris data set from a CSV file at a path relative to the working directory.
data= pd. read_csv( "data/iris.csv" )
# Keep only the first four columns as the feature table used for clustering.
t= data. iloc[ : , : 4 ]
# Bare expression: displays the feature table when run as a notebook cell.
t
SepalLength SepalWidth PetalLength PetalWidth 0 5.1 3.5 1.4 0.2 1 4.9 3.0 1.4 0.2 2 4.7 3.2 1.3 0.2 3 4.6 3.1 1.5 0.2 4 5.0 3.6 1.4 0.2 ... ... ... ... ... 145 6.7 3.0 5.2 2.3 146 6.3 2.5 5.0 1.9 147 6.5 3.0 5.2 2.0 148 6.2 3.4 5.4 2.3 149 5.9 3.0 5.1 1.8
150 rows × 4 columns
2. K-Means算法实现
class KMeans:
    """K-means clustering implemented with NumPy.

    After ``fit`` the instance exposes:

    * ``cluster_center_`` -- float array of shape (k, n_features) holding the
      final cluster centers.
    * ``labels_`` -- float array of shape (n_samples,) holding, for every
      training sample, the index of its nearest center.  The float dtype is
      kept for backward compatibility with existing callers.
    """

    def __init__(self, k, times):
        """Store the hyper-parameters.

        Parameters
        ----------
        k : int
            Number of clusters.
        times : int
            Maximum number of assignment/update iterations.
        """
        self.k = k
        self.times = times

    def _nearest_center(self, X):
        """Return, as floats, the index of the closest center for each row of X."""
        # Broadcast to (n_samples, k, n_features) so every sample is compared
        # with every center in a single vectorized step.
        diff = X[:, np.newaxis, :] - self.cluster_center_[np.newaxis, :, :]
        dist = np.sqrt(np.sum(diff ** 2, axis=2))
        return dist.argmin(axis=1).astype(float)

    def fit(self, X):
        """Cluster the samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.

        Raises
        ------
        ValueError
            If X is empty or ``k`` is not in ``1..n_samples``.
        """
        # Force a float array: with integer input the centers would otherwise
        # inherit the integer dtype and every updated mean would be truncated.
        X = np.asarray(X, dtype=float)
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if not 0 < self.k <= n_samples:
            raise ValueError(
                "k must be between 1 and the number of samples (%d), got %r"
                % (n_samples, self.k)
            )

        # A private generator with the same fixed seed keeps the result
        # reproducible without resetting NumPy's global random state.
        rng = np.random.RandomState(666)
        seeds = rng.randint(0, n_samples, self.k)
        if len(np.unique(seeds)) < self.k:
            # The draw above samples with replacement; if it picked the same
            # row twice, redraw without replacement so the centers start from
            # k different samples.
            seeds = rng.choice(n_samples, self.k, replace=False)
        self.cluster_center_ = X[seeds]
        self.labels_ = np.zeros(n_samples)

        for _ in range(self.times):
            # Assignment step: attach every sample to its nearest center.
            self.labels_ = self._nearest_center(X)

            # Update step: move every center to the mean of its members.
            updated = self.cluster_center_.copy()
            for i in range(self.k):
                members = X[self.labels_ == i]
                # An empty cluster has no mean (it would become NaN and then
                # attract every sample via argmin); keep its previous center.
                if len(members):
                    updated[i] = members.mean(axis=0)

            # Unchanged centers mean further iterations would repeat the same
            # assignment, so stop early.
            if np.array_equal(updated, self.cluster_center_):
                break
            self.cluster_center_ = updated

    def predict(self, X):
        """Return the index of the nearest cluster center for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to assign; ``fit`` must have been called first.

        Returns
        -------
        numpy.ndarray
            Float array of shape (n_samples,) with the cluster index of each
            sample.
        """
        X = np.asarray(X, dtype=float)
        return self._nearest_center(X)
3. 创建kmeans对象,进行分类
# Build a model with k=3 clusters and at most 50 iterations, then fit it on
# the four-feature table t.
kmeans= KMeans( 3 , 50 )
kmeans. fit( t)
# Bare expression: displays the fitted cluster centers in a notebook cell.
kmeans. cluster_center_
array([[6.85 , 3.07368421, 5.74210526, 2.07105263],
[5.006 , 3.418 , 1.464 , 0.244 ],
[5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])
# Boolean-mask selection: the rows of t whose assigned cluster label is 0.
t[ kmeans. labels_== 0 ]
SepalLength SepalWidth PetalLength PetalWidth 52 6.9 3.1 4.9 1.5 77 6.7 3.0 5.0 1.7 100 6.3 3.3 6.0 2.5 102 7.1 3.0 5.9 2.1 103 6.3 2.9 5.6 1.8 104 6.5 3.0 5.8 2.2 105 7.6 3.0 6.6 2.1 107 7.3 2.9 6.3 1.8 108 6.7 2.5 5.8 1.8 109 7.2 3.6 6.1 2.5 110 6.5 3.2 5.1 2.0 111 6.4 2.7 5.3 1.9 112 6.8 3.0 5.5 2.1 115 6.4 3.2 5.3 2.3 116 6.5 3.0 5.5 1.8 117 7.7 3.8 6.7 2.2 118 7.7 2.6 6.9 2.3 120 6.9 3.2 5.7 2.3 122 7.7 2.8 6.7 2.0 124 6.7 3.3 5.7 2.1 125 7.2 3.2 6.0 1.8 128 6.4 2.8 5.6 2.1 129 7.2 3.0 5.8 1.6 130 7.4 2.8 6.1 1.9 131 7.9 3.8 6.4 2.0 132 6.4 2.8 5.6 2.2 134 6.1 2.6 5.6 1.4 135 7.7 3.0 6.1 2.3 136 6.3 3.4 5.6 2.4 137 6.4 3.1 5.5 1.8 139 6.9 3.1 5.4 2.1 140 6.7 3.1 5.6 2.4 141 6.9 3.1 5.1 2.3 143 6.8 3.2 5.9 2.3 144 6.7 3.3 5.7 2.5 145 6.7 3.0 5.2 2.3 147 6.5 3.0 5.2 2.0 148 6.2 3.4 5.4 2.3
# Assign three new four-feature samples to their nearest fitted cluster center.
kmeans. predict( [ [ 6.2 , 2.5 , 5.0 , 2.1 ] , [ 5.2 , 4.0 , 5.6 , 2.5 ] , [ 5.0 , 4.0 , 6.0 , 4.0 ] ] )
array([2., 0., 0.])
4. 进行可视化
import matplotlib as mpl
import matplotlib.pyplot as plt

# Cluster on the two sepal columns only, so the result can be drawn in 2-D.
t2 = data.loc[:, "SepalLength":"SepalWidth"]
my_kmeans = KMeans(3, 50)
my_kmeans.fit(t2)

# SimHei is selected so the Chinese title/legend text can be rendered (the
# font has to be installed); unicode_minus is turned off so minus signs are
# drawn with the ASCII hyphen.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

plt.figure(figsize=(10, 10))
# Colour the points with the labels of the model fitted on t2 (my_kmeans),
# i.e. the same model whose centers are drawn below.  Using the labels of the
# four-feature model here would mix two different clusterings in one plot.
for cluster_id in range(my_kmeans.k):
    members = t2[my_kmeans.labels_ == cluster_id]
    plt.scatter(
        members.iloc[:, 0],
        members.iloc[:, 1],
        label="类别{}".format(cluster_id + 1),
    )
# Mark the cluster centers with large "+" symbols.
plt.scatter(
    my_kmeans.cluster_center_[:, 0],
    my_kmeans.cluster_center_[:, 1],
    marker="+",
    s=300,
)
plt.title("聚类分析")
plt.xlabel("SepalLength")
plt.ylabel("SepalWidth")
plt.legend()
<matplotlib.legend.Legend at 0x26005241308>
5. 使用 sklearn 包的 KMeans
import numpy as np
from sklearn import datasets
# Load the iris data set that ships with scikit-learn.
iris = datasets. load_iris( )
# Bare expression: displays the feature names in a notebook cell.
iris. feature_names
['sepal length (cm)',
'sepal width (cm)',
'petal length (cm)',
'petal width (cm)']
# NOTE: from this import on, the name KMeans refers to scikit-learn's class;
# any class previously bound to that name in this session is shadowed.
from sklearn. cluster import KMeans
# Three clusters, 'k-means++' initialisation, fixed random_state.
kmeans= KMeans( n_clusters= 3 , init= 'k-means++' , random_state= 666 )
# Fit on the iris feature matrix and keep the cluster index of every sample.
predict= kmeans. fit_predict( iris. data)
# Bare expression: displays the label array in a notebook cell.
predict
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])
# Import and configure matplotlib before the first plt call, so this cell
# also works when it is run without the earlier plotting cell.
import matplotlib as mpl
import matplotlib.pyplot as plt

# SimHei is selected so the Chinese title/legend text can be rendered (the
# font has to be installed); unicode_minus is turned off so minus signs are
# drawn with the ASCII hyphen.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

plt.figure(figsize=(10, 10))
# One scatter series per cluster, using the first two feature columns.
for cluster_id in range(kmeans.n_clusters):
    mask = predict == cluster_id
    plt.scatter(
        iris.data[mask, 0],
        iris.data[mask, 1],
        label='类型{}'.format(cluster_id + 1),
    )
# Mark the cluster centers with large "+" symbols.
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    marker="+",
    s=300,
)
plt.title('鸢尾花聚簇分类')
plt.xlabel('sepalLength')
plt.ylabel('sepalWidth')
plt.legend()
<matplotlib.legend.Legend at 0x1c4d6076308>