Kmeans算法原理:
选择K个点作为初始质心
repeat
将每个点指派到最近的质心,形成K个簇
重新计算每个簇的质心
until 簇不发生变化或达到最大迭代次数
案例:对于NBA球员的平均每分钟助攻和平均每分钟得分数据,进行聚类。
1.利用python的第三方库sklearn模块实现Kmeans算法:
# -*- coding: utf-8 -*-
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# X holds, per NBA player, the average assists per minute (column 0)
# and the average points per minute (column 1).
X = [[0.0888, 0.585],
     [0.1399, 0.8291],
     [0.0747, 0.4974],
     [0.0938, 0.5772],
     [0.1276, 0.5703],
     [0.1671, 0.5835],
     [0.1906, 0.5276],
     [0.1061, 0.5523],
     [0.2446, 0.4007],
     [0.1670, 0.4770],
     [0.2485, 0.4313],
     [0.1227, 0.4909],
     [0.1240, 0.5668],
     [0.1461, 0.5113],
     [0.2315, 0.3788],
     [0.0494, 0.5590],
     [0.1107, 0.4799],
     [0.2521, 0.5735],
     [0.1007, 0.6318],
     [0.1067, 0.4326],
     [0.1956, 0.4280]
     ]
print(X)

# Cluster the players into 3 groups.
# print() (rather than the py2-only "print X" statement) behaves
# identically on Python 2 for a single argument and also runs on Python 3.
clf = KMeans(n_clusters=3)
y_pred = clf.fit_predict(X)
print(clf)
print(y_pred)

# Split the samples into x/y coordinate lists for plotting.
x = [n[0] for n in X]
print(x)
y = [n[1] for n in X]
print(y)

# Visualize the cluster assignment (color = predicted cluster).
plt.scatter(x, y, c=y_pred, marker='x')
plt.title('KMeans-Basketball Data')
plt.xlabel('assists_per_minute')
plt.ylabel('points_per_minute')
plt.legend(['Rank'])
plt.show()
最后得到的聚类结果如下图所示:
2.不调用python中的sklearn模块,自行实现kmeans聚类算法:
kmeans.py程序:
from numpy import *
import time
import matplotlib.pyplot as plt
# calculate Euclidean distance
def euclDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two same-shaped vectors."""
    diff = vector2 - vector1
    return sqrt((diff ** 2).sum())
# init centroids with random samples
def initCentroids(dataSet, k):
    """Pick k distinct random rows of dataSet as the initial centroids.

    The original drew indices with random.uniform, i.e. with replacement,
    so two centroids could start at the same sample and collapse a
    cluster.  Sampling a permutation guarantees k distinct rows
    (requires k <= number of samples).
    """
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    # first k entries of a random permutation = k distinct row indices
    for i, index in enumerate(random.permutation(numSamples)[:k]):
        centroids[i, :] = dataSet[index, :]
    return centroids
# k-means cluster
def kmeans(dataSet, k):
    """Cluster the rows of dataSet into k groups with Lloyd's algorithm.

    Returns (centroids, clusterAssment): centroids is a (k, dim) array;
    clusterAssment is a (numSamples, 2) matrix whose first column is the
    cluster index of each sample and whose second column is the squared
    distance from the sample to its centroid.
    """
    numSamples = dataSet.shape[0]
    # first column stores which cluster this sample belongs to,
    # second column stores the squared error between this sample
    # and its centroid
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True

    ## step 1: init centroids
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        ## step 2: assign every sample to its closest centroid
        # range (not the py2-only xrange) keeps this py2/py3 compatible
        for i in range(numSamples):
            minDist = inf  # was a magic 100000.0; inf is always an upper bound
            minIndex = 0
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            ## step 3: update the sample's assignment; the error column is
            ## refreshed unconditionally so it stays current even when the
            ## assignment itself did not change
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        ## step 4: recompute each centroid as the mean of its members
        # (per-iteration debug print of the assignment column removed)
        for j in range(k):
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            # an empty cluster keeps its previous centroid instead of
            # becoming NaN (mean of an empty array)
            if pointsInCluster.shape[0] > 0:
                centroids[j, :] = mean(pointsInCluster, axis=0)

    print('Congratulations, cluster complete!')
    return centroids, clusterAssment
# show your cluster only available with 2-D data
def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot the samples colored per cluster, plus the centroids.

    Returns 1 on argument errors (data not 2-D, or k larger than the
    marker palette); otherwise shows the plot and returns None.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Error! the dimension of your data is not 2!")
        return 1

    # one marker style per cluster for the samples
    sampleMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sampleMarks):
        print("Sorry! Your k is too large! ")
        return 1

    # draw all samples with their cluster's marker
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], sampleMarks[markIndex])

    # draw the centroids with a separate (mostly diamond) marker set,
    # in a distinct list instead of silently reassigning the same name
    centroidMarks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], centroidMarks[i], markersize=12)
    plt.show()
数据集testSet1.csv(与test_kmeans.py中读取的文件名保持一致):
assist,point
1.658985,4.285136
-3.453687,3.424321
4.838138,-1.151539
5.379713,-3.362104
0.972546,2.924086
-3.567919,1.531611
0.450614,-3.302219
-3.487105,-1.724432
2.668759,1.594842
-3.156485,3.191137
3.165506,-3.999838
-2.786837,-3.099354
4.208187,2.984927
-2.123337,2.943366
0.704199,-0.479481
-0.39237,-3.963704
2.831667,1.574018
-0.790153,3.343114
2.943496,-3.357075
-3.195883,-2.283926
2.336445,2.875106
-1.786345,2.554248
2.190101,-1.90602
-3.403367,-2.778288
1.778124,3.880832
-1.688346,2.230267
2.592976,-2.054368
-4.007257,-3.207066
2.257734,3.387564
-2.679011,0.785119
0.939512,-4.023563
-3.674424,-2.261084
2.046259,2.735279
-3.18947,1.780269
4.372646,-0.822248
-2.579316,-3.497576
1.889034,5.1904
-0.798747,2.185588
2.83652,-2.658556
-3.837877,-3.253815
2.096701,3.886007
-2.709034,2.923887
3.367037,-3.184789
-2.121479,-4.232586
2.329546,3.179764
-3.284816,3.273099
3.091414,-3.815232
-3.762093,-2.432191
3.54056,2.778832
-1.736822,4.241041
2.127073,-2.98368
-4.323818,-3.938116
3.792121,5.135768
-4.786473,3.358547
2.624081,-3.260715
-4.009299,-2.978115
2.493525,1.96371
-2.513661,2.642162
1.864375,-3.176309
-3.171184,-3.572452
2.89422,2.489128
-2.562539,2.884438
3.491078,-3.947487
-2.565729,-2.012114
3.332948,3.983102
-1.616805,3.573188
2.280615,-2.559444
-2.651229,-3.103198
2.321395,3.154987
-1.685703,2.939697
3.031012,-3.620252
-4.599622,-2.185829
4.196223,1.126677
-2.133863,3.093686
4.668892,-2.562705
-2.793241,-2.149706
2.884105,3.043438
-2.967647,2.848696
4.479332,-1.764772
-4.905566,-2.91107
test_kmeans.py程序:
import pandas as pd
from kmeans import *

## step 1: load data
# NOTE(review): hard-coded absolute Windows path — consider making this
# a command-line argument or a path relative to the script.
print("step 1: load data...")
dataSet = pd.read_csv('I:/python_MachineLearning/kmeans/testSet1.csv')
print(dataSet)

## step 2: clustering
print("step 2: clustering...")
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)
print(centroids)
print(clusterAssment)

## step 3: show the result
print("step 3: show the result...")
showCluster(dataSet, k, centroids, clusterAssment)
在test_kmeans.py程序中调用kmeans.py程序,得到聚类结果如下图所示: