# coding:utf-8
import time

import matplotlib.pyplot as plt
from numpy import *
# calculate Euclidean distance
def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two equally shaped vectors.

    Computes sqrt(sum((vector1 - vector2) ** 2)) over every element.

    Args:
        vector1: array-like (list, ndarray or single-row matrix).
        vector2: array-like with the same number of elements.

    Returns:
        The distance as a non-negative scalar.
    """
    # asarray() lets plain lists through and turns a 1 x n matrix into a
    # regular 2-D array, so `** 2` is always element-wise.
    difference = asarray(vector1) - asarray(vector2)
    # Use the ndarray method rather than a bare sum(): the result then does
    # not depend on whether the builtin or numpy's sum is in scope.
    return sqrt((difference ** 2).sum())
# Euclidean distance in 2-D: rho = sqrt((x1 - x2)^2 + (y1 - y2)^2),
# i.e. square the element-wise difference, sum it, then take the root.

# init centroids with random samples
def initCentroids(dataSet, k):
    """Pick k rows of dataSet at random to serve as the initial centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, dim).
        k: number of centroids to draw.

    Returns:
        A (k, dim) float ndarray whose rows are copies of sampled rows.

    Raises:
        ValueError: if dataSet has no rows while k is positive.
    """
    samples = asarray(dataSet)
    # numSamples is the number of rows, dim the number of columns.
    numSamples, dim = samples.shape
    if numSamples == 0 and k > 0:
        raise ValueError("cannot draw centroids from an empty data set")

    if k <= numSamples:
        # Draw without replacement: two identical starting centroids would
        # leave one cluster empty after the first assignment pass.
        chosenRows = random.permutation(numSamples)[:k]
    else:
        # More centroids requested than samples exist; repeats are
        # unavoidable, so fall back to sampling with replacement.
        chosenRows = random.randint(0, numSamples, size=k)

    # Start from a k x dim zero array so the result is always float-typed.
    centroids = zeros((k, dim))
    centroids[:, :] = samples[chosenRows, :]
    return centroids
# k-means cluster
def kmeans(dataSet, l):
    """Cluster the rows of dataSet into `l` groups with Lloyd's k-means.

    Args:
        dataSet: 2-D matrix/array of shape (numSamples, dim).
        l: number of clusters.

    Returns:
        A tuple (centroids, clusterAssment):
        centroids is an (l, dim) ndarray of cluster centres;
        clusterAssment is a (numSamples, 2) matrix whose first column holds
        the index of the cluster each sample belongs to and whose second
        column holds the squared distance from the sample to that centroid.
    """
    # The body historically read a module-level `k`; bind it to the
    # parameter so the function honours the value it is actually given.
    k = l
    numSamples = dataSet.shape[0]

    # zeros() builds the array and asmatrix() wraps it as a matrix, so callers
    # can keep indexing the result as clusterAssment[i, 0]. asmatrix is used
    # because the `mat` alias is not available in NumPy 2.
    clusterAssment = asmatrix(zeros((numSamples, 2)))
    clusterChanged = True

    # step 1: init centroids
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False

        # for each sample
        for i in range(numSamples):
            # Start from infinity: a finite sentinel would wrongly win when
            # every real distance is larger than it.
            minDist = float('inf')
            minIndex = 0

            # step 2: find the centroid that is closest
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j

            # step 3: update its cluster
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Refresh the stored error on every pass: centroids move between
            # iterations even when the sample keeps its cluster.
            clusterAssment[i, :] = minIndex, minDist ** 2

        # step 4: update centroids
        for j in range(k):
            # nonzero() returns the indices of the non-zero (here: True)
            # entries; [0] keeps the row indices of samples in cluster j.
            memberRows = nonzero(clusterAssment[:, 0].A == j)[0]
            if len(memberRows) == 0:
                # An empty cluster has no mean; keep the previous centroid
                # instead of overwriting it with NaN.
                continue
            pointsInCluster = dataSet[memberRows]
            centroids[j, :] = mean(pointsInCluster, axis=0)

    print('Congratulatons, cluster complete!')
    return centroids, clusterAssment
# show your cluster; only available with 2-D data
def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot every sample and every centroid of a 2-D clustering result.

    Args:
        dataSet: 2-D matrix/array of shape (numSamples, 2).
        k: number of clusters.
        centroids: (k, 2) array of cluster centres.
        clusterAssment: either a (numSamples, 2) matrix whose first column
            is the cluster index, or a 1-D sequence of cluster indices.

    Returns:
        1 when nothing can be drawn (data is not 2-D, or k exceeds the number
        of available marker styles); None after the figure has been shown.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry your k is too large! Please contact Zouxy")
        return 1

    # Accept both layouts: take column 0 of a two-column assignment matrix,
    # or use a flat label vector as it is.
    labels = asarray(clusterAssment)
    if labels.ndim > 1:
        labels = labels[:, 0]

    # draw all samples
    for i in range(numSamples):
        markIndex = int(labels[i])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)

    plt.show()
# plt.savefig('foo1.png')
# from numpy import *
# import time
# import matplotlib.pyplot as plt

## step 1: load data
print("step 1: load data...")
dataSet = []
# The context manager closes the file even if a line fails to parse.
with open('/home/amos/machine_learning') as fileIn:
    for line in fileIn:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        # Each record is two tab-separated numeric columns.
        lineArr = stripped.split('\t')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print("step 2: clustering...")
# asmatrix is used because the `mat` alias is not available in NumPy 2.
dataSet = asmatrix(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)

## step 3: show the result
print("step 3: show the result...")
showCluster(dataSet, k, centroids, clusterAssment)
# "step 2" can alternatively be done by calling the official library's
# sklearn.cluster.KMeans directly.
# KMeans was used here without ever being imported.
from sklearn.cluster import KMeans

print("step 2: clustering...")
# asmatrix is used because the `mat` alias is not available in NumPy 2.
dataSet = asmatrix(dataSet)
k = 4
############################################################
# by importing KMeans from sklearn.cluster
# call KMeans in place of the hand-written kmeans.py implementation
# NOTE(review): the fitted model is bound to `kmeansModel`, not `kmeans`,
# so it does not overwrite a function of that name.
# A plain ndarray is passed because recent scikit-learn releases reject
# numpy.matrix input.
kmeansModel = KMeans(n_clusters=k, random_state=0).fit(asarray(dataSet))
centroids = kmeansModel.cluster_centers_
clusterAssment = kmeansModel.labels_
############################################################
# In "step 3", labels_ is a 1-D array, so inside
# showCluster(dataSet, k, centroids, clusterAssment) the "draw all samples"
# part has to index it as markIndex = int(clusterAssment[i]) instead of
# markIndex = int(clusterAssment[i, 0]).