转载自http://nathanlvzs.github.io/blog/Clustering-KMeans.html
实现代码基本参考《K-Means聚类及其Python实现》一文,中间加入了一些我对距离矩阵计算的理解。把源码自己研究一遍,可以逐渐掌握用 Python(NumPy)进行矩阵运算,其中用到的函数包括:
import numpy as np
np.sum() # axis=0 沿行方向求和(得到每列之和), axis=1 沿列方向求和(得到每行之和)
np.outer()
np.dot()
np.mean()
np.square()
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 19 22:23:34 2016
@author: soso
"""
import numpy as np
import matplotlib.pyplot as plt
from numpy import array
import matplotlib.cm as cm
import matplotlib
#%matplotlib inline
# Synthetic demo data: three 2-D Gaussian blobs sharing an identity covariance.
mean1 = (0, 8)
mean2 = (5, 5)
mean3 = (1, -1)
cov = [[1, 0], [0, 1]]

# Each xi has shape (n_i, 2); yi holds the matching ground-truth blob label.
# The label lists are sized from the sample arrays so the counts live in
# exactly one place.
x1 = np.random.multivariate_normal(mean1, cov, 20)
# Tiny hand-checkable alternative: x1 = array([[1, 1], [2, 2]])
y1 = [0] * len(x1)
x2 = np.random.multivariate_normal(mean2, cov, 30)
# Tiny hand-checkable alternative: x2 = array([[1, 0], [2, 0], [3, 0]])
y2 = [1] * len(x2)
x3 = np.random.multivariate_normal(mean3, cov, 20)
# Tiny hand-checkable alternative: x3 = array([[0, 1], [0, 2]])
y3 = [2] * len(x3)

# Stack the blobs into one (70, 2) sample matrix.
x = np.concatenate((x1, x2, x3), axis=0)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("x:")
print(x)

y = y1 + y2 + y3
# One rainbow colour per ground-truth label.
colors = cm.rainbow(np.linspace(0, 1, 3))
color_label = [colors[label] for label in y]
# Uncomment to view the ground-truth labelling:
# plt.scatter(x[:, 0], x[:, 1], c=color_label)
# plt.show()
class kmeansclustering:
    """Lloyd-style k-means clustering with optional per-iteration plotting.

    Attributes:
        data: (N, d) float array of samples.
        k: number of clusters.
        maiter: maximum number of assignment/update iterations.
        epsilon: stop once the cost decreases by less than this amount.
        N: number of samples.
        colors: one rainbow colour per cluster, used by ``draw``.
        classess: length-N integer array with the cluster index of each sample.
        center: (k, d) float array of current centroids.
    """

    def __init__(self, data, k, maiter=100, epsilon=1e-12, center=None):
        """Store the samples and set up the initial centroids.

        Args:
            data: array-like of shape (N, d).
            k: number of clusters.
            maiter: maximum number of iterations run by ``kmeans``.
            epsilon: minimum cost improvement required to keep iterating.
            center: optional (k, d) array-like of initial centroids.  When
                omitted, centroid ``c`` starts at ``(c, c, ..., c)``, which
                for ``k=3`` in 2-D is the historical start
                ``[[0, 0], [1, 1], [2, 2]]``.  For real use pass something
                data-driven, e.g.
                ``data[np.random.choice(len(data), k, replace=False)]``.
        """
        self.data = np.asarray(data, dtype=float)
        self.k = k
        self.maiter = maiter
        self.epsilon = epsilon
        self.N = len(self.data)
        self.colors = cm.rainbow(np.linspace(0, 1, k))
        self.classess = np.zeros(self.N, dtype=int)
        if center is None:
            # Fall back to 2 columns when the data has no second axis
            # (e.g. an empty list), matching the historical 2-D default.
            dim = self.data.shape[1] if self.data.ndim > 1 else 2
            center = np.outer(np.arange(k), np.ones(dim))
        # Centroids must be floating point: an integer array would silently
        # truncate every mean written into it.
        self.center = np.array(center, dtype=float)

    def getdismat(self):
        """Return the (N, k) matrix of squared Euclidean distances.

        Uses the expansion ``|x - c|^2 = |x|^2 - 2 x.c + |c|^2`` so the whole
        matrix comes from one matrix product plus two broadcast additions.
        Entry ``[i, j]`` is the squared distance from sample ``i`` to
        centroid ``j``.
        """
        # Per-sample and per-centroid sums of squared coordinates.
        data_sq = np.sum(self.data * self.data, axis=1)
        cent_sq = np.sum(self.center * self.center, axis=1)
        # (N, d) . (d, k) -> (N, k) inner products x_i . c_j.
        cross = np.dot(self.data, self.center.T)
        # Column vector + row vector broadcast to the full (N, k) grid.
        return data_sq[:, np.newaxis] - 2 * cross + cent_sq[np.newaxis, :]

    def cal_cost(self):
        """Return the sum of squared distances of samples to their centroid."""
        if self.N == 0:
            return 0
        # Fancy indexing picks, for every sample, the centroid it belongs to.
        residual = self.data - self.center[self.classess]
        return np.sum(np.square(residual))

    def kmeans(self, plot=True):
        """Alternate assignment and centroid updates until convergence.

        Stops after ``maiter`` iterations, or as soon as one iteration lowers
        the cost by less than ``epsilon``.  Results are left in
        ``self.classess`` and ``self.center``.

        Args:
            plot: when true, draw the initial state and the state after
                every iteration that did not trigger the early stop.
        """
        numiter = 0
        pre_cost = self.cal_cost()
        if plot:
            self.draw()
        while numiter < self.maiter:
            # Assignment step: nearest centroid for every sample.
            distmat = self.getdismat()
            self.classess = np.argmin(distmat, axis=1)

            # Update step: build a fresh float array so means are never
            # truncated, whatever dtype the previous centroids had.
            new_center = np.array(self.center, dtype=float)
            for c in range(self.k):
                members = self.data[self.classess == c]
                # An empty cluster keeps its previous centroid; the mean of
                # zero rows would be NaN and poison later distances.
                if len(members):
                    new_center[c] = np.mean(members, axis=0)
            self.center = new_center

            now_cost = self.cal_cost()
            if pre_cost - now_cost < self.epsilon:
                print("break befor maxiter....")
                break
            if plot:
                self.draw()
            pre_cost = now_cost
            numiter += 1

    def draw(self, plotcen=True):
        """Scatter-plot the first two data columns coloured by cluster.

        Args:
            plotcen: when true, also draw the centroids as large circles.
        """
        plt.figure(figsize=(10, 10), facecolor='white')
        colors_data = [self.colors[c] for c in self.classess]
        plt.scatter(self.data[:, 0], self.data[:, 1], color=colors_data,
                    marker=',', alpha=0.9, s=80)
        plt.axis('equal')
        if plotcen:
            plt.scatter(self.center[:, 0], self.center[:, 1], marker='o',
                        color=self.colors, s=200)
        plt.show()
# Demo run: cluster the synthetic samples into three groups.
kmeans = kmeansclustering(data=x, k=3)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
# Squared distance from every sample to every initial centroid.
print(kmeans.getdismat())
# Cost before any iteration: every sample still belongs to cluster 0.
print(kmeans.cal_cost())
# Iterate to convergence, plotting the state after each step.
kmeans.kmeans()
参考: