import time

import matplotlib.pyplot as plt
import numpy as np
from numpy import *
def e_distance(v1, v2):
    """Return the Euclidean distance between two points.

    Args:
        v1: Coordinates of the first point (array, matrix row, or sequence
            of numbers).
        v2: Coordinates of the second point; must broadcast against ``v1``.

    Returns:
        The square root of the sum of squared coordinate differences,
        as a NumPy floating-point scalar.
    """
    # Explicit ``np.`` calls avoid depending on ``from numpy import *``
    # having replaced the builtin ``sum`` with NumPy's version.
    diff = np.asarray(v2) - np.asarray(v1)
    return np.sqrt(np.sum(np.power(diff, 2)))
def initcentroids(dataset, k):
    """Pick ``k`` rows of ``dataset`` at random as the initial centroids.

    Args:
        dataset: 2-D array or matrix with one sample per row.
        k: Number of centroids to pick.

    Returns:
        A ``(k, dim)`` float ndarray whose rows are copies of randomly
        chosen samples.

    Raises:
        ValueError: If ``dataset`` has no rows but ``k`` is positive.
    """
    # ``shape`` is an attribute, not a method; calling it raises TypeError.
    numsamples, dim = dataset.shape
    if numsamples == 0 and k > 0:
        raise ValueError("cannot pick centroids from an empty dataset")
    centroids = np.zeros((k, dim))
    # Draw distinct rows whenever there are enough samples, so two centroids
    # never start on the same sample (which would leave a cluster empty).
    # Fall back to sampling with replacement only when k exceeds the sample
    # count.
    indices = np.random.choice(numsamples, k, replace=k > numsamples)
    for i, index in enumerate(indices):
        centroids[i, :] = dataset[index, :]
    return centroids
def kmeans(dataset, k):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    The loop alternates between assigning every sample to its nearest
    centroid and moving each centroid to the mean of its assigned samples.
    It stops once a full pass changes no assignment.

    Args:
        dataset: 2-D array or matrix with one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, clu_ass)``. ``centroids`` is a ``(k, dim)``
        ndarray. ``clu_ass`` is a ``(numsamples, 2)`` matrix whose first
        column holds each sample's cluster index and whose second column
        holds the squared distance to that cluster's centroid.
    """
    numsamples = dataset.shape[0]
    # Column 0: assigned cluster index; column 1: squared distance to it.
    clu_ass = np.asmatrix(np.zeros((numsamples, 2)))
    # Start from an impossible cluster id so the first pass always registers
    # as a change, even for samples whose nearest centroid is cluster 0.
    clu_ass[:, 0] = -1
    centroids = initcentroids(dataset, k)  # initial centroids
    clu_change = True
    while clu_change:
        clu_change = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(numsamples):
            # Infinity instead of a fixed large number, so far-apart data
            # is still assigned correctly.
            min_dis = np.inf
            min_index = 0
            for j in range(k):
                distance = e_distance(centroids[j, :], dataset[i, :])
                if distance < min_dis:
                    min_dis = distance
                    min_index = j
            if clu_ass[i, 0] != min_index:
                clu_change = True
            clu_ass[i, :] = min_index, min_dis ** 2
        # Update step: move each centroid to the mean of its members.
        labels = np.asarray(clu_ass[:, 0]).ravel()
        for j in range(k):
            # Row indices of the samples currently assigned to cluster j.
            member_rows = np.nonzero(labels == j)[0]
            point_in_clu = dataset[member_rows]
            # An empty cluster keeps its previous centroid; the mean of an
            # empty selection would turn the centroid into NaN.
            if point_in_clu.shape[0] > 0:
                centroids[j, :] = np.mean(point_in_clu, axis=0)
    print('mission complete')
    return centroids, clu_ass
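# NOTE: There is no code yet for running the algorithm on data or for
# visualizing the results; data visualization will be covered separately
# in a later note.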