Python 之 k-means++ 算法

K-means++是一种改进的K-means聚类算法,旨在解决初始中心选择的问题,提高聚类效果。本文介绍了K-means++的基本原理,并提供了一个Python实现示例。
摘要由CSDN通过智能技术生成

【摘要】 K-means 是一种无监督聚类算法(unsupervised clustering algorithm),它简单快速,运算复杂度为 O(n)。但是,该算法的有效性通常受到初始聚类中心点选取的影响。虽然学术界已经提出了很多用来改进初始聚类中心点选取的方法,但是受数据集的影响,其效果往往并不理想。因此,k-means 聚类初始中心点的选取一直被认为是一个 NP-hard 问题。2007年,David Arthur 和 Sergei Vassilvitskii 提出了一种近似选取聚类初始中心点的算法 k-means++,该方法在概率上被证明具有很好的鲁棒性。详见 K-means++。

import numpy

import math

import random

import collections

#

# Supports clustering of multi-dimensional vectors

#

class Kmean:
    """Minimal interface for the k-means clusterers in this file.

    Both methods are intentional no-ops that return ``None``; a concrete
    clusterer is expected to override them.
    """

    def init_centers(self):
        """Choose the initial cluster centers (does nothing here)."""
        pass

    def do_cluster(self, mintoler, numiter):
        """Run the clustering loop (does nothing here).

        Args:
            mintoler: Unused by this base implementation; presumably a
                convergence tolerance -- confirm with the overriding class.
            numiter: Unused by this base implementation; presumably the
                maximum number of iterations.

        NOTE(review): ``Kmeanplusplus.do_cluster`` in this file takes only
        ``numiter``, so the two signatures are not interchangeable.
        """
        pass

class Point:
    """A data vector together with its current cluster assignment.

    Public attributes:
      data    -- the coordinate sequence given to the constructor
      cluster -- the cluster object this point currently belongs to, or None
      dist    -- cached distance to that cluster's center
    """

    def __init__(self, data):
        """Store *data*; the point starts out unassigned."""
        self.data = data
        self.cluster = None
        # Large sentinel used until a real distance has been measured.
        self.dist = 1e5

    def compute_euclidean_distance(self, center):
        """Return the Euclidean distance between this point and *center*.

        Args:
            center: A sequence of coordinates with the same length as
                ``self.data``.

        Returns:
            The distance as a Python float.

        Raises:
            ValueError: If the two vectors have different lengths.  (A silent
                ``None`` result would otherwise leak into later arithmetic
                and comparisons.)
        """
        if len(self.data) != len(center):
            raise ValueError("mismatch dimension!")
        # Work in float so integer inputs can neither overflow nor truncate.
        diff = numpy.asarray(self.data, dtype=float) - numpy.asarray(center, dtype=float)
        return math.sqrt(float(numpy.dot(diff, diff)))

    def set_cluster(self, c, d):
        """Move this point into cluster *c* and cache its distance *d*.

        The point is first removed from its previous cluster (if any) via
        ``remove_point`` and then registered with *c* via ``add_point``.
        """
        if self.cluster is not None:
            self.cluster.remove_point(self)
        self.cluster = c
        self.cluster.add_point(self)
        self.dist = d

    def get_dist(self):
        """Return the cached distance to the assigned cluster center."""
        return self.dist

    def set_dist(self, d):
        """Overwrite the cached distance with *d*."""
        self.dist = d

class Cluster:
    """A cluster: a center vector plus the set of points assigned to it.

    Public attributes:
      center -- current center coordinates
      points -- mapping whose keys are the member points (values are 1)
    """

    def __init__(self, center):
        """Create an empty cluster located at *center*."""
        self.center = center
        # Kept as a defaultdict so external ``points[p]`` reads still yield 0
        # for non-members, as before.
        self.points = collections.defaultdict(int)

    def add_point(self, p):
        """Register *p* as a member of this cluster."""
        self.points[p] = 1

    def remove_point(self, p):
        """Remove *p* from this cluster; a non-member is ignored.

        ``pop`` is used instead of indexing because reading a missing key
        from a defaultdict inserts it, which would add a phantom member and
        skew both ``len(self.points)`` and the next center update.
        """
        self.points.pop(p, None)

    def update_center(self):
        """Move the center to the mean of the member points' ``data``.

        An empty cluster keeps its current center.  The mean is taken in
        floating point so integer-valued data is not truncated.
        """
        if not self.points:
            return
        stacked = numpy.asarray([p.data for p in self.points], dtype=float)
        self.center = numpy.mean(stacked, axis=0)

class Kmeanplusplus(Kmean):
    """k-means clustering seeded with the k-means++ (D^2-weighted) scheme.

    Public attributes:
      dataset    -- the raw data vectors
      numPoint   -- number of vectors taken from ``dataset``
      numCluster -- requested number of clusters
      points     -- one ``Point`` per vector
      clusters   -- the ``Cluster`` objects, filled by ``init_centers``
    """

    def __init__(self, ds, numdata, numCluster):
        """Wrap the first *numdata* vectors of *ds* as ``Point`` objects.

        Args:
            ds: Indexable collection of coordinate vectors.
            numdata: How many vectors of *ds* to cluster.
            numCluster: How many clusters to build.
        """
        self.dataset = ds
        self.numPoint = numdata
        self.numCluster = numCluster
        self.points = [Point(ds[i]) for i in range(numdata)]
        self.clusters = []

    @staticmethod
    def _pick_weighted_index(weights, total):
        """Draw an index with probability ``weights[i] / total``.

        Zero-weight entries are never selected.  If rounding keeps the
        running sum from passing the threshold, the last positive-weight
        index is returned.
        """
        threshold = random.random() * total
        running = 0.0
        chosen = -1
        for idx, weight in enumerate(weights):
            if weight <= 0.0:
                continue
            chosen = idx
            running += weight
            if running > threshold:
                break
        return chosen

    def init_centers(self):
        """Select ``numCluster`` initial centers using k-means++ seeding.

        The first center is a uniformly random data point.  Each further
        center is a data point drawn with probability proportional to its
        squared distance from the nearest center chosen so far.  Every point
        ends up assigned to its nearest seed, with that distance cached.

        Any clusters from a previous call are discarded first.  With no data
        points the method returns without creating clusters.
        """
        self.clusters = []
        if self.numPoint <= 0:
            return

        # First center: uniform random pick; every point starts out in it.
        seed_index = random.randrange(self.numPoint)
        newest = Cluster(self.dataset[seed_index])
        self.clusters.append(newest)
        for point in self.points:
            point.set_cluster(newest, point.compute_euclidean_distance(newest.center))

        while len(self.clusters) < self.numCluster:
            # D^2 weighting: points sitting on an existing center weigh 0.
            weights = [p.get_dist() ** 2 for p in self.points]
            total = sum(weights)
            if total > 0.0:
                index = self._pick_weighted_index(weights, total)
            else:
                # Every point coincides with a center already; fall back to
                # a uniform pick so the requested cluster count is still met.
                index = random.randrange(self.numPoint)

            newest = Cluster(self.dataset[index])
            self.clusters.append(newest)

            # Re-home every point that is closer to the newest center.
            for point in self.points:
                dist = point.compute_euclidean_distance(newest.center)
                if dist < point.get_dist():
                    point.set_cluster(newest, dist)

    def assign_point_cluster(self, point):
        """Assign *point* to its nearest cluster center.

        Distances are recomputed against the current centers; the cached
        distance is not trusted because centers move between iterations.
        On a tie the point stays in its current cluster.

        Returns:
            1 if the point moved to a different cluster, otherwise 0.
        """
        current = point.cluster
        if current is not None:
            best = current
            best_dist = point.compute_euclidean_distance(current.center)
        else:
            best = None
            best_dist = float("inf")

        for candidate in self.clusters:
            if candidate is current:
                continue
            dist = point.compute_euclidean_distance(candidate.center)
            if dist < best_dist:
                best, best_dist = candidate, dist

        if best is None:
            return 0
        if best is current:
            # Same cluster, but refresh the cached distance to the moved center.
            point.set_dist(best_dist)
            return 0
        point.set_cluster(best, best_dist)
        return 1

    def do_cluster(self, numiter):
        """Seed the centers, then iterate until stable or *numiter* rounds.

        Args:
            numiter: Maximum number of update/assign rounds.
        """
        self.init_centers()

        for _ in range(numiter):
            # M step: move each center to the mean of its members.
            for cluster in self.clusters:
                cluster.update_center()

            # E step: reassign every point to its nearest center.
            moved = 0
            for point in self.points:
                moved += self.assign_point_cluster(point)

            if moved == 0:
                break

if __name__ == "__main__":
    # Smoke test: cluster two well-separated 2-D groups into two clusters
    # and print each resulting center with its member count.
    demo = [
        [1.0, 1.0], [1.2, 0.8], [0.9, 1.1],
        [8.0, 8.0], [8.2, 7.9], [7.8, 8.1],
    ]
    model = Kmeanplusplus(demo, len(demo), 2)
    model.do_cluster(10)
    for cluster in model.clusters:
        print("center=%s size=%d" % (list(cluster.center), len(cluster.points)))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值