Python 实现 K-means++ 算法

最新推荐文章于 2024-08-14 00:10:28 发布

gaocegege

最新推荐文章于 2024-08-14 00:10:28 发布

阅读量252

点赞数

文章标签： kmean python

本文链接：https://blog.csdn.net/weixin_36193690/article/details/111895324

版权

K-means++是一种改进的K-means聚类算法，旨在解决初始中心选择的问题，提高聚类效果。本文介绍了K-means++的基本原理，并提供了一个Python实现示例。

摘要由CSDN通过智能技术生成

【摘要】 K-means 是一种无监督聚类算法（unsupervised clustering algorithm），简单快速：在聚类数和维度固定时，每次迭代的运算复杂度为 O(n)。但是，该算法的聚类效果通常受初始聚类中心点的影响。虽然学术界已经提出了很多方法来改进初始聚类中心点的选取，但受数据集的影响，其效果往往并不理想；事实上，求 k-means 目标函数的全局最优解本身就是一个 NP-hard 问题。2007 年，David Arthur 和 Sergei Vassilvitskii 提出了一种选取初始聚类中心点的随机化近似算法 k-means++，该方法被证明在期望意义下具有 O(log k) 的近似比，鲁棒性较好。详见 K-means++

import numpy

import math

import random

import collections

# Supports clustering of multi-dimensional vectors

class Kmean:
    """Base interface for the clustering classes in this file.

    Both methods are placeholders: they do nothing and return ``None``.
    ``Kmeanplusplus`` overrides both of them (note that its ``do_cluster``
    takes only ``numiter``).
    """

    def init_centers(self):
        """Choose the initial cluster centers (no-op in the base class)."""
        pass

    def do_cluster(self, mintoler, numiter):
        """Run the clustering loop (no-op in the base class).

        ``mintoler`` and ``numiter`` are accepted but not used here.
        """
        pass

class Point:
    """A data vector together with its current cluster assignment.

    Attributes are set in ``__init__``:
      data    -- the coordinate vector passed in (kept by reference)
      cluster -- the cluster object this point belongs to, or None
      dist    -- the distance last recorded via set_cluster()/set_dist()
    """

    def __init__(self, data):
        """Store ``data`` and start out without a cluster."""
        self.data = data
        self.cluster = None
        # Large placeholder value; replaced as soon as set_cluster() or
        # set_dist() records a real distance.
        self.dist = 1e5

    def compute_euclidean_distance(self, center):
        """Return the Euclidean distance between this point and ``center``.

        Raises:
            ValueError: if ``center`` and the point's data differ in length.
                (Previously a message was printed and None was returned,
                which silently corrupted later distance comparisons.)
        """
        if len(self.data) != len(center):
            raise ValueError(
                "mismatch dimension! point has %d, center has %d"
                % (len(self.data), len(center))
            )
        diff = numpy.subtract(self.data, center)
        return math.sqrt(numpy.sum(numpy.square(diff)))

    def set_cluster(self, c, d):
        """Move this point into cluster ``c`` and record ``d`` as its distance.

        The point is first removed from its previous cluster (if any) via
        ``remove_point`` and then registered with ``c`` via ``add_point``.
        """
        if self.cluster is not None:
            self.cluster.remove_point(self)
        self.cluster = c
        self.cluster.add_point(self)
        self.dist = d

    def get_dist(self):
        """Return the distance currently recorded for this point."""
        return self.dist

    def set_dist(self, d):
        """Record ``d`` as this point's distance without changing its cluster."""
        self.dist = d

class Cluster:
    """A cluster: a center vector plus the points currently assigned to it.

    ``points`` maps each member point to 1.  Points are used as dictionary
    keys, so they must be hashable, and each must expose a ``data`` vector.
    """

    def __init__(self, center):
        """Create an empty cluster located at ``center``."""
        self.center = center
        # Still a defaultdict with a default of 0, as before.
        self.points = collections.defaultdict(int)

    def add_point(self, p):
        """Register ``p`` as a member of this cluster."""
        self.points[p] = 1

    def remove_point(self, p):
        """Remove ``p`` from this cluster; a non-member is ignored.

        ``pop`` is used instead of indexing: indexing a defaultdict with a
        missing key inserts that key, which used to add a phantom member and
        skew the next ``update_center``.
        """
        self.points.pop(p, None)

    def update_center(self):
        """Move the center to the mean of the member points' ``data``.

        The center is left unchanged when the cluster has no members.
        """
        # Skip entries whose flag is 0: those can only come from a stray
        # ``points[x]`` lookup on the defaultdict, not from add_point().
        members = [p.data for p, present in self.points.items() if present]
        if not members:
            return
        # numpy.mean always divides in floating point, also for integer data.
        self.center = numpy.mean(members, axis=0)

class Kmeanplusplus(Kmean):
    """K-means clustering whose initial centers are chosen by k-means++ seeding.

    Attributes:
      dataset    -- the indexable collection of data vectors passed in
      numPoint   -- number of leading entries of ``dataset`` that are clustered
      numCluster -- requested number of clusters
      points     -- one ``Point`` per clustered vector
      clusters   -- the ``Cluster`` objects, filled in by ``init_centers``
    """

    def __init__(self, ds, numdata, numCluster):
        """Wrap the first ``numdata`` entries of ``ds`` in ``Point`` objects."""
        self.dataset = ds
        self.numPoint = numdata
        self.numCluster = numCluster
        self.points = [Point(ds[i]) for i in range(self.numPoint)]
        self.clusters = []

    def init_centers(self):
        """Seed ``self.clusters`` with centers picked from the data points.

        The first center is a uniformly random point.  Every further center
        is a point drawn with probability proportional to its squared
        distance from the nearest center chosen so far.  While seeding,
        every point is kept assigned to its nearest chosen center.

        Does nothing (leaving ``self.clusters`` empty) when there are no
        points.  At least one center is created otherwise, as before.
        """
        # Start from scratch so repeated calls do not stack up old centers.
        self.clusters = []
        if self.numPoint <= 0:
            return

        seed_cluster = Cluster(self.dataset[random.randrange(self.numPoint)])
        self.clusters.append(seed_cluster)

        # Assign everything to the seed and total the squared distances.
        sq_dist_sum = 0.0
        for point in self.points:
            dist = point.compute_euclidean_distance(seed_cluster.center)
            point.set_cluster(seed_cluster, dist)
            sq_dist_sum += dist * dist

        while len(self.clusters) < self.numCluster:
            index = self._pick_next_center_index(sq_dist_sum)
            new_cluster = Cluster(self.dataset[index])
            self.clusters.append(new_cluster)

            # Re-home the points that are closer to the new center and
            # rebuild the total for the next draw.  The chosen point itself
            # ends up at distance 0, so it cannot be drawn again by weight.
            sq_dist_sum = 0.0
            for point in self.points:
                dist = point.compute_euclidean_distance(new_cluster.center)
                if dist < point.get_dist():
                    point.set_cluster(new_cluster, dist)
                sq_dist_sum += point.get_dist() ** 2

    def _pick_next_center_index(self, sq_dist_sum):
        """Return a point index drawn with weight ``get_dist() ** 2``.

        ``sq_dist_sum`` must be the sum of those weights over all points.
        """
        threshold = random.random() * sq_dist_sum
        running = 0.0
        last_positive = -1
        for i, point in enumerate(self.points):
            weight = point.get_dist() ** 2
            if weight <= 0:
                # Zero-weight points coincide with an existing center.
                continue
            last_positive = i
            running += weight
            if running >= threshold:
                # Stop at the FIRST index that crosses the threshold.
                return i
        if last_positive >= 0:
            # Only reachable through floating-point rounding of the sums.
            return last_positive
        # Every point coincides with a chosen center: any pick is equivalent.
        return random.randrange(self.numPoint)

    def assign_point_cluster(self, point):
        """Assign ``point`` to its nearest cluster center.

        The distance to the point's current cluster is recomputed first,
        because the cached value goes stale whenever ``update_center`` moves
        the centers.  On a tie the point stays where it is.

        Returns:
            1 if the point moved to a different cluster, otherwise 0.
        """
        current = point.cluster
        best = current
        if current is not None:
            best_dist = point.compute_euclidean_distance(current.center)
        else:
            best_dist = float("inf")

        for candidate in self.clusters:
            if candidate is current:
                continue
            dist = point.compute_euclidean_distance(candidate.center)
            if dist < best_dist:
                best = candidate
                best_dist = dist

        if best is None:
            # No cluster exists yet; nothing to assign to.
            return 0
        if best is not current:
            point.set_cluster(best, best_dist)
            return 1
        # Same cluster, but keep the cached distance in step with its center.
        point.set_dist(best_dist)
        return 0

    def do_cluster(self, numiter):
        """Seed the centers, then iterate k-means for at most ``numiter`` rounds.

        Each round first recomputes every center from its members and then
        reassigns every point.  The loop stops early once a round moves no
        point.
        """
        self.init_centers()

        for _ in range(numiter):
            # Update step: move each center to the mean of its members.
            for cluster in self.clusters:
                cluster.update_center()

            # Assignment step: attach each point to its nearest center.
            change = 0
            for point in self.points:
                change += self.assign_point_cluster(point)

            if change == 0:
                break

if __name__ == "__main__":
    # test code
    # NOTE(review): the test code that belonged here is not present in this
    # listing; `pass` keeps the guard syntactically valid until it is restored.
    pass

gaocegege

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫