在构成圆形的30000个随机样本点上,设置7个簇,使用K-Means算法聚类
from math import pi, sin, cos from collections import namedtuple from random import random, choice from copy import copy import matplotlib.pyplot as plt import numpy as np import sys try: import psyco psyco.full() except ImportError: pass FLOAT_MAX = 1e100 class Point: __slots__ = ["x", "y", "group"] def __init__(self, x=0.0, y=0.0, group=1): self.x, self.y, self.group = x, y, group def generate_points(npoints, radius): points = [Point() for _ in range(npoints)] # note: this is not a uniform 2-d distribution for p in points: r = random() * radius ang = random() * 2 * pi p.x = r * cos(ang) p.y = r * sin(ang) return points def nearest_cluster_center(point, cluster_centers): """Distance and index of the closest cluster center""" def sqr_distance_2D(a, b): return (a.x - b.x) ** 2 + (a.y - b.y) ** 2 min_index = point.group min_dist = FLOAT_MAX for i, cc in enumerate(cluster_centers): d = sqr_distance_2D(cc, point) if min_dist > d: min_dist = d min_index = i return (min_index, min_dist) def kpp(points, cluster_centers): cluster_centers[0] = copy(choice(points)) d = [0.0 for _ in range(len(points))] for i in range(1, len(cluster_centers)): sum = 0 for j, p in enumerate(points): d[j] = nearest_cluster_center(p, cluster_centers[:i])[1] sum += d[j] sum *= random() for j, di in enumerate(d): sum -= di if sum > 0: continue cluster_centers[i] = copy(points[j]) break for p in points: p.group = nearest_cluster_center(p, cluster_centers)[0] def lloyd(points, nclusters): cluster_centers = [Point() for _ in range(nclusters)] # call k++ init kpp(points, cluster_centers) lenpts10 = len(points) >> 10 changed = 0 while True: # group element for centroids are used as counters for cc in cluster_centers: cc.x = 0 cc.y = 0 cc.group = 0 for p in points: cluster_centers[p.group].group += 1 cluster_centers[p.group].x += p.x cluster_centers[p.group].y += p.y for cc in cluster_centers: cc.x /= cc.group cc.y /= cc.group # find closest centroid of each PointPtr changed = 0 for p in points: min_i = nearest_cluster_center(p, cluster_centers)[0] if min_i != p.group: changed += 1 p.group = min_i # stop when 99.9% of points are good if changed <= lenpts10: break for i, cc in enumerate(cluster_centers): cc.group = i return cluster_centers def kmeans(points, nclusters): cluster_centers = [copy(choice(points)) for _ in xrange(nclusters)] lenpts10 = len(points) >> 10 changed = 0 while True: for p in points: cluster_centers[p.group].group += 1 cluster_centers[p.group].x += p.x cluster_centers[p.group].y += p.y for cc in cluster_centers: #print cc.x,cc.y,cc.group cc.x /= cc.group cc.y /= cc.group changed = 0 for p in points: min_i = nearest_cluster_center(p, cluster_centers)[0] if min_i != p.group: changed += 1 p.group = min_i # stop when 99.9% of points are good if changed <= lenpts10: break for i, cc in enumerate(cluster_centers): cc.group = i return cluster_centers def print_eps(points, cluster_centers): c = np.random.rand(len(cluster_centers)+1,3) for i, cc in enumerate(cluster_centers): plot_x = [] plot_y = [] print (cc.x,cc.y,cc.group) for p in points: if p.group != i: continue plot_x.append(p.x) plot_y.append(p.y) plt.scatter(plot_x,plot_y,color=c[i],s=1) plt.scatter(cc.x,cc.y, color=c[len(c)-i-1], marker='^',s=50) plt.show() def main(): npoints = 30000 k = 7 # # clusters points = generate_points(npoints, 10) cluster_centers = lloyd(points, k) #cluster_centers = kmeans(points,k) print_eps(points, cluster_centers) main()
4.752567342690781 -4.334890736846155 0
-1.3573754262792683 6.986665631867061 1
-0.15969173199245487 2.506935816564994 2
-6.2690747305026315 0.7728991240509486 3
-2.713392739613833 -5.828585413438269 4
5.771515155816214 2.91539616422218 5
0.07082712668661448 -0.7504142150100154 6