好久没更新了,这一阵在学python和数据挖掘,晚上下班无聊用python实现了一下kmeans,发上来占个坑吧。
这个版本支持对多个不同维度的向量聚类,但是其坐标必须是数字。由于本人刚接触python,对其特性的理解还很欠缺,代码风格也可能还有很重的java和c的痕迹,还望大伙多多指正。
下面这个是对向量类的定义:
import math
class Point:
    """A vector of numeric coordinates stored as a sequence in ``self.point``.

    Points of different dimensionality may be mixed: they are treated as
    being extremely far apart (see ``dist``) and their sum carries
    ``MAX_DIST`` in every coordinate that only one operand has.
    """

    # Sentinel "very large" value used when two points have different sizes.
    MAX_DIST = 1 << 60

    def __init__(self, v):
        """Store the coordinate sequence ``v`` (normally a tuple of numbers)."""
        self.point = v

    def dist(self, other):
        """Return the Euclidean distance between this point and ``other``.

        When the two points have a different number of coordinates the
        result is ``MAX_DIST`` multiplied by the larger coordinate count.
        """
        pa = self.point
        pb = other.point
        if len(pa) != len(pb):
            return Point.MAX_DIST * max(len(pa), len(pb))
        return math.sqrt(sum((x - y) * (x - y) for x, y in zip(pa, pb)))

    def __add__(self, other):
        """Return the coordinate-wise sum as a new ``Point``.

        Coordinates present in only one operand are replaced by
        ``MAX_DIST`` so the result has the length of the longer operand.
        """
        pa = self.point
        pb = other.point
        # zip() stops at the shorter operand, i.e. at the shared coordinates.
        res = [x + y for x, y in zip(pa, pb)]
        res.extend([Point.MAX_DIST] * abs(len(pa) - len(pb)))
        return Point(tuple(res))

    def __div__(self, num):
        """Return a new ``Point`` with every coordinate divided by ``num``.

        The ``1.0 *`` factor forces float division even for int coordinates.
        """
        return Point(tuple(1.0 * n / num for n in self.point))

    # ``/`` dispatches to __truediv__ on Python 3 (and on Python 2 under
    # ``from __future__ import division``); without this alias ``p / n``
    # raises TypeError there.
    __truediv__ = __div__
if __name__ == '__main__':
a = Point((1,2,))
b = Point(tuple((3,4)))
print a.point,b.point
print (a+b).point
print a.dist(b)
print (a/2).pointkmeans算法的实现:
from point import Point
class KmeansAlgo:
    """Partition a list of point objects into ``K`` clusters (k-means).

    The points only need to support ``a.dist(b)``, ``a + b`` and
    ``a / number``. After ``repeat()`` returns, ``self.kinds`` holds one
    list of points per cluster.
    """

    def __init__(self, K, plist):
        """Create the initial clustering.

        The first ``K`` points each seed their own cluster and every
        remaining point starts out in cluster 0. If ``plist`` has fewer
        than ``K`` points a message is printed and ``self.kinds`` is left
        empty.
        """
        self.k = K
        self.plist = plist
        self.kinds = []
        if len(self.plist) < K:
            print('plist member too few!')
            return
        self.kinds = [[p] for p in self.plist[:K]]
        # Guard against K == 0, where there is no cluster 0 to fill.
        if self.kinds:
            self.kinds[0].extend(self.plist[K:])

    def adjust(self):
        """Run one reassignment pass and return how many points moved.

        Centroids are computed once from the current clusters; every point
        is then placed in the cluster whose centroid is strictly nearest,
        staying where it is on ties.
        """
        midpoints = []
        for sublist in self.kinds:
            if not sublist:
                # An empty cluster has no centroid; it can attract nothing.
                midpoints.append(None)
                continue
            # Start the sum from a real member so the centroid has the
            # members' own dimensionality instead of a fixed 2-D origin.
            m = sublist[0]
            for x in sublist[1:]:
                m = m + x
            midpoints.append(m / len(sublist))

        # Collect the new assignment separately: removing from a list while
        # iterating over it would skip elements.
        regrouped = [[] for _ in self.kinds]
        readjust = 0
        for ilist, sublist in enumerate(self.kinds):
            for x in sublist:
                idx = ilist
                minimal = midpoints[ilist].dist(x)
                for imid, mid in enumerate(midpoints):
                    if mid is None:
                        continue
                    tmpdist = x.dist(mid)
                    if tmpdist < minimal:
                        minimal = tmpdist
                        idx = imid
                regrouped[idx].append(x)
                if idx != ilist:
                    readjust += 1
        # Slice-assign so the outer list object referenced by callers stays
        # the same.
        self.kinds[:] = regrouped
        return readjust

    def repeat(self):
        """Call ``adjust()`` until no point moves, then print a summary.

        Prints the number of passes that moved at least one point followed
        by the move count of the final pass.
        """
        times = 0
        while True:
            readjust = self.adjust()
            if not readjust:
                break
            times += 1
        print('%s %s' % (times, readjust))
if __name__ == '__main__':
file = open('points.in','r')
plist = []
while True:
s = file.readline()
if len(s)==0:
break
points = s.split()
plist.append(Point(tuple([int(i) for i in points])))
km = KmeansAlgo(4,plist)
km.repeat()
for i in range(len(km.kinds)):
print 'kind ',i
for p in km.kinds[i]:
print p.point下面是个输入文件的样例:
1 1
0 1
1 0
-1 -1
-1 0
0 -1
100 101
101 100
99 100
100 99
1000 999
1001 999
1000 1000
999 999
1 2 3
2 3 4
2 3 100
1
2
1 2 3 4 5
**********************************over**************************************