import numpy as np
import matplotlib.pyplot as plt
import time
# Wall-clock start of the run; a second time.time() reading taken after the
# clustering finishes is compared against this to report the running time.
time1=time.time()
class Data(object):
    """One sample and the number of the cluster it is currently assigned to.

    The attributes are read and written directly by the surrounding script,
    so their names are part of the class's interface.
    """

    def __init__(self, vector, set_id):
        """Store the sample coordinates and its cluster number.

        vector -- the sample's coordinates, kept exactly as passed in.
        set_id -- the cluster number; the script passes -1 for a sample that
                  has not been assigned yet.
        """
        self._vector = vector
        self._set = set_id

    def __repr__(self):
        # Debug-friendly representation; not used by the clustering itself.
        return 'Data(vector=%r, set_id=%r)' % (self._vector, self._set)
class Store(object):
    """Holds every sample plus whatever extra buckets callers append.

    ``list[0]`` is the master list filled by :meth:`add`.  Callers may append
    further lists to ``list`` themselves (the script uses them as one bucket
    per cluster).  The attribute name shadows the builtin ``list`` but is
    kept because external code accesses it directly.
    """

    def __init__(self):
        # Start with just the (empty) master list at index 0.
        self.list = [[]]

    def add(self, data):
        """Append *data* to the master list ``list[0]``."""
        self.list[0].append(data)

    def num_of_store(self):
        """Return how many items have been added to the master list."""
        return len(self.list[0])
# Number of clusters to build.
k = 4
store = Store()

# Read the samples from the file named 'data': one sample per line, columns
# separated by tabs; only the first two columns are used.
inputs = []
with open('data', 'r') as data_file:  # 'with' guarantees the handle is closed
    for line in data_file:
        if not line.strip():
            # A blank line (typically a trailing newline at the end of the
            # file) carries no sample and would make float('') raise.
            continue
        fields = line.rstrip('\r\n').split('\t')
        inputs.append([float(fields[0]), float(fields[1])])

# Every sample starts out unassigned (-1) in the master list store.list[0].
for point in inputs:
    store.add(Data(point, -1))

# Make sure store.list has k buckets after the master list, so that
# store.list[c] (1 <= c <= k) can collect the vectors of cluster c.
while len(store.list) < k + 1:
    store.list.append([])

num = store.num_of_store()
# Pick k distinct samples at random to act as the initial cluster centres.
if num < k:
    # With fewer samples than clusters there are not enough distinct indices
    # to draw, so fail with a clear message instead of looping forever.
    raise ValueError('need at least %d samples to seed %d clusters, got %d'
                     % (k, k, num))

# Sampling without replacement yields k different positions in one call;
# .tolist() turns them into plain Python ints so they can index a list.
candidate_index = np.random.choice(num, size=k, replace=False).tolist()

# The Data objects chosen as initial centres, in cluster order.
k_mean = [store.list[0][index] for index in candidate_index]

# Scratch list of distances; refilled for every sample further down.
dist = []
# Put each randomly chosen centre into its own cluster (clusters are
# numbered from 1; index 0 of store.list is the master list).
for cluster_no, seed_pos in enumerate(candidate_index, 1):
    seed = store.list[0][seed_pos]
    seed._set = cluster_no
    store.list[cluster_no].append(seed._vector)

# Assign every remaining sample to the nearest of the chosen centres.
for position, sample in enumerate(store.list[0]):
    if position in candidate_index:
        # The centres themselves were already placed above.
        continue
    here = np.array(sample._vector)
    # Squared Euclidean distance from this sample to each centre.
    dist = [
        ((here - np.array(store.list[0][seed_pos]._vector)) ** 2).sum()
        for seed_pos in candidate_index
    ]
    # min_dist_index is the (1-based) number of the closest centre.
    min_dist_index = dist.index(min(dist)) + 1
    sample._set = min_dist_index
    # Record the sample's vector in that cluster's bucket.
    store.list[min_dist_index].append(sample._vector)
# Recompute each cluster's centre and move samples to their nearest centre,
# repeating until a full pass moves nothing.
flag = 1  # 1 while the last pass moved at least one sample, 0 once stable
iter_time = 0
candidate_mean_point = []

# The sample coordinates never change, so convert them to arrays once
# instead of on every pass.
sample_arrays = [np.array(data._vector) for data in store.list[0]]

while flag == 1:
    iter_time += 1
    flag = 0  # assume stable until a sample actually moves

    previous_mean_point = candidate_mean_point
    candidate_mean_point = []
    for i in range(k):
        members = store.list[i + 1]
        if members:
            candidate_mean_point.append(np.array(members).mean(0))
        elif previous_mean_point:
            # The cluster lost all its samples during the previous pass.
            # The mean of nothing is NaN and would corrupt the distance
            # comparison below, so keep the centre where it last was.
            candidate_mean_point.append(previous_mean_point[i])
        else:
            raise ValueError('cluster %d has no samples to compute a centre from'
                             % (i + 1))

    for data, vector in zip(store.list[0], sample_arrays):
        # Squared Euclidean distance to every current centre.
        dist = [((vector - centre) ** 2).sum() for centre in candidate_mean_point]
        prev_index = data._set
        cur_index = dist.index(min(dist)) + 1
        if prev_index != cur_index:  # the sample changes cluster
            flag = 1
            data._set = cur_index
            # Move the vector from the old cluster's bucket to the new one.
            store.list[cur_index].append(data._vector)
            store.list[prev_index].remove(data._vector)
# Print the members of every cluster, the number of passes and the elapsed
# wall-clock time.  Each print gets a single pre-formatted string so the
# statement means the same under Python 2 and Python 3.
for i, members in enumerate(store.list[1:], 1):
    print('第%s类为: %s' % (i, members))
print('iter_time: %s' % iter_time)
time2 = time.time()
print('Running time: %s' % (time2 - time1))
# Marker styles are generated rather than listed by hand so any number of
# clusters can be drawn.  The seven colours are the ones used before; white
# is left out because it cannot be seen on the default white axes.  When the
# colours run out, the point marker shape changes instead.
colours = 'rbgkcmy'
point_shapes = 'o^sv<>p*'
style_count = max(k, 8)
# Marker style for the samples of each cluster.
style = [
    point_shapes[(j // len(colours)) % len(point_shapes)] + colours[j % len(colours)]
    for j in range(style_count)
]
# Marker style (diamond) for the centre of each cluster.
mean_mark = ['D' + colours[j % len(colours)] for j in range(style_count)]

# Draw the samples of each cluster in that cluster's style.  A marker-only
# format string draws no connecting line, so one call per cluster gives the
# same picture as one call per point.
for j in range(k):
    members = store.list[j + 1]
    if members:
        plt.plot([point[0] for point in members],
                 [point[1] for point in members],
                 style[j])

# Draw the cluster centres on top of the samples.
for j in range(k):
    plt.plot(candidate_mean_point[j][0], candidate_mean_point[j][1],
             mean_mark[j], markersize=10)

plt.title('kmean algorithm')
plt.show()