# -*- coding: utf-8 -*-
#k均值算法 主要部分 简单实现
from random import random
'''
因为单次运行只能收敛到局部最优解,所以需要多次尝试,取所有对象的误差平方和最小的结果。
二维数据
'''
# Number of clusters to partition the objects into.
K = 3
# Load the objects to cluster from a text file holding one "x,y" integer
# pair per line.  Objects are keyed by a 1-based id in file order.
src_file_name = 'F:\\study\\cluster_algorithm\\k_average\\data.csv'
data_dict = {}
num = 1
# The with-statement guarantees the file is closed, even when a line
# fails to parse.
with open(src_file_name) as src_file:
    for line in src_file:
        line = line.strip()
        if not line:
            # Skip blank lines instead of crashing in int('').
            continue
        str_list = line.split(',')
        data_dict[num] = (int(str_list[0]), int(str_list[1]))
        num += 1
# Euclidean distance between two 2-D points.
def get_dist(tp1, tp2):
    """Return the straight-line distance between ``tp1`` and ``tp2``.

    Only the components at index 0 and 1 of each point are used.
    """
    delta_x = tp1[0] - tp2[0]
    delta_y = tp1[1] - tp2[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return squared_length ** 0.5
# Maximum number of independent K-means runs (random restarts).
max_try = 5
# Clustering found by each run: run number -> {cluster index -> [object ids]}.
cluster_result_dict = {}
# Final centers of each run: run number -> {cluster index -> (x, y)}.
cluster_center_dict = {}
# Sum of squared object-to-center distances of each run: run number -> value.
error_dict = {}
try_cnt = 0
while try_cnt < max_try:
    try_cnt += 1
    # Seed K initial centers at random integer coordinates in 0..99.
    # NOTE(review): this presumably matches the value range of the input
    # data -- confirm against the data file.
    center_dict = {}
    for i in range(0, K):
        x = int(random() * 1000000000) % 100
        y = int(random() * 1000000000) % 100
        center_dict[i] = (x, y)
    # Alternate assignment and update steps until no center moves.
    while True:
        # Give every cluster an (initially empty) member list up front, so
        # a center that attracts no objects is still present in the mapping
        # and later lookups by cluster index cannot raise KeyError.
        cluster_dict = {}
        for i in range(0, K):
            cluster_dict[i] = []
        # Assignment step: attach each object to its nearest center.
        for obj_id in data_dict:
            # Start from infinity so that arbitrarily distant objects are
            # still assigned; ties go to the lowest cluster index.
            min_dist = float('inf')
            which_cluster = -1
            for i in range(0, K):
                tmp = get_dist(data_dict[obj_id], center_dict[i])
                if tmp < min_dist:
                    min_dist = tmp
                    which_cluster = i
            if which_cluster == -1:
                # No center was comparable (e.g. K == 0).  The parenthesised
                # single-argument print behaves the same on Python 2 and 3.
                print('error')
            else:
                cluster_dict[which_cluster].append(obj_id)
        # Update step: move each center to the mean of its members.
        new_center = {}
        for i in range(0, K):
            members = cluster_dict[i]
            if not members:
                # An empty cluster has no mean; keep its previous center
                # rather than dividing by zero.
                new_center[i] = center_dict[i]
                continue
            sum_x = 0
            sum_y = 0
            for obj_id in members:
                sum_x += data_dict[obj_id][0]
                sum_y += data_dict[obj_id][1]
            cnt = len(members)
            # float() forces true division on Python 2 as well.
            new_center[i] = (float(sum_x) / cnt, float(sum_y) / cnt)
        # Converged once every center is unchanged by the update step.
        moved = any(new_center[i] != center_dict[i] for i in range(0, K))
        if not moved:
            # Record this run's final clusters and centers.
            cluster_result_dict[try_cnt] = cluster_dict
            cluster_center_dict[try_cnt] = new_center
            break
        # Otherwise continue iterating from the updated centers.
        center_dict = new_center
    # Within-cluster variation of this run: the sum over all objects of the
    # squared distance to the center of the cluster they belong to.
    sum_error = 0
    for i in range(0, K):
        # Progress trace: echo the index of the cluster being scored.
        print(i)
        for obj_id in cluster_result_dict[try_cnt][i]:
            dist = get_dist(data_dict[obj_id], cluster_center_dict[try_cnt][i])
            sum_error += dist ** 2
    error_dict[try_cnt] = sum_error
# Write the clustering of every run to '<run number>.txt' inside base_dir.
# Each file lists, per cluster, a 'cluster <index>' header line followed by
# one 'x,y' line for every object in that cluster.
base_dir = 'F:\\study\\cluster_algorithm\\k_average\\result\\'
for i in cluster_result_dict:
    file_name = base_dir + str(i) + '.txt'
    # Collect the output lines and join them once instead of growing a
    # string with repeated '+='.
    lines = []
    for j in cluster_result_dict[i]:
        lines.append('cluster ' + str(j) + '\n')
        for obj_id in cluster_result_dict[i][j]:
            point = data_dict[obj_id]
            lines.append(str(point[0]) + ',' + str(point[1]) + '\n')
    str_buffer = ''.join(lines)
    # The with-statement closes the file even if the write fails.
    with open(file_name, 'w') as file_tmp:
        file_tmp.write(str_buffer)
# K_average聚类算法的Python简单实现
# 最新推荐文章于 2022-11-03 15:32:00 发布