K_average聚类算法的Python简单实现

# -*- coding: utf-8 -*-
#k均值算法 主要部分 简单实现
from random import random


'''
因为单次只能收敛到局部最优解 所以需要多次尝试 取所有对象的误差的平方和最小的结果。
二维数据


'''


#设定K值
K = 3


#从文本中获取对象列表
src_file_name = 'F:\\study\\cluster_algorithm\\k_average\\data.csv'


src_file = open(src_file_name)


data_dict = {}
num = 1
for line in src_file :
    line = line.strip()
    str_list = line.split(',')
    data_dict[num] = (int(str_list[0]) , int(str_list[1]))
    num += 1
    
    
#计算欧氏距离的函数
def get_dist(tp1 , tp2) :
    return ((tp1[0] - tp2[0])**2 + (tp1[1] - tp2[1])**2)**0.5    
    
    


#设置最大尝试次数
max_try = 5


#保存每次的划分结果
cluster_result_dict = {}
#保存每次划分的中心点
cluster_center_dict = {}


#记录每次的误差之和
error_dict = {}


try_cnt = 0
while try_cnt < max_try :
    try_cnt += 1
    #print '1'
    
    center_dict = {}
    #选取最初始中心点
    for i in range(0 , K) :
        x = int(random()*1000000000)%100
        y = int(random()*1000000000)%100
        center_dict[i] = (x , y)
    
    #直到中心点不再改变
    while True :
        #print '2'
        new_center = {}
        #将所有对象指定到欧氏距离最近的中心点上
        cluster_dict = {}
        for obj_id in data_dict :
            #与所有中心点求距离,得到最近的中心点,将其加入到此中心点所代表的簇中
            min_dist = 10000000
            which_cluster = -1 
            for i in range(0 , K) :
                tmp = get_dist(data_dict[obj_id] , center_dict[i])
                if tmp < min_dist :
                    min_dist = tmp
                    which_cluster = i
            if which_cluster == -1 :
                print 'error'
            else :
                if which_cluster not in cluster_dict :
                    cluster_dict[which_cluster] = []
                cluster_dict[which_cluster].append(obj_id)
        #计算新的中心点
        for i in range(0 , K) :
            sum_x = 0
            sum_y = 0
            cnt = 0
            for obj_id in cluster_dict[i] :
                sum_x += data_dict[obj_id][0]
                sum_y += data_dict[obj_id][1]
                cnt += 1
            avg_x = (sum_x + 0.0)/cnt
            avg_y = (sum_y + 0.0)/cnt
            
            new_center[i] = (avg_x , avg_y)
        
        #比较新旧中心点
        break_flag = 1
        for i in range(0 , K) :
            if new_center[i] != center_dict[i] :
                break_flag = 0    
            
        #如果中心点没有再改变
        if break_flag == 1 :
            #保存此时的簇
            cluster_result_dict[try_cnt] = cluster_dict
            cluster_center_dict[try_cnt] = new_center
            break
        #如果发生了改变,更新中心中
        else :
            for i in range(0 , K) :
                center_dict[i] = new_center[i]
            
    #计算簇内变差
    sum_error = 0
    for i in range(0 , K) :
        print i
        for obj_id in cluster_result_dict[try_cnt][i] :
            dist = get_dist(data_dict[obj_id] , cluster_center_dict[try_cnt][i])
            sum_error += dist**2
    error_dict[try_cnt] = sum_error
    
    
base_dir = 'F:\\study\\cluster_algorithm\\k_average\\result\\'
for i in cluster_result_dict :
    file_name = base_dir + str(i) + '.txt'
    file_tmp = open(file_name , 'w')
    
    str_buffer = ''
    for j in cluster_result_dict[i] :
        str_buffer += 'cluster ' + str(j) + '\n'
        for obj_id in cluster_result_dict[i][j] :
            str_buffer += str(data_dict[obj_id][0]) + ',' + str(data_dict[obj_id][1]) + '\n'
    
    file_tmp.write(str_buffer)
    file_tmp.close()

以下是一个基于Python的层次聚类算法实现: ```python import numpy as np def euclidean_distance(x, y): """ 计算欧几里得距离 """ return np.sqrt(np.sum((x - y) ** 2)) def hierarchical_clustering(data, method='single'): """ 层次聚类算法实现 :param data: 二维数据集,每行代表一个样本 :param method: 距离计算方法,包括'single', 'complete', 'average'和'centroid' :return: 聚类结果,每个元素代表一个聚类,元素为聚类中心的下标 """ n = data.shape[0] # 初始化距离矩阵 distances = np.zeros((n, n)) for i in range(n): for j in range(i+1, n): distances[i, j] = euclidean_distance(data[i], data[j]) distances[j, i] = distances[i, j] # 初始化聚类标号 clusters = np.arange(n) # 合并聚类 for k in range(n-1): # 找到距离最小的两个聚类 i, j = np.unravel_index(np.argmin(distances), distances.shape) # 合并聚类 clusters[clusters == clusters[j]] = clusters[i] # 更新距离矩阵 if method == 'single': distances[i, :] = np.minimum(distances[i, :], distances[j, :]) distances[:, i] = distances[i, :] elif method == 'complete': distances[i, :] = np.maximum(distances[i, :], distances[j, :]) distances[:, i] = distances[i, :] elif method == 'average': distances[i, :] = (distances[i, :] + distances[j, :]) / 2 distances[:, i] = distances[i, :] elif method == 'centroid': centroid = (data[clusters == i].mean(axis=0) + data[clusters == j].mean(axis=0)) / 2 distances[i, :] = euclidean_distance(centroid, data) distances[:, i] = distances[i, :] else: raise ValueError("Invalid method") distances[i, i] = np.inf distances[j, :] = np.inf distances[:, j] = np.inf # 返回聚类结果 return np.unique(clusters) ``` 可以通过调用函数 `hierarchical_clustering(data, method)` 来实现层次聚类。其中,`data` 是一个二维的数据集,每行代表一个样本;`method` 是距离计算方法,包括'single', 'complete', 'average'和'centroid'。函数返回聚类结果,每个元素代表一个聚类,元素为聚类中心的下标。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值