K-means聚类算法

最新推荐文章于 2024-06-07 11:58:32 发布

川师_King

最新推荐文章于 2024-06-07 11:58:32 发布

阅读量1.9k

点赞数 4

分类专栏：数据挖掘

本文链接：https://blog.csdn.net/weixin_44196785/article/details/116458976

版权

数据挖掘专栏收录该内容

11 篇文章 9 订阅

订阅专栏

计算欧几里得距离

# -*- coding: utf-8 -*-
import numpy as np
def euclid_distance(x1, x2):
    """Compute the Euclidean distance between two points.

    Args:
        x1: Array-like of numbers (NumPy array, list or tuple).
        x2: Array-like of numbers with a shape compatible with ``x1``.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy floating-point scalar.
    """
    import numpy as np

    a = np.asarray(x1)
    b = np.asarray(x2)
    # Integer and boolean inputs are promoted to float before subtracting:
    # unsigned dtypes would otherwise wrap around (0 - 200 == 56 in uint8)
    # and narrow integer dtypes can overflow when squared.
    if a.dtype.kind in "biu":
        a = a.astype(np.float64)
    if b.dtype.kind in "biu":
        b = b.astype(np.float64)
    diff = a - b
    return np.sqrt(np.sum(diff * diff))

计算样本的最近邻聚类中心

# -*- coding: utf-8 -*-
def nearest_cluster_center(x, centers):
    """Find the cluster center closest to a sample.

    Args:
        x: NumPy array holding one sample.
        centers: Two-dimensional NumPy array (or any indexable sequence)
            with one cluster center per row.

    Returns:
        Integer index of the center with the smallest Euclidean distance to
        ``x``; e.g. 3 assigns ``x`` to cluster 3. Ties go to the lowest
        index. Returns -1 when ``centers`` is empty.
    """
    from distance import euclid_distance

    n_clusters = len(centers)
    if n_clusters == 0:
        # Nothing to assign to: report the "unassigned" sentinel instead of
        # failing with an IndexError.
        return -1
    # A single linear scan replaces sorting every distance just to read the
    # first element; min() keeps the first of several equal minima.
    return min(
        range(n_clusters),
        key=lambda cluster_index: euclid_distance(x, centers[cluster_index]),
    )

计算各聚类中心

# -*- coding: utf-8 -*-
def estimate_centers(X, y_estimated, n_clusters):
    """Recompute every cluster center as the mean of its assigned samples.

    Args:
        X: Two-dimensional array-like, the sample feature matrix with one
            row per sample.
        y_estimated: One-dimensional array-like holding the estimated
            cluster index of each sample.
        n_clusters: Integer, the configured number of clusters.

    Returns:
        NumPy array of shape ``(n_clusters, n_features)``. Row ``i`` is the
        mean of the samples labelled ``i``; a cluster with no samples keeps
        an all-zero row.
    """
    import numpy as np

    X = np.asarray(X)
    # Coerce the labels so that ``labels == i`` is always an element-wise
    # mask; comparing a plain list to an int yields a single ``False``.
    labels = np.asarray(y_estimated)
    centers = np.zeros((n_clusters, X.shape[1]))
    for i in range(n_clusters):
        members = X[labels == i]
        # The mean of an empty selection is NaN (with a RuntimeWarning) and
        # would poison every later distance computation, so skip it.
        if len(members) > 0:
            centers[i] = members.mean(axis=0)
    return centers

评估聚类效果

# -*- coding: utf-8 -*-
def acc(x1, x2):
    """Compute the fraction of positions at which two label arrays agree.

    Args:
        x1: Array-like of labels (NumPy array, list or tuple).
        x2: Array-like of labels with the same length as ``x1``.

    Returns:
        Float: the number of equal elements divided by ``len(x1)``.
        Returns 0.0 when ``x1`` is empty.
    """
    import numpy as np

    # Coerce to arrays so that ``==`` compares element-wise; two plain lists
    # would compare as a whole and yield a single bool.
    a = np.asarray(x1)
    b = np.asarray(x2)
    n_samples = len(a)
    if n_samples == 0:
        # Avoid dividing by zero when there is nothing to compare.
        return 0.0
    return float(np.sum(a == b)) / n_samples

组合已实现的函数完成K-means

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from distance import euclid_distance
from estimate import estimate_centers
from loss import acc
from near import nearest_cluster_center
# The random seed influences the clustering result; fix it so that test runs
# are reproducible.
np.random.seed(5)
# Load the data set.
dataset = pd.read_csv('./data/iris.csv')
# Extract the sample feature matrix. DataFrame.as_matrix() was removed in
# pandas 1.0; .values yields the same NumPy array on old and new releases.
# NOTE(review): the column names look like the header row of the bundled
# iris.csv (sample count, feature count, class names) - confirm against the
# data file.
X = dataset[['150','4','setosa','versicolor']].values
y = np.array(dataset['virginica'])
# Read the parameters from stdin as "<n_clusters>,<n_iteration>".
n_clusters, n_iteration = input().split(',')
n_clusters = int(n_clusters)  # number of cluster centers
n_iteration = int(n_iteration)  # number of iterations
# Pick n_clusters random samples as the initial cluster centers.
point_index_lst = np.arange(len(y))
np.random.shuffle(point_index_lst)
cluster_centers = X[point_index_lst[:n_clusters]]
# Run the algorithm: alternate assignment and center re-estimation.
y_estimated = np.zeros(len(y))
for _ in range(n_iteration):
    for xx_index, sample in enumerate(X):
        # Assign each sample to its nearest cluster center.
        y_estimated[xx_index] = nearest_cluster_center(sample, cluster_centers)
    # Recompute the cluster centers from the new assignment.
    cluster_centers = estimate_centers(X, y_estimated, n_clusters)
print('%.3f' % acc(y_estimated, y))

川师_King

关注

4
点赞
踩
22

收藏

觉得还不错? 一键收藏
2
评论
K-means聚类算法

计算欧几里得距离： # -*- coding: utf-8 -*- import numpy as np; def euclid_distance(x1, x2): """计算欧几里得距离 参数: x1 - numpy数组 x2 - numpy数组 返回值： distance - 浮点数，欧几里得距离 """ distance = 0 # 请在此添加实现代码 # #********** Begin ……
复制链接

扫一扫