Implementing K-means and K-medoids Clustering from Scratch in Python

Sample data download

Here is the code:

"""
@Author :Jintu Zheng
@Date: 2020-12-23
@Version: 1.0.00
@Description: K-means and K-medoids using numpy
"""
import numpy as np
import random
import math

class dataset(object):
    def __init__(self, path):
        self.raw_data = np.loadtxt(path, delimiter=',')

class Cluster(object):
    def __init__(self, k, data, epoches = 10, distance_func = "euclidean"):
        self.k = k
        self.data = data
        self.rows, self.data_dims = data.shape[0], data.shape[1]
        self.epoches = epoches
        self.result = []
        self.distance_func_name = distance_func
    
    def distance_func(self, vector1, vector2):
        """距离公式
        欧氏距离(euclidean)
        曼哈顿距离(manhattan)
        切比雪夫距离(chebyshev)
        夹角余弦(cosine)
        """
        _func = self.distance_func_name
        if _func == 'euclidean':
            return np.linalg.norm(vector1-vector2)
        elif _func == 'manhattan':
            return np.linalg.norm(vector1-vector2,ord=1)
        elif _func == 'chebyshev':
            return np.linalg.norm(vector1-vector2,ord=np.inf)
        elif _func == 'cosine':
            # Cosine distance (1 - cosine similarity), so a smaller value means closer
            return 1 - np.dot(vector1,vector2)/(np.linalg.norm(vector1)*(np.linalg.norm(vector2)))
        
    def kmeans_forward(self):
        centers = []
        # Random initial centers; assumes the features are scaled to [0, 1]
        for ki in range(self.k):
            centers.append(np.array([random.uniform(0,1) for di in range(self.data_dims)]))
        centers = np.array(centers)
        for epoch in range(self.epoches):
            self.result.clear()
            print('Running epoch {}/{} -->'.format(epoch + 1, self.epoches))
            for row in range(self.rows):
                dis_lst = [self.distance_func(self.data[row],centers[i]) for i in range(self.k)]
                min_dis = min(dis_lst)
                self.result.append(dis_lst.index(min_dis))
            # update
            for ik in range(self.k):
                sum_elems = self.result.count(ik)
                if sum_elems > 0:
                    found_elems = []
                    for row in range(self.rows):
                        if self.result[row] == ik:
                            found_elems.append(self.data[row]) # found the same class
                    found_elems = np.array(found_elems)
                    centers[ik] = np.array([np.mean(found_elems[:,i]) for i in range(self.data_dims)]) # update centers         
        return self.result

    def _get_min_dis_center_in_class(self, data):
        # Return the sample with the smallest total distance to all other samples in the cluster (the medoid)
        loss_list = []
        for idx in range(data.shape[0]):
            loss = 0
            for idx_j in range(data.shape[0]):
                loss+=self.distance_func(data[idx], data[idx_j])
            loss_list.append(loss)
        return data[loss_list.index(min(loss_list))]

    def kmedoids_forward(self):
        centers = []
        for ki in range(self.k):
            centers.append(self.data[random.randint(0, self.rows - 1)]) # K-medoids centers must be actual samples from the data
        centers = np.array(centers)
        for epoch in range(self.epoches):
            self.result.clear()
            print('Running epoch {}/{} -->'.format(epoch + 1, self.epoches))
            for row in range(self.rows):
                dis_lst = [self.distance_func(self.data[row],centers[i]) for i in range(self.k)]
                min_dis = min(dis_lst)
                self.result.append(dis_lst.index(min_dis))

            # update
            for ik in range(self.k):
                sum_elems = self.result.count(ik)
                if sum_elems > 0:
                    found_elems = []
                    for row in range(self.rows):
                        if self.result[row] == ik:
                            found_elems.append(self.data[row]) # found the same class
                    found_elems = np.array(found_elems)
                    centers[ik] = self._get_min_dis_center_in_class(found_elems) # update centers
        return self.result


if __name__ == "__main__":
    dt = dataset('data.csv')
    funcs = ['euclidean','manhattan','chebyshev','cosine'] # available distance metrics
    model = Cluster(k=10, epoches=1000, data=dt.raw_data, distance_func='manhattan')
    #result = model.kmeans_forward()
    result = model.kmedoids_forward()
    print(result)


K-means


1. Randomly choose K centroid values.

2. Compute the distance from every point to each centroid.

3. Assign each point to the class of its nearest centroid, forming K clusters.

4. Within each cluster, recompute the centroid as the mean of the cluster's points.

5. Repeat steps 2-4 until the iteration limit is reached or the error falls below a given threshold (a minimal vectorized sketch of one such iteration follows this list).
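
For reference, here is a minimal vectorized sketch of one iteration (steps 2-4) plus the step-5 loop, assuming data is an (n, d) numpy array and centers is a (k, d) array; kmeans_step and the toy data in the demo are illustrative names introduced here, not part of the code above.

import numpy as np

def kmeans_step(data, centers):
    # Steps 2-3: assign each sample to its nearest center (Euclidean distance)
    dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)  # shape (n, k)
    labels = dists.argmin(axis=1)
    # Step 4: recompute each center as the mean of its assigned samples
    new_centers = centers.copy()
    for ki in range(centers.shape[0]):
        members = data[labels == ki]
        if len(members) > 0:
            new_centers[ki] = members.mean(axis=0)
    return labels, new_centers

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    data = rng.random((100, 2))     # toy data in [0, 1]
    centers = rng.random((3, 2))    # k = 3 random initial centers
    # Step 5: iterate until the centers stop moving or an epoch limit is hit
    for _ in range(100):
        labels, new_centers = kmeans_step(data, centers)
        if np.allclose(new_centers, centers):
            break
        centers = new_centers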


K-medoids


1. Randomly choose K centroids (each centroid must be an actual sample point, not an arbitrary value).

2. Compute the distance from every point to each centroid.

3. Assign each point to the class of its nearest centroid, forming K clusters.

4. Within each cluster, recompute the centroid:

  • 4.1 For each candidate sample point, compute the sum of Manhattan distances from all points in the cluster to it (the absolute error).

  • 4.2 Choose the sample point that minimizes the cluster's absolute error as the new centroid (a vectorized sketch of this update follows the list).

5. Repeat steps 2-4 until the iteration limit is reached or the error falls below a given threshold.
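
The medoid update in steps 4.1-4.2 (what _get_min_dis_center_in_class does above) can also be written in a few vectorized lines; update_medoid and cluster_points are illustrative names, and the Manhattan metric is assumed here.

import numpy as np

def update_medoid(cluster_points):
    # Step 4.1: total Manhattan distance from each sample to every other sample in the cluster
    costs = np.abs(cluster_points[:, None, :] - cluster_points[None, :, :]).sum(axis=(1, 2))
    # Step 4.2: the sample with the smallest total distance becomes the new medoid
    return cluster_points[costs.argmin()]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    cluster_points = rng.random((20, 2))  # toy cluster of 20 two-dimensional samples
    print(update_medoid(cluster_points))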
