先上代码:
"""
@Author :Jintu Zheng
@Date: 2020-12-23
@Version: 1.0.00
@Description: K-means and K-medoids using numpy
"""
import numpy as np
import random
import math
class dataset:
    """Load a comma-delimited numeric text file into memory.

    Args:
        path: Source handed straight to ``np.loadtxt`` together with
            ``delimiter=','``.

    The parsed array is exposed as ``raw_data``.
    """

    def __init__(self, path):
        # Parsing is delegated entirely to NumPy; no validation is done here.
        parsed = np.loadtxt(path, delimiter=',')
        self.raw_data = parsed
class Cluster(object):
    """K-means and K-medoids clustering over the rows of a 2-D array.

    Args:
        k: Number of clusters; must be at least 1.
        data: 2-D array-like with one sample per row and one feature per
            column.
        epoches: Maximum number of assign/update iterations per run.
        distance_func: Metric name, one of 'euclidean', 'manhattan',
            'chebyshev' or 'cosine'.

    Raises:
        ValueError: If ``data`` is not 2-D or ``k`` is smaller than 1.

    After a run, ``result`` holds the cluster index (0 .. k-1) assigned to
    each row of ``data``.
    """

    # Vector-norm order handed to np.linalg.norm for each norm-based metric.
    _NORM_ORDERS = {'euclidean': 2, 'manhattan': 1, 'chebyshev': np.inf}

    def __init__(self, k, data, epoches = 10, distance_func = "euclidean"):
        data = np.asarray(data)
        if data.ndim != 2:
            raise ValueError(
                'data must be 2-D (samples x features), got {} dimension(s)'.format(data.ndim))
        if k < 1:
            raise ValueError('k must be at least 1, got {}'.format(k))
        self.k = k
        self.data = data
        self.rows, self.data_dims = data.shape
        self.epoches = epoches
        self.result = []
        self.distance_func_name = distance_func

    def distance_func(self, vector1, vector2):
        """Return the distance between two vectors under the chosen metric.

        Supported metrics (selected by ``self.distance_func_name``):
            euclidean: L2 norm of the difference.
            manhattan: L1 norm of the difference.
            chebyshev: max-norm of the difference.
            cosine: ``1 - cos(angle)``, so smaller still means closer.
                A pair involving a zero-length vector has no defined angle
                and is given distance 1.

        The metric is evaluated along the last axis, so two 1-D vectors give
        a scalar, while a 2-D array against a 1-D vector gives one distance
        per row.

        Raises:
            ValueError: If the metric name is not one of the supported ones.
        """
        # Float conversion avoids wrap-around when subtracting unsigned ints.
        vector1 = np.asarray(vector1, dtype=float)
        vector2 = np.asarray(vector2, dtype=float)
        _func = self.distance_func_name
        if _func == 'cosine':
            dot = np.sum(vector1 * vector2, axis=-1)
            scale = np.linalg.norm(vector1, axis=-1) * np.linalg.norm(vector2, axis=-1)
            nonzero = scale > 0
            # Guard the division so zero vectors never produce NaN/inf.
            similarity = np.where(nonzero, dot / np.where(nonzero, scale, 1.0), 0.0)
            return 1.0 - similarity
        if _func not in self._NORM_ORDERS:
            raise ValueError('unknown distance function: {!r}'.format(_func))
        return np.linalg.norm(vector1 - vector2, ord=self._NORM_ORDERS[_func], axis=-1)

    def _initial_indices(self):
        """Pick k row indices to seed the centers (distinct when possible)."""
        if self.k <= self.rows:
            return random.sample(range(self.rows), self.k)
        # More clusters than samples: duplicates are unavoidable.
        return [random.randrange(self.rows) for _ in range(self.k)]

    def _assign_labels(self, centers):
        """Return, for every row, the index of its nearest center."""
        table = np.stack([self.distance_func(self.data, center) for center in centers])
        # argmin keeps the first minimum, i.e. ties go to the lowest index.
        return np.argmin(table, axis=0).tolist()

    def _iterate(self, centers, update_center):
        """Alternate assignment and center update until stable or out of epochs."""
        previous = None
        for epoch in range(self.epoches):
            print('Running for epoch {}/{} -->'.format(epoch, self.epoches))
            labels = self._assign_labels(centers)
            # Bind a fresh list so results returned by earlier runs stay intact.
            self.result = labels
            if labels == previous:
                # Same assignment as last epoch: centers can no longer move.
                break
            previous = labels
            label_array = np.asarray(labels)
            for ik in range(self.k):
                members = self.data[label_array == ik]
                if members.shape[0] > 0:  # an empty cluster keeps its old center
                    centers[ik] = update_center(members)
        return self.result

    def kmeans_forward(self):
        """Run K-means and return the cluster index of every row.

        Centers start at randomly chosen data rows, so they share the scale
        of the data, and are then moved to the mean of their members.
        """
        if self.rows == 0:
            self.result = []
            return self.result
        # Float copy: means are fractional even when the data is integral.
        centers = self.data[self._initial_indices()].astype(float)
        return self._iterate(centers, lambda members: members.mean(axis=0))

    def _get_min_dis_center_in_class(self, data):
        """Return the row of ``data`` with the smallest total distance to all rows."""
        data = np.asarray(data)
        loss_list = [np.sum(self.distance_func(data, candidate)) for candidate in data]
        return data[int(np.argmin(loss_list))]

    def kmedoids_forward(self):
        """Run K-medoids and return the cluster index of every row.

        Every center is always one of the data rows: the member with the
        smallest summed distance to the other members of its cluster.
        """
        if self.rows == 0:
            self.result = []
            return self.result
        # Integer-list indexing yields a copy, so self.data is never modified.
        centers = self.data[self._initial_indices()]
        return self._iterate(centers, self._get_min_dis_center_in_class)
if __name__ == "__main__":
    # Demo driver: cluster the rows of data.csv into 10 groups with
    # K-medoids under the Manhattan metric and print the label of each row.
    samples = dataset('data.csv')
    funcs = ['euclidean','manhattan','chebyshev','cosine']
    model = Cluster(
        k=10,
        data=samples.raw_data,
        epoches=1000,
        distance_func=funcs[1],  # 'manhattan'
    )
    # To try K-means instead, call model.kmeans_forward() here.
    result = model.kmedoids_forward()
    print(result)
K-means
1、随机选取K个质心的值
2、计算各个点到质心的距离
3、将点的类划分为离他最近的质心,形成K个cluster
4、根据分类好的cluster,在每个cluster内重新计算质心(平均每个点的值)
5、重复迭代2-4步直到满足迭代次数或误差小于指定的值
K-medoids
1、随机选取K个质心的值 (质心必须是某些样本点的值,而不是任意值)
2、计算各个点到质心的距离
3、将点的类划分为离他最近的质心,形成K个cluster
4、根据分类好的cluster,在每个cluster内重新计算质心:
- 4.1 对cluster内的每个样本点,计算cluster内其余所有样本点到它的曼哈顿距离之和(绝对误差)
- 4.2 选出使cluster绝对误差最小的样本点作为质心
5、重复迭代2-4步直到满足迭代次数或误差小于指定的值