python实现Kmeans
K-Means算法是典型的基于距离的聚类算法,采用距离作为相似性的评价指标:两个样本点的距离越近,其相似度就越大。在K-Means算法中,簇(cluster)由距离相近的样本点组成,因此算法的目标是得到内部样本点彼此距离接近的簇。
步骤:
- 指定k个初始质心(centroids),以作为聚类的初始cluster
- 对所有点计算到各个质心的距离,将该点类别标为最近质心的cluster
- 重新计算k个cluster的新质心
- 重复计算,直到算法收敛(质心不再发生变化)
import pandas as pd
import numpy as np
import data_utils as du
from scipy.spatial.distance import pdist
# 计算距离
def dist(x, y, p=1):
    """Return the distance between vectors ``x`` and ``y`` as a float.

    Args:
        x: First vector (array-like of numbers).
        y: Second vector, same length as ``x``.
        p: Metric selector:
            1 - L1 (Manhattan) norm of ``x - y``
            2 - L2 (Euclidean) norm of ``x - y``
            3 - standardized Euclidean (``pdist(..., 'seuclidean')``)
            4 - cosine distance (``pdist(..., 'cosine')``)
            5 - Mahalanobis (``pdist(..., 'mahalanobis')``)

    Returns:
        The distance as a Python float, or ``float("inf")`` when ``p`` is
        not one of the selectors above.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if p == 1:
        return float(np.linalg.norm(x - y, 1))
    if p == 2:
        return float(np.linalg.norm(x - y, 2))

    metric = {3: 'seuclidean', 4: 'cosine', 5: 'mahalanobis'}.get(p)
    if metric is None:
        # Unknown selector: keep the historical "infinitely far" result.
        return float("inf")

    # pdist works on a matrix of observations, so stack the two vectors as
    # rows; the result then holds exactly one pairwise distance.
    # NOTE(review): 'seuclidean' and 'mahalanobis' estimate variance /
    # covariance from these two rows only, which looks degenerate
    # (mahalanobis presumably raises for so few observations) - confirm
    # against the SciPy documentation before relying on p=3 or p=5.
    pair = np.vstack([x, y])
    return float(pdist(pair, metric)[0])
# Build the initial cluster centres: pick k data points at random as centroids
def randCent(data, k):
    """Pick ``k`` distinct rows of ``data`` at random as initial centroids.

    Args:
        data: 2-D array-like with one sample per row.
        k: Number of centroids to draw.

    Returns:
        A new array holding the ``k`` selected rows (a copy, so modifying
        it does not modify ``data``).

    Raises:
        ValueError: If ``k`` is larger than the number of rows (raised by
            ``random.sample``).
    """
    # Imported here because the file's import block does not import
    # ``random``; without this the function fails with NameError.
    import random

    data = np.asarray(data)
    # Sample row indices rather than the rows themselves: this avoids
    # copying every row into a Python list first.
    chosen = random.sample(range(data.shape[0]), k)
    return data[chosen]
# pdist selects the metric: 1 = L1 norm, 2 = Euclidean (L2 norm), 3 = standardized Euclidean, 4 = cosine distance, 5 = Mahalanobis
def kMeans(data, k, pdist, max_iter=1000):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Args:
        data: 2-D numpy array with one sample per row.
        k: Number of clusters.
        pdist: Metric selector forwarded to ``dist`` as its ``p`` argument.
            (Inside this function the name hides any module-level
            ``pdist``; it is only ever used as a plain value here.)
        max_iter: Upper bound on the number of assign/update passes.

    Returns:
        A tuple ``(cluster, iterations)``: ``cluster[i]`` is the index of
        the centroid assigned to row ``i`` (stored as a float; ``-1`` if no
        centroid produced a distance below infinity), and ``iterations`` is
        the number of passes actually executed.
    """
    n_samples = data.shape[0]
    cluster = np.zeros(n_samples)
    # Keep centroids as floats so that assigning a cluster mean is not
    # truncated when ``data`` has an integer dtype.
    cents = randCent(data, k).astype(float)

    iterations = 0
    while iterations < max_iter:
        iterations += 1

        # Assignment step: attach every sample to its nearest centroid.
        for i in range(n_samples):
            best_dist = np.inf
            best_index = -1
            for j in range(k):
                d = dist(cents[j, :], data[i, :], pdist)
                if d < best_dist:
                    best_dist = d
                    best_index = j
            cluster[i] = best_index

        # Update step: move each centroid to the mean of its members.
        # A centroid with no members keeps its previous position.
        new_cents = cents.copy()
        for cent_i in range(k):
            members = data[cluster == cent_i]
            if len(members) != 0:
                new_cents[cent_i, :] = np.mean(members, 0)

        # Converged once no centroid moved during this pass.
        if (new_cents == cents).all():
            break
        cents = new_cents

    return cluster, iterations
if __name__ == '__main__':
    # Script entry point: load the data set, run k-means, report the
    # number of iterations that were needed.
    print("----k_means----")
    k = 4
    # Deliberately not named ``pdist``: a module-level ``pdist = 1`` would
    # rebind the ``pdist`` function imported from scipy and break ``dist``
    # for the metrics that call it.
    metric = 1
    data, labels = du.read_data()
    label_set = sorted(set(labels))
    # The clustering function defined in this file is ``kMeans``.
    cluster, iterations = kMeans(data, k, metric)
    print("end of k_means \niterations=", iterations)
# print("----------------")
data_utils.py
import pandas as pd
import numpy as np
def read_data(path='Frogs_MFCCs.csv'):
    """Load the feature matrix and label column from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``'Frogs_MFCCs.csv'``
            in the current working directory.

    Returns:
        A tuple ``(f_data, labels)``: ``f_data`` holds columns 0-21 of the
        file converted to float, ``labels`` holds column 22 unchanged.
    """
    pf = pd.read_csv(path, encoding='utf-8')
    data = pf.to_numpy()
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases; use the builtin directly.
    f_data = data[:, 0:22].astype(float)
    return f_data, data[:, 22]
if __name__ == '__main__':
    # Smoke run: load the data set and collect the distinct label values.
    f_data, labels = read_data()
    unique_labels = set(labels)
    label_set = list(unique_labels)
代码及数据下载:https://download.csdn.net/download/SAM2un/12036441