Beginner notes: comments have been added to most of the key statements below.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.io as sio
mat = sio.loadmat('ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
# Append a cluster-assignment column C to a copy of the data
def combine_data_C(data, C):
    data_with_c = data.copy()
    data_with_c['C'] = C
    return data_with_c
# k-means fn --------------------------------
# Randomly pick k samples from data as the initial centroids
def random_init(data, k):
    """Choose k samples from the data set as initial centroids.
    Args:
        data: DataFrame
        k: int
    Returns:
        k samples: ndarray (k, n)
    """
    return data.sample(k).values
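# Reproducibility note (an assumption, in case repeatable runs are wanted):
# pandas' sample accepts a seed, e.g. data.sample(k, random_state=0).values
# would make the initialization deterministic.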
# x: a single sample; centroids: the k current centroids (k*n).
# Find the cluster x belongs to; returns the index of that centroid.
def _find_your_cluster(x, centroids):
    """Find the right cluster for x with respect to the shortest distance.
    Args:
        x: ndarray (n, ) -> n features
        centroids: ndarray (k, n)
    Returns:
        k: int
    """
    # apply_along_axis applies func1d to each row (axis=1), giving a (k, ) result
    distances = np.apply_along_axis(func1d=np.linalg.norm,  # the L2 norm of each row
                                    axis=1,
                                    arr=centroids - x)  # ndarray broadcasting subtracts x from every centroid
    return np.argmin(distances)
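# Illustrative sanity check (toy numbers, not from ex7data2): with centroids
# [[0, 0], [3, 4]] and x = [3, 4], the row-wise norms are [5.0, 0.0], so the
# sample lands in cluster 1.
assert _find_your_cluster(np.array([3, 4]), np.array([[0, 0], [3, 4]])) == 1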
# data: the sample set; centroids: the centroid set.
# Find the matching cluster for every sample.
def assign_cluster(data, centroids):
    """Assign a cluster to each sample in data.
    Returns C: ndarray (m, )
    """
    return np.apply_along_axis(lambda x: _find_your_cluster(x, centroids),
                               axis=1,
                               arr=data.values)
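# Performance note: apply_along_axis loops over the m rows in Python, so the
# assignment step costs O(m*k) Python-level calls. A fully vectorized
# alternative (same result, pure broadcasting) could be:
# np.argmin(np.linalg.norm(data.values[:, None, :] - centroids, axis=2), axis=1)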
def new_centroids(data, C):
    data_with_c = combine_data_C(data, C)
    # group by C, take each group's mean, then drop the C column
    return data_with_c.groupby('C', as_index=False).mean().sort_values(by='C').drop('C', axis=1).values
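# Toy check of the groupby-mean step (made-up numbers): two samples [0, 0] and
# [2, 2] both assigned to cluster 0 should average to the centroid [1, 1].
assert np.allclose(
    new_centroids(pd.DataFrame([[0, 0], [2, 2]], columns=['X1', 'X2']),
                  np.array([0, 0])),
    [[1.0, 1.0]])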
def cost(data, centroids, C):
    m = data.shape[0]
    # centroids: (k, n), C: (m, ); fancy indexing expands this to (m, n),
    # where m is the number of samples -- each row is that sample's own centroid
    expand_C_with_centroids = centroids[C]
    # norm of each sample's distance to its assigned centroid, then average
    distances = np.apply_along_axis(func1d=np.linalg.norm,
                                    axis=1,
                                    arr=data.values - expand_C_with_centroids)
    return distances.sum() / m
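# The value above is J = (1/m) * sum_i ||x_i - centroids[C_i]||, the mean L2
# distance of each sample to its assigned centroid (the classic k-means
# distortion uses the squared distance; here the value only serves as a
# progress measure for early stopping).
# Tiny demo of the centroids[C] fancy-indexing trick (toy numbers):
# np.array([[0, 0], [1, 1]])[np.array([0, 1, 1])]
#     -> [[0, 0], [1, 1], [1, 1]], one centroid row per sample.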
# One run of k-means; if the random initial centroids are bad,
# the result can be poor
def _k_means_iter(data, k, epoch=100, tol=0.0001):
    """one-shot k-means
    with early break
    """
    # randomly initialize k centroids
    centroids = random_init(data, k)
    cost_progress = []
    # iterate at most `epoch` times
    for i in range(epoch):
        # log the iteration count
        print('running epoch {}'.format(i))
        # assignment step: attach each sample to its nearest centroid
        C = assign_cluster(data, centroids)
        # update step: recompute each centroid as its cluster mean, centroids: (k, n)
        centroids = new_centroids(data, C)
        # cost after this iteration, C: (m, )
        cost_progress.append(cost(data, centroids, C))
        if len(cost_progress) > 1:  # early break
            # stop once the relative improvement over the last iteration
            # drops below tol (the result may still be a local optimum)
            if (np.abs(cost_progress[-1] - cost_progress[-2])) / cost_progress[-1] < tol:
                break
    return C, centroids, cost_progress[-1]
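# Note on the stopping rule: the relative-improvement check is one common
# heuristic; scikit-learn's KMeans (used at the end of this script) instead
# declares convergence when the cluster centers stop moving between
# consecutive iterations (that is what its tol parameter measures).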
def k_means(data, k, epoch=100, n_init=10):
    """do multiple random inits and pick the best one to return
    Args:
        data (pd.DataFrame)
    Returns:
        (C, centroids, least_cost)
    """
    # run n_init random initializations, collecting (C, centroids, cost) tuples
    # (a plain list is used here: np.array would choke on the ragged tuples)
    tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]
    # find the index of the run with the smallest cost
    least_cost_idx = np.argmin([t[-1] for t in tries])
    return tries[least_cost_idx]
# Randomly pick three points from the sample set
init_centroids = random_init(data2, 3)
print(init_centroids)
x = np.array([1, 1])
print(x.shape)
# fig, ax = plt.subplots(figsize=(6, 4))
# ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])
# # enumerate the centroids and annotate each point with its index and coordinates
# for i, node in enumerate(init_centroids):
#     ax.annotate('{}: ({},{})'.format(i, node[0], node[1]), node)
# # also mark the point (1, 1) with an x
# ax.scatter(x[0], x[1], marker='x', s=200)
# plt.show()
# Get the cluster index for every sample in the data set
C = assign_cluster(data2, init_centroids)
# Merge the sample set with its corresponding cluster indices
# data_with_c = combine_data_C(data2, C)
# data_with_c.head()
# sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
# plt.show()
# ^ a quick look at the grouping (the initial centroids are random, so every run differs)
final_C, final_centroid, _ = _k_means_iter(data2, 3)
data_with_c = combine_data_C(data2, final_C)
# sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
# plt.show()
best_C, best_centroids, least_cost = k_means(data2, 3)
data_with_c = combine_data_C(data2, best_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
# Below: the same clustering with scikit-learn's packaged KMeans
from sklearn.cluster import KMeans
# set the number of clusters
sk_kmeans = KMeans(n_clusters=3)
# fit the estimator to the sample set
sk_kmeans.fit(data2)
# get each sample's cluster assignment
sk_C = sk_kmeans.predict(data2)
data_with_c = combine_data_C(data2, sk_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
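# A couple of handy attributes on the fitted estimator (standard scikit-learn
# API): cluster_centers_ holds the learned centroids, and inertia_ is the sum
# of squared distances to the closest centroid -- the squared-distance cousin
# of the hand-rolled cost() above.
print(sk_kmeans.cluster_centers_)
print(sk_kmeans.inertia_)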