K-Means
# --- Setup: load the 2-D toy data set (ex7data2) and visualise it ---
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.io as sio
import numpy as np

mat = sio.loadmat('./data/ex7data2.mat')
data2 = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
# FIX: seaborn >= 0.12 removed positional x/y arguments for lmplot;
# pass them as keywords (also consistent with the later lmplot calls).
sns.lmplot(x='X1', y='X2', data=data2, fit_reg=False)
plt.show()
# Helper functions for the k-means implementation
def combine_data_C(data, C):
    """Return a copy of *data* with the cluster labels C appended as column 'C'.

    The input DataFrame is left untouched.
    """
    labeled = data.copy()
    labeled['C'] = C
    return labeled
def random_init(data, k):
    """Draw k distinct rows of *data* to serve as initial centroids.

    Args:
        data: DataFrame
        k: int
    Returns:
        ndarray of shape (k, n_features)
    """
    # DataFrame.sample draws rows without replacement by default
    sampled = data.sample(n=k)
    return sampled.values
def _find_your_cluster(x, centroids): # 根据距离找到最佳的簇
"""find the right cluster for x with respect to shortest distance
Args:
x: ndarray (n, ) -> n features
centroids: ndarray (k, n)
Returns:
k: int
""" # 这个apply_along_axis 函数返回的是一个根据func()函数以及维度axis运算后得到的的数组
distances = np.apply_along_axis(func1d=np.linalg.norm, # 函数就是计算范数,默认是二范数计算距离的
axis=1, # 对矩阵就是对每一行进行计算
arr=centroids - x) # 我们要操作的数组,使用了广播机制变成了 (k,n)
return np.argmin(distances) # 就是x 到每个重心的距离,,看看大佬的代码能学到很多啊
def assign_cluster(data, centroids):
    """Assign every row of *data* to its nearest centroid.

    Args:
        data: DataFrame (m, n)
        centroids: ndarray (k, n)
    Returns:
        ndarray (m,) of cluster indices
    """
    samples = data.values
    # (m, 1, n) - (1, k, n) broadcasts to every point/centroid pair;
    # the norm over the last axis gives an (m, k) distance matrix
    dists = np.linalg.norm(samples[:, None, :] - centroids[None, :, :], axis=2)
    # nearest centroid per row; ties resolve to the lowest index,
    # matching the per-row argmin of the helper this inlines
    return np.argmin(dists, axis=1)
def new_centroids(data, C):
    """Mean position of every cluster given the current assignment C.

    Args:
        data: DataFrame (m, n)
        C: array-like (m,) of cluster labels
    Returns:
        ndarray (k, n), rows ordered by cluster index
    """
    labeled = data.copy()
    labeled['C'] = C
    means = labeled.groupby('C', as_index=False).mean()
    means = means.sort_values(by='C')
    return means.drop('C', axis=1).values
# Draw three random initial centroids, then plot them next to a probe point x.
init_centroids = random_init(data2, 3)  # three centroids
init_centroids
# REPL output (random sample -- values differ between runs):
>>>
array([[2.28664839, 5.0076699 ],
[5.50295759, 2.62924634],
[4.20584789, 2.81647368]])
x = np.array([1, 1])  # probe point used to illustrate cluster assignment
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(x=init_centroids[:, 0], y=init_centroids[:, 1])
for i, node in enumerate(init_centroids):  # label every centroid with its index and coordinates
    ax.annotate('{}: ({},{})'.format(i, node[0], node[1]), node)
ax.scatter(x[0], x[1], marker='x', s=200)
plt.show()
# First k-means step: assign each sample to its nearest initial centroid.
C = assign_cluster(data2, init_centroids)  # nearest-centroid index for every sample
data_with_c =combine_data_C(data2, C)
data_with_c.head()
# REPL output:
>>>
X1 X2 C
0 1.842080 4.607572 0
1 5.658583 4.799964 1
2 6.352579 3.290854 1
3 2.904017 4.612204 0
4 3.231979 4.939894 0
sns.lmplot(x='X1',y= 'X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()  # the first assignment will not look great yet
new = new_centroids(data2, C)  # recompute centroids from the first assignment
C = assign_cluster(data2,new)  # reassign samples to the moved centroids
data_with_c =combine_data_C(data2, C)
sns.lmplot(x='X1',y= 'X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()  # after this second iteration the clustering already looks good
def cost(data, centroids, C):
    """Mean Euclidean distance from each sample to its assigned centroid.

    Args:
        data: DataFrame (m, n)
        centroids: ndarray (k, n)
        C: ndarray (m,) of cluster labels
    Returns:
        float -- the k-means distortion objective
    """
    n_samples = data.shape[0]
    # centroids[C] expands the label vector into one centroid row per sample
    assigned = centroids[C]
    dists = np.linalg.norm(data.values - assigned, axis=1)
    return dists.sum() / n_samples
def _k_means_iter(data, k, epoch=100, tol=0.0001):
    """One run of k-means from a single random initialisation.

    Iterates assign/update at most *epoch* times and stops early once the
    relative cost improvement drops below *tol*.

    Returns:
        (C, centroids, final_cost) -- last labels, centroid positions, cost
    """
    centroids = random_init(data, k)
    history = []
    for i in range(epoch):
        print('running epoch {}'.format(i))  # progress of centroid updates
        C = assign_cluster(data, centroids)
        centroids = new_centroids(data, C)
        history.append(cost(data, centroids, C))
        # early break: stop when the cost barely moved between iterations
        if len(history) > 1 and np.abs(history[-1] - history[-2]) / history[-1] < tol:
            break
    return C, centroids, history[-1]
def k_means(data, k, epoch=100, n_init=10):
    """Run k-means from *n_init* random initialisations; keep the cheapest run.

    Args:
        data (pd.DataFrame)
        k (int): number of clusters
        epoch (int): max iterations per run
        n_init (int): number of random restarts
    Returns:
        (C, centroids, least_cost) of the best run
    """
    # FIX: the original wrapped the runs in np.array(), producing a ragged
    # object array (a hard error on NumPy >= 1.24) just to argmin on the
    # cost column; min() with a key selects the same best run directly.
    tries = [_k_means_iter(data, k, epoch) for _ in range(n_init)]
    # each entry is (C, centroids, cost) -- pick the run with the lowest cost
    return min(tries, key=lambda result: result[-1])
# Full k-means with restarts, then the scikit-learn version for comparison.
best_C, best_centroids, least_cost = k_means(data2, 3)  # best of 10 random restarts
print(least_cost)
data_with_c = combine_data_C(data2, best_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
from sklearn.cluster import KMeans  # the library implementation is far more convenient
sk_kmeans = KMeans(n_clusters=3)
sk_kmeans.fit(data2)
sk_C = sk_kmeans.predict(data2)
data_with_c = combine_data_C(data2, sk_C)
sns.lmplot(x='X1', y='X2', hue='C', data=data_with_c, fit_reg=False)
plt.show()
Image compression with K-Means
# NOTE(review): cv2 (OpenCV) is used here but never imported -- an
# `import cv2` is needed before this section runs. cv2 also loads
# channels in BGR order (see the remark at the final plot).
pic = cv2.imread('data/bird_small.png') / 255  # scale pixel values to [0, 1]
# pic.shape (128, 128, 3)
data = pic.reshape(128*128,3)  # flatten: one RGB row per pixel
C, centroids, cost = k_means(pd.DataFrame(data), 16, epoch = 10, n_init=3)
# 两种方法比较一下用时。。第一个明显长
from sklearn.cluster import KMeans
model = KMeans(n_clusters=16, n_init=100, n_jobs=-1) # 16个重心,从100次中找到最好的
model.fit(data)
centroids = model.cluster_centers_ # n_jobs 就是使用的线程数,-1就用所有的
print(centroids.shape)
C = model.predict(data)
print(C.shape)
compressed_pic = centroids[C].reshape((128,128,3))
fig, ax = plt.subplots(1, 2)
ax[0].imshow(pic)
ax[1].imshow(compressed_pic)
plt.show() # 我用 cv 读的数据,,通道颠倒了。。不要在意这些细节
import sys
sys.getsizeof(compressed_pic),sys.getsizeof(pic)
>>>(128,393344) # 看看我们压缩的效果,很明显的。
PCA — Principal Component Analysis
# --- Setup: load the 2-D PCA toy data set (ex7data1) and visualise it ---
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")
import numpy as np
import pandas as pd
import scipy.io as sio

mat = sio.loadmat('./data/ex7data1.mat')
X = mat.get('X')
# print(X.shape) (50, 2) -- two features
# FIX: seaborn >= 0.12 removed positional x/y arguments for lmplot;
# pass them as keywords.
sns.lmplot(x='X1', y='X2',
           data=pd.DataFrame(X, columns=['X1', 'X2']),
           fit_reg=False)
plt.show()
# Helper functions for plotting and PCA
def plot_n_image(X, n):
    """Show the first n rows of X as a square grid of square images.

    n must be a perfect square, and each row of X a flattened square image
    (so X.shape[1] must be a perfect square as well).
    """
    pic_size = int(np.sqrt(X.shape[1]))
    grid_size = int(np.sqrt(n))
    images = X[:n, :]
    fig, axes = plt.subplots(nrows=grid_size, ncols=grid_size,
                             sharey=True, sharex=True, figsize=(8, 8))
    for row in range(grid_size):
        for col in range(grid_size):
            img = images[grid_size * row + col].reshape((pic_size, pic_size))
            axes[row, col].imshow(img)
            # hide the tick marks on the shared axes
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
# PCA functions ---------------------------------------
def covariance_matrix(X):
    """Covariance matrix (X^T X) / m of the (assumed centred) data X.

    Args:
        X (ndarray) (m, n)
    Return:
        cov_mat (ndarray) (n, n)
    """
    n_samples = X.shape[0]
    return X.T.dot(X) / n_samples
def normalize(X):
    """Column-wise z-score: (X - mean) / std for every feature.

    Args:
        X (ndarray) (m, n)
    Returns:
        float ndarray of the same shape; the input array is not modified.
    """
    # FIX: the original wrote the scaled values back into the copy column by
    # column, which silently truncates to integers when X has an int dtype;
    # casting to float and vectorising avoids that and drops the Python loop.
    X = np.asarray(X, dtype=float)
    # ndarray.std defaults to ddof=0, matching the original per-column loop
    return (X - X.mean(axis=0)) / X.std(axis=0)
def pca(X):
    """PCA via SVD of the covariance matrix of the normalised data.

    See: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
    Args:
        X ndarray(m, n)
    Returns:
        U ndarray(n, n): principal components (plus the S, V factors)
    """
    # 1) z-score the features, 2) form the (n, n) covariance matrix,
    # 3) factor it; U holds the principal components as columns
    Sigma = covariance_matrix(normalize(X))
    U, S, V = np.linalg.svd(Sigma)
    return U, S, V
def project_data(X, U, k):
    """Project X onto the first k principal components.

    Args:
        X (ndarray) (m, n)
        U (ndarray) (n, n): principal components
        k (int): target dimensionality, must satisfy k <= n
    Return:
        (m, k) ndarray -- X expressed in the reduced basis
    Raises:
        ValueError: if k exceeds the feature dimension n
    """
    n_features = X.shape[1]
    if k > n_features:
        raise ValueError('k should be lower dimension of n')
    # keep the first k component columns and project
    return X.dot(U[:, :k])
def recover_data(Z, U):
    """Map k-dimensional projections Z back into the original n-dim space.

    Args:
        Z (ndarray) (m, k): projected data
        U (ndarray) (n, n): principal components used for the projection
    Returns:
        (m, n) ndarray -- the (lossy) reconstruction of X
    Raises:
        ValueError: if Z is not lower-dimensional than U's space
    """
    n_components = Z.shape[1]
    if n_components >= U.shape[0]:
        raise ValueError('Z dimension is >= U, you should recover from lower dimension to higher')
    return Z.dot(U[:, :n_components].T)
# Run PCA on the toy data and look at the 1-D projection.
X_norm = normalize(X)  # z-score each feature; X is (50, 2)
U, S, V = pca(X_norm)
Z = project_data(X_norm, U, 1)  # reduce to one dimension
Z[:10]
# REPL output:
>>>
array([[ 1.49631261],
[-0.92218067],
[ 1.22439232],
[ 1.64386173],
[ 1.2732206 ],
[-0.97681976],
[ 1.26881187],
[-2.34148278],
[-0.02999141],
[-0.78171789]])
X_recovered = recover_data(Z, U)  # map back to 2-D and plot the reconstruction
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(list(X_recovered[:, 0]), list(X_recovered[:, 1]))
plt.show()
A neat visualisation — the recovered points all lie along the first principal axis.
Applying PCA to image (face) data
mat = sio.loadmat('./data/ex7faces.mat')  # face image data set
mat.keys()
X = np.array([x.reshape((32, 32)).T.reshape(1024) for x in mat.get('X')])
X.shape  # the transpose turns each 32x32 face upright before re-flattening
# REPL output:
>>>(5000, 1024)
plot_n_image(X, n=64)
plt.show()
U, _, _ = pca(X)
Z = project_data(X, U, k=100)  # keep only the first 100 components
plot_n_image(Z, n=64)
plt.show()
# in principle the faces should no longer be recognisable in the projected space
# Same reduction with scikit-learn, then reconstruct and plot the faces.
from sklearn.decomposition import PCA  # the library implementation once more
sk_pca = PCA(n_components=100)
Z = sk_pca.fit_transform(X)
print(Z.shape)
X_recover = sk_pca.inverse_transform(Z)
X_recover.shape
plot_n_image(X_recover, n=64)
plt.show()