1. kmeans算法
"""
案例1: 给定一个二维数据集,使用kmeans进行聚类
数据集:data/ex7data2.mat
"""
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from skimage import io
def find_centroids(X, centros): # 获取每个样本所属的类别
idx = []
for i in range(len(X)):
# (2,)(k,2)->(k,2)
dist = np.linalg.norm((X[i] - centros), axis=1) # (k,) 每一个样本点到中心点的距离,axis=1按行向量处理.dist->(3,)
id_i = np.argmin(dist) # 保存样本点到三个中心点最小距离的索引值。--索引值共有0,1,2三个,即三类
idx.append(id_i)
return np.array(idx) # 每一个样本点对应一个类别(k),idx是按样本点顺序对应的分类标签数组
def compute_centros(X, idx, k): # 计算聚类中心点
centros = []
for i in range(k):
centros_i = np.mean(X[idx == i], axis=0) # 计算每类中心点坐标(x,y)--axis=0压缩行,对每列求平均
centros.append(centros_i)
return np.array(centros)
def run_kmeans(X, centros, iters):
k = len(centros)
centros_all = [centros] # 用来存储每一次迭代更新的聚类中心点坐标(X,y)
centros_i = centros
for i in range(iters):
idx = find_centroids(X, centros_i)
centros_i = compute_centros(X, idx, k)
centros_all.append(centros_i)
return idx, np.array(centros_all) # idx返回最后一次迭代更新的分类结果,centros_all聚类迭代过程中所有的中心点坐标
def plot_data(X, centros_all, idx): # 绘制数据集和聚类中心的移动轨迹
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=idx, cmap='rainbow')
plt.plot(centros_all[:, :, 0], centros_all[:, :, 1], 'kx--')
def init_centros(X,k): # 随机初始化k个聚类中心点
index = np.random.choice(len(X),k)
return X[index]
if __name__ == '__main__':
data1 = sio.loadmat('data/ex7data2.mat')
print(data1)
X = data1['X']
plt.scatter(X[:, 0], X[:, 1])
# 人为初始化三个聚类中心点
centros = np.array([[3, 3], [6, 2], [8, 5]]) # 初始化三个聚类中心点(在此认为选择了三个点),即分三类
idx = find_centroids(X, centros)
idx, centros_all = run_kmeans(X, centros, iters=10)
plot_data(X, centros_all, idx)
# 随机初始化三个聚类中心点
for i in range(4):
idx, centros_all = run_kmeans(X, init_centros(X, k=3), iters=10)
plot_data(X, centros_all, idx)
plt.show()
2. PCA二维数据的降维
"""
案例1:使用PCA进行二维数据的降维
数据集:data/ex7data1.mat
"""
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")
mat = sio.loadmat('./data/ex7data1.mat')
X = mat['X'] # X-->(50,2)
print(X.shape)
# 散点图
plt.figure()
plt.scatter(X[:,0],X[:,1])
# 对X去均值化
X_demean = X - np.mean(X, axis=0)
plt.figure()
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
plt.title('X去均值化的散点图')
plt.scatter(X_demean[:,0],X_demean[:,1])
C = X_demean.T @ X_demean / len(X) # 计算协方差矩阵n×n,n表示特征数量
print(C)
U, S, V = np.linalg.svd(C) # 计算特征值,特征向量。SVD奇异值分解
U_reduce = U[:,0] # 选取前K向量,获得n×k的矩阵。此处选U的第一列(2,)
z = X_demean @ U_reduce # 新特征向量z-->(50,1)
plt.figure(figsize=(5,5))
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
plt.title('数据降维')
plt.scatter(X_demean[:,0], X_demean[:,1])
plt.plot([0,U_reduce[0]], [0,U_reduce[1]], c='r')
plt.plot([0,U[:,1][0]], [0,U[:,1][1]], c='k')
# 还原数据---z为前面PCA压缩的数据,还原回原始的二维空间
X_approx = z.reshape(50,1) @ U_reduce.reshape(1,2) + np.mean(X,axis=0) # X_approx-->(50,2)近似等于原X
plt.figure()
plt.scatter(X[:,0], X[:,1])
plt.scatter(X_approx[:,0], X_approx[:,1])
3. PCA二维数据的降维-01
"""
案例1:使用PCA进行二维数据的降维
数据集:data/ex7data1.mat
"""
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")
# support functions ---------------------------------------
def plot_n_image(X, n):
""" plot first n images
n has to be a square number
"""
pic_size = int(np.sqrt(X.shape[1]))
grid_size = int(np.sqrt(n))
first_n_images = X[:n, :]
fig, ax_array = plt.subplots(nrows=grid_size, ncols=grid_size,
sharey=True, sharex=True, figsize=(8, 8))
for r in range(grid_size):
for c in range(grid_size):
ax_array[r, c].imshow(first_n_images[grid_size * r + c].reshape((pic_size, pic_size)))
plt.xticks(np.array([]))
plt.yticks(np.array([]))
# PCA functions ---------------------------------------
def covariance_matrix(X):
"""
Args:
X (ndarray) (m, n)
Return:
cov_mat (ndarray) (n, n):
covariance matrix of X
"""
m = X.shape[0]
return (X.T @ X) / m
def normalize(X):
"""
for each column, X-mean / std
"""
X_copy = X.copy()
m, n = X_copy.shape
for col in range(n):
X_copy[:, col] = (X_copy[:, col] - X_copy[:, col].mean()) / X_copy[:, col].std()
return X_copy
def pca(X):
"""
http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
Args:
X ndarray(m, n)
Returns:
U ndarray(n, n): principle components
"""
# 1. normalize data
X_norm = normalize(X)
# 2. calculate covariance matrix
Sigma = covariance_matrix(X_norm) # (n, n)
# 3. do singular value decomposition
# remeber, we feed cov matrix in SVD, since the cov matrix is symmetry
# left sigular vector and right singular vector is the same, which means
# U is V, so we could use either one to do dim reduction
U, S, V = np.linalg.svd(Sigma) # U: principle components (n, n)
return U, S, V
def project_data(X, U, k):
"""
Args:
U (ndarray) (n, n)
Return:
projected X (n dim) at k dim
"""
m, n = X.shape
if k > n:
raise ValueError('k should be lower dimension of n')
return X @ U[:, :k]
def recover_data(Z, U):
m, n = Z.shape
if n >= U.shape[0]:
raise ValueError('Z dimension is >= U, you should recover from lower dimension to higher')
return Z @ U[:, :n].T
if __name__ == '__main__':
mat = sio.loadmat('./data/ex7data1.mat')
X = mat.get('X')
X_norm = normalize(X)
print(X_norm.shape)
Sigma = covariance_matrix(X_norm)
print(Sigma)
U, S, V = pca(X)
Z = project_data(X_norm, U, 1) # project data to lower dimension
print(Z[:10])
print(Z)
X_recover = recover_data(Z, U)
# project data to lower dimension
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))
sns.regplot('X1', 'X2',
data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
fit_reg=False,
ax=ax1)
ax1.set_title('Original dimension')
sns.rugplot(Z, ax=ax2).set(xlim=(-3, 3))
ax2.set_xlabel('Z')
ax2.set_title('Z dimension')
fig, (ax3, ax4, ax5) = plt.subplots(ncols=3, figsize=(16, 4))
# recover data to original dimension
sns.rugplot(Z, ax=ax3).set(xlim=(-3, 3))
ax3.set_title('Z dimension')
ax3.set_xlabel('Z')
sns.regplot('X1', 'X2',
data=pd.DataFrame(X_recover, columns=['X1', 'X2']),
fit_reg=False,
ax=ax4)
ax4.set_title("2D projection from Z")
sns.regplot('X1', 'X2',
data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
fit_reg=False,
ax=ax5)
ax5.set_title('Original dimension')
plt.show()
4. PCA图像降维
"""
案例2:使用PCA进行图像的降维,k=36
数据集:data/ex7faces.mat
具体算法公式--见笔记第14讲数据降维与重建
"""
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
# support functions ---------------------------------------
def plot_n_image(X, n): # 图像可视化,显示n张图片
""" plot first n images
n has to be a square number
"""
pic_size = int(np.sqrt(X.shape[1]))
grid_size = int(np.sqrt(n))
first_n_images = X[:n, :]
fig, ax_array = plt.subplots(nrows=grid_size, ncols=grid_size,
sharey=True, sharex=True, figsize=(8, 8))
for r in range(grid_size):
for c in range(grid_size):
ax_array[r, c].imshow(first_n_images[grid_size * r + c].reshape((pic_size, pic_size)))
plt.xticks(np.array([]))
plt.yticks(np.array([]))
# PCA functions ---------------------------------------
def covariance_matrix(X):
"""
Args:
X (ndarray) (m, n)
Return:
cov_mat (ndarray) (n, n):
covariance matrix of X
"""
m = X.shape[0]
return (X.T @ X) / m
def normalize(X):
"""
for each column, X-mean / std
"""
X_copy = X.copy()
m, n = X_copy.shape
for col in range(n):
X_copy[:, col] = (X_copy[:, col] - X_copy[:, col].mean()) / X_copy[:, col].std()
return X_copy
def pca(X):
"""
http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
Args:
X ndarray(m, n)
Returns:
U ndarray(n, n): principle components
"""
# 1. normalize data
X_norm = normalize(X)
# 2. calculate covariance matrix
Sigma = covariance_matrix(X_norm) # (n, n)
# 3. do singular value decomposition
# remeber, we feed cov matrix in SVD, since the cov matrix is symmetry
# left sigular vector and right singular vector is the same, which means
# U is V, so we could use either one to do dim reduction
U, S, V = np.linalg.svd(Sigma) # U: principle components (n, n)
return U, S, V
def project_data(X, U, k):
"""
Args:
U (ndarray) (n, n)
Return:
projected X (n dim) at k dim
"""
m, n = X.shape
if k > n:
raise ValueError('k should be lower dimension of n')
return X @ U[:, :k]
def recover_data(Z, U):
m, n = Z.shape
if n >= U.shape[0]:
raise ValueError('Z dimension is >= U, you should recover from lower dimension to higher')
return Z @ U[:, :n].T
if __name__ == "__main__":
mat = sio.loadmat('./data/ex7faces.mat')
X = np.array([x.reshape((32, 32)).T.reshape(1024) for x in mat.get('X')]) # X-->(5000,1024)
plot_n_image(X, n=64)
U, _, _ = pca(X)
# didn't see face in principle components
plot_n_image(U, n=32)
# reduce dimension to k=100
Z = project_data(X, U, k=100)
plot_n_image(Z, n=64)
# recover from k=100
X_recover = recover_data(Z, U) # 选取的Z的前100列(维)数据,还原结果,和原始图像近似
plot_n_image(X_recover, n=64)
plt.show()
# -----------------------sklearn PCA-------------------------------------
from sklearn.decomposition import PCA
sk_pca = PCA(n_components=100)
Z = sk_pca.fit_transform(X)
plot_n_image(Z, 64)
plt.show()
X_recover = sk_pca.inverse_transform(Z)
plot_n_image(X_recover, n=64)
plt.show()
5. 图片颜色聚类
"""
案例2: 使用kmeans对图片颜色进行聚类
RGB图像,每个像素点值范围0-255
数据集:data/bird_small.mat,data/bird_small.png
"""
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from skimage import io
def find_centroids(X, centros):
idx = []
for i in range(len(X)):
# (2,)(k,2)->(k,2)
dist = np.linalg.norm((X[i] - centros), axis=1) # (k,)
id_i = np.argmin(dist)
idx.append(id_i)
return np.array(idx)
def compute_centros(X, idx, k):
centros = []
for i in range(k):
centros_i = np.mean(X[idx == i], axis=0)
centros.append(centros_i)
return np.array(centros)
def run_kmeans(X, centros, iters):
k = len(centros)
centros_all = []
centros_all.append(centros)
centros_i = centros
for i in range(iters):
idx = find_centroids(X, centros_i)
centros_i = compute_centros(X, idx, k)
centros_all.append(centros_i)
return idx, np.array(centros_all)
def init_centros(X, k):
index = np.random.choice(len(X), k)
return X[index]
if __name__ == '__main__':
data = sio.loadmat('data/bird_small.mat')
A = data['A']
print(A.shape)
image = io.imread('data/bird_small.png')
fig, ax = plt.subplots(figsize=(8, 6))
plt.imshow(image)
plt.axis('off') # 不显示坐标轴
A = A / 255
A = A.reshape(-1, 3) # 三维->二维
print(A.shape)
k = 16
idx, centros_all = run_kmeans(A, init_centros(A, k=16), iters=20)
centros = centros_all[-1] # 迭代最后一次的聚类中心点坐标
im = np.zeros(A.shape)
for i in range(k):
im[idx == i] = centros[i]
im = im.reshape(128, 128, 3)
# im = np.rint(im*255).astype('uint8') # 还原为0-255的rgb范围图像
print(im)
fig, ax = plt.subplots(figsize=(8, 6))
plt.imshow(im)
plt.show()