目录
1.1.1 Finding closest centroids
1.1.2 Computing centroid means
1.2 K-means on example dataset
1.4 Image compression with K-means
1.4.2 Use scikit-learn to realize K-means algorithm
2. Principal Component Analysis
2.3 Dimensionality Reduction with PCA
2.3.1 Projecting the data onto the principal components
2.3.2 Reconstructing an approximation of the data
2.3.3 Visualizing the projections
1. K-means Clustering
内容:我们先将K-means算法运用到二维数据集上,之后我们再将它运用到图片压缩上。
1.1 Implementing K-means
K-means:
1. Iteration
step1:将样本分配给离它最近的聚类中心(cluster assignment step);
step2:将聚类中心移动到它所拥有的点的平均位置。(move centroid)。
2. Initialization
原因:可能会进入局部最优。
方法:多次重复"随机初始化聚类中心 → 运行K-means → 计算代价函数"这一过程,最后选取代价函数值最小的那一次聚类结果。
1.1.1 Finding closest centroids
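内容:将每个样本分配给离它最近的聚类中心,即设定:$c^{(i)} := \arg\min_{j} \lVert x^{(i)} - \mu_j \rVert^2$,其中 $c^{(i)}$ 是离样本 $x^{(i)}$ 最近的聚类中心的下标,$\mu_j$ 是第 $j$ 个聚类中心。
输出一个一维向量idx,其第i个元素是第i个样本所分配到的最近聚类中心的下标。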
findClosestCentroids.py
import numpy as np
def findClosestCentroids(X, centroids):
    """Assign every sample to its nearest centroid.

    Args:
        X: array of shape (m, n), one sample per row.
        centroids: array of shape (K, n), one centroid per row.

    Returns:
        A float array of length m. Entry i is the index (0..K-1) of the
        centroid with the smallest squared Euclidean distance to X[i].
        Ties go to the lowest index.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    m = X.shape[0]
    idx = np.zeros(m)
    if m == 0 or centroids.shape[0] == 0:
        return idx

    # Pairwise squared distances, shape (m, K).
    diff = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    sq_dist = np.sum(diff ** 2, axis=2)

    # A NaN distance must never win the comparison; treat it as infinitely far.
    sq_dist = np.where(np.isnan(sq_dist), np.inf, sq_dist)

    # argmin has no fixed upper bound on the distance, unlike a numeric sentinel.
    idx[:] = np.argmin(sq_dist, axis=1)
    return idx
main.py
from scipy.io import loadmat
import numpy as np
from findClosestCentroids import * # 找到最近的聚类中心
# Load the dataset and take the sample matrix stored under key 'X'.
# NOTE(review): the later K-means snippets load 'ex7data2.mat'; confirm that
# 'ex7data1.mat' is the intended file here.
raw_data = loadmat('ex7data1.mat')
X = raw_data['X']
# Three hand-picked starting centroids, one per row.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
# Assign every sample to its nearest starting centroid.
idx = findClosestCentroids(X, initial_centroids)
# Show the first eight samples and the centroid index assigned to each.
print(X[:8])
print(idx[:8])
[[3.38156267 3.38911268]
[4.52787538 5.8541781 ]
[2.65568187 4.41199472]
[2.76523467 3.71541365]
[2.84656011 4.17550645]
[3.89067196 6.48838087]
[3.47580524 3.63284876]
[5.91129845 6.68076853]]
[0. 0. 0. 0. 0. 0. 0. 2.]
main.py
from scipy.io import loadmat
import pandas as pd
# Load the example set and read the sample matrix stored under key 'X'.
raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
# Wrap the two columns in a DataFrame so they can be addressed as X1 / X2.
data = pd.DataFrame(X, columns=['X1', 'X2'])
# Print the first rows as a quick sanity check.
print(data.head())
X1 X2
0 1.842080 4.607572
1 5.658583 4.799964
2 6.352579 3.290854
3 2.904017 4.612204
4 3.231979 4.939894
数据可视化
plot.py
def plotData(ax, X):
    """Scatter column 'X1' of X against column 'X2' on ax and label both axes."""
    horizontal, vertical = 'X1', 'X2'
    ax.scatter(X[horizontal], X[vertical])
    ax.set_xlabel(horizontal)
    ax.set_ylabel(vertical)
main.py
from scipy.io import loadmat
import pandas as pd
import matplotlib.pyplot as plt
from plot import * # 绘图
raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
# plotData looks the columns up by name ('X1', 'X2'), hence the DataFrame.
data = pd.DataFrame(X, columns=['X1', 'X2'])
fig, ax = plt.subplots(figsize=(8, 6))
plotData(ax, data)
plt.show()
1.1.2 Computing centroid means
内容:计算聚类中心。聚类中心重新赋值为分配给该聚类中心的所有点的平均值,即:$\mu_k := \frac{1}{|C_k|} \sum_{i \in C_k} x^{(i)}$,其中 $C_k$ 是分配给第 $k$ 个聚类中心的样本集合。
np.where和索引用法例子
test.py
import numpy as np
idx = np.array([0, 0, 1, 2, 1, 2, 2, 0, 1])
for cluster in (0, 1, 2):
    # np.where on a boolean mask returns a 1-tuple holding the matching positions.
    print(np.where(idx == cluster))
(array([0, 1, 7], dtype=int64),)
(array([2, 4, 8], dtype=int64),)
(array([3, 5, 6], dtype=int64),)
computeCentroids.py
import numpy as np
def computeCentroids(X, idx, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        X: array of shape (m, n), one sample per row.
        idx: length-m array of cluster indices (float or int), one per sample.
        k: number of clusters.

    Returns:
        A float array of shape (k, n). Row i is the mean of the rows of X
        whose index equals i. A cluster with no members is re-seeded on a
        randomly chosen sample, so it never becomes a NaN row.
    """
    X = np.asarray(X)
    idx = np.asarray(idx)
    m, n = X.shape
    centroids = np.zeros((k, n))
    for i in range(k):
        members = X[idx == i]
        if len(members) > 0:
            # axis=0 averages over samples, giving one value per feature.
            centroids[i, :] = members.mean(axis=0)
        elif m > 0:
            # Empty cluster: dividing by zero members would give NaN, and a NaN
            # centroid can never attract samples again. Re-seed it instead.
            centroids[i, :] = X[np.random.randint(0, m), :]
    return centroids
main.py
from scipy.io import loadmat
import numpy as np
from findClosestCentroids import * # 找到最近的聚类中心
from computeCentroids import * # 计算聚类中心
raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
# One K-means step by hand: assign the samples, then recompute the three centroids.
idx = findClosestCentroids(X, initial_centroids)
re_centroids = computeCentroids(X, idx, 3)
print(re_centroids)
[[2.42830111 3.15792418]
[5.81350331 2.63365645]
[7.11938687 3.6166844 ]]
1.2 K-means on example dataset
内容:实现K-means算法,将前面两个步骤放入循环中。
runKMeans.py
import numpy as np
from findClosestCentroids import * # 找到最近的聚类中心
from computeCentroids import * # 计算聚类中心
def runKMeans(X, initial_centroids, max_iters):
    """Run K-means from the given starting centroids.

    Each iteration assigns every sample to its nearest centroid and then moves
    each centroid to the mean of its samples. The loop stops early once an
    update leaves the centroids unchanged.

    Args:
        X: array of shape (m, n), one sample per row.
        initial_centroids: array of shape (K, n) with the starting centroids.
        max_iters: upper bound on the number of iterations.

    Returns:
        (idx, centroids): the assignment computed in the last iteration that
        ran, and the centroids after that iteration's update. With
        max_iters == 0 this is an all-zero idx and the initial centroids.
    """
    k = initial_centroids.shape[0]
    idx = np.zeros(X.shape[0])
    centroids = initial_centroids
    for _ in range(max_iters):
        idx = findClosestCentroids(X, centroids)
        updated = computeCentroids(X, idx, k)

        # A row that is not finite cannot attract samples any more;
        # keep that centroid where it was instead of losing the cluster.
        broken = ~np.isfinite(updated).all(axis=1)
        if broken.any():
            updated[broken] = centroids[broken]

        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            # Further iterations would reproduce the same assignment and means.
            break
    return idx, centroids
plot.py
import numpy as np
def plotData(ax, X):
    """Scatter column 'X1' of X against column 'X2' on ax and label both axes."""
    horizontal, vertical = 'X1', 'X2'
    ax.scatter(X[horizontal], X[vertical])
    ax.set_xlabel(horizontal)
    ax.set_ylabel(vertical)
def plotClusteringData(ax, X, idx, centroids):
    """Scatter every cluster and its centroid on ax, one colour per cluster.

    Args:
        ax: axes-like object providing scatter() and legend().
        X: array of samples, one per row; columns 0 and 1 are plotted.
        idx: per-sample cluster index, as returned by the assignment step.
        centroids: array with one centroid per row; its row count sets the
            number of clusters drawn.
    """
    # The first three entries keep the original r/g/b colouring. The list is
    # cycled so that more than three clusters no longer raise IndexError.
    palette = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    k = centroids.shape[0]
    for i in range(k):
        shade = palette[i % len(palette)]
        cluster_i = X[np.where(idx == i)[0], :]
        ax.scatter(cluster_i[:, 0], cluster_i[:, 1], s=10, c=shade,
                   label='Cluster{}'.format(i + 1))
        ax.scatter(centroids[i, 0], centroids[i, 1], s=60, marker='x', c=shade,
                   label='cluster centroid{}'.format(i + 1))
    ax.legend()
main.py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
from runKMeans import * # 运行K-means算法
from plot import * # 绘制聚类好的数据
raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
# Fixed starting centroids and iteration budget for the example run.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
max_iters = 10
idx, centroids = runKMeans(X, initial_centroids, max_iters)
# Draw the samples coloured by cluster, together with the final centroids.
fig, ax = plt.subplots(figsize=(8, 6))
plotClusteringData(ax, X, idx, centroids)
plt.show()
1.3 Random initialization
内容:初始化聚类中心,可以将数据打乱,再选择K个点作为聚类中心。
kMeansInitCentroids.py
import numpy as np
def kMeansInitCentroids(X, K):
    """Pick K rows of X at random to serve as the initial centroids.

    Args:
        X: array of shape (m, n), one sample per row.
        K: number of centroids to pick.

    Returns:
        A float array of shape (K, n) whose rows are copies of rows of X.
        The rows come from K distinct positions of X whenever K <= m.
    """
    X = np.asarray(X)
    m = X.shape[0]
    # Sampling without replacement keeps two centroids from starting on the
    # same row. Replacement is only allowed when more centroids than samples
    # are requested, so that case still works.
    chosen = np.random.choice(m, size=K, replace=K > m)
    return X[chosen, :].astype(float)
main.py(将迭代次数设置为1以便看到初始化聚类中心的位置)
from scipy.io import loadmat
import matplotlib.pyplot as plt
from kMeansInitCentroids import * # 初始化聚类中心
from runKMeans import * # 运行K-means算法
from plot import * # 绘制聚类好的数据
raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
K = 3
# Start from K randomly chosen samples.
initial_centroids = kMeansInitCentroids(X, K)
# NOTE(review): runKMeans returns the centroids after its update step, so with
# max_iters = 1 the plotted centroids have already moved once.
max_iters = 1
idx, centroids = runKMeans(X, initial_centroids, max_iters)
fig, ax = plt.subplots(figsize=(8, 6))
plotClusteringData(ax, X, idx, centroids)
plt.show()
1.4 Image compression with K-means
内容:原图像由24bit的颜色表示,每个像素由3个8bit的无符号整数(0-255)分别表示红、绿、蓝的强度值。这张图像包含上千种颜色,我们对其使用K-means算法,最后只用16种颜色来呈现图像。
1.4.1 K-means on pixels
原图展现
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# 1. mpimg.imread() reads image data from an image file.
# 2. imshow(X) draws an image; X is the input data (an array or similar).
data_img = mpimg.imread('bird_small.png')
draw_img = plt.imshow(data_img)
plt.show()
图像数据
main.py
from scipy.io import loadmat
raw_data = loadmat('bird_small.mat')
# The image array is stored under key 'A'.
image_data = raw_data['A']
# Print the raw values and the array shape.
print(image_data)
print(image_data.shape)
[[[219 180 103]
[230 185 116]
[226 186 110]
...
[ 14 15 13]
[ 13 15 12]
[ 12 14 12]][[230 193 119]
[224 192 120]
[226 192 124]
...
[ 16 16 13]
[ 14 15 10]
[ 11 14 9]][[228 191 123]
[228 191 121]
[220 185 118]
...
[ 14 16 13]
[ 13 13 11]
[ 11 15 10]]...
[[ 15 18 16]
[ 18 21 18]
[ 18 19 16]
...
[ 81 45 45]
[ 70 43 35]
[ 72 51 43]][[ 16 17 17]
[ 17 18 19]
[ 20 19 20]
...
[ 80 38 40]
[ 68 39 40]
[ 59 43 42]][[ 15 19 19]
[ 20 20 18]
[ 18 19 17]
...
[ 65 43 39]
[ 58 37 38]
[ 52 39 34]]]
(128, 128, 3)
原始数据的处理
from scipy.io import loadmat
import numpy as np
raw_data = loadmat('bird_small.mat')
image_data = raw_data['A']  # pixel values are RGB triples
# Scale the 8-bit intensities (0-255) into [0, 1]. Dividing by 100 left
# values above 1, which imshow clips when it draws float RGB data.
image_data = image_data / 255
# Flatten to one row per pixel and one column per colour channel.
X = np.reshape(image_data, (image_data.shape[0] * image_data.shape[1], image_data.shape[2]))
print(X.shape)  # (16384, 3)
main.py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from kMeansInitCentroids import * # 初始化聚类中心
from runKMeans import * # 运行K-means算法
raw_data = loadmat('bird_small.mat')
image_data = raw_data['A']  # pixel values are RGB triples
# Scale the 8-bit intensities (0-255) into [0, 1]. Dividing by 100 left
# values above 1, which imshow clips when it draws float RGB data.
image_data = image_data / 255
# Flatten to one row per pixel and one column per colour channel.
X = np.reshape(image_data, (image_data.shape[0] * image_data.shape[1], image_data.shape[2]))
K = 16
max_iters = 10
# 1. randomly initialize centroids
initialize_centroids = kMeansInitCentroids(X, K)
# 2. run K-means algorithm
idx, centroids = runKMeans(X, initialize_centroids, max_iters)
# 3. get the closest centroids one last time, against the final centroids
idx = findClosestCentroids(X, centroids)
# 4. map each pixel to the centroid value
# idx holds floats, so astype(int) converts it before it is used as an index
X_pixel = centroids[idx.astype(int), :]
X_pixel = np.reshape(X_pixel, (image_data.shape[0], image_data.shape[1], image_data.shape[2]))
# print(X_pixel.shape) #(128, 128, 3)
plt.imshow(X_pixel)
plt.show()
1.4.2 Use scikit-learn to realize K-means algorithm
A. 显示图像
main.py
import matplotlib.pyplot as plt
from skimage.io import imread, imshow
img_data = imread('bird_small.png')  # 1. read the image
imshow(img_data)  # 2. draw the image
plt.show()  # 3. display the figure
B. 进行压缩
main.py
import matplotlib.pyplot as plt
from skimage.io import imread, imshow
from sklearn.cluster import KMeans # sklearn机器学习库
img_data = imread('bird_small.png') / 255  # 1. read the image and divide every value by 255
# Flatten to one row per pixel.
data = img_data.reshape(img_data.shape[0] * img_data.shape[1], img_data.shape[2])
# print(data.shape) # (16384, 3)
# Main KMeans parameters:
# n_clusters: the value of K.
# n_init: number of runs with different centroid initialisations.
model = KMeans(n_clusters=16, n_init=100)  # create the estimator
model.fit(data)  # fit the model to the pixel data
centroids = model.cluster_centers_  # fitted cluster centres
idx = model.predict(data)  # cluster index of every pixel, e.g. [1 1 1 ... 8 8 8]
# Replace each pixel with its cluster centre and restore the image shape.
compressed_data = centroids[idx, :].reshape(img_data.shape)
# print(compressed_data.shape) # (128, 128, 3)
# Original in the first panel, compressed version in the second.
fig, ax = plt.subplots(1, 2)
ax[0].imshow(img_data)  # draw the original image
ax[1].imshow(compressed_data)
plt.show()
2. Principal Component Analysis
内容:先在2维数据中使用PCA,再在人脸图像数据中使用PCA进行降维;降维时尽可能保留最大的方差(即保留尽可能多的数据信息)。
2.1 Example Dataset
内容:数据可视化
main.py
from scipy.io import loadmat
import matplotlib.pyplot as plt
X = loadmat('ex7data1.mat')['X']
fig, ax = plt.subplots(figsize=(8, 6))
# Scatter the first column against the second.
ax.scatter(X[:, 0], X[:, 1])
plt.show()
2.2 Implementing PCA
Step1: 计算协方差矩阵(先进行特征标准化);
Step2: 计算协方差矩阵的特征向量和特征值以识别主成分(使用SVD奇异值分解)。
协方差矩阵:$\Sigma = \frac{1}{m} X^{T} X$,其中 $X$ 的每一行是一个(已标准化的)样本,$m$ 为样本数。
PCA.py
import numpy as np
def PCA(X):
    """Run PCA on X via SVD of the covariance of the standardised data.

    Every feature (column) is standardised with its own mean and standard
    deviation before the covariance is formed. A single global mean and
    deviation would leave the columns uncentred relative to each other.

    Args:
        X: array of shape (m, n), one sample per row.

    Returns:
        (U, S, V) exactly as returned by np.linalg.svd for the n x n
        covariance matrix. The covariance is built as an np.matrix, as before,
        so code that multiplies by U with `*` keeps working.
    """
    X = np.asarray(X, dtype=float)
    m = X.shape[0]
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    # A constant feature has zero deviation; divide by 1 to avoid NaN.
    sigma[sigma == 0] = 1.0
    X_norm = np.matrix((X - mu) / sigma)
    Sigma = (X_norm.T * X_norm) / m  # covariance matrix
    U, S, V = np.linalg.svd(Sigma)
    return U, S, V
main.py
from scipy.io import loadmat
from PCA import * # 运行PCA算法
X = loadmat('ex7data1.mat')['X']
U, S, V = PCA(X)  # U holds the principal components
print(U, S, V)
[[-0.79241747 -0.60997914]
[-0.60997914 0.79241747]] [1.43584536 0.56415464] [[-0.79241747 -0.60997914]
[-0.60997914 0.79241747]]
2.3 Dimensionality Reduction with PCA
内容:使用U矩阵(主成分),将原始数据投影到低维的空间里。
2.3.1 Projecting the data onto the principal components
内容:投影数据。
projectData.py
def projectData(X, U, K):
    """Project the rows of X onto the first K columns of U.

    Args:
        X: array of shape (m, n), one sample per row.
        U: (n, n) array or np.matrix whose columns are the components.
        K: number of leading columns of U to keep.

    Returns:
        The (m, K) matrix product of X and U[:, :K].
    """
    # Imported locally so the function does not depend on a module-level import.
    import numpy as np

    U_reduce = U[:, :K]
    # np.dot is a matrix product for plain arrays and for np.matrix alike,
    # whereas `*` multiplies plain arrays element-wise.
    return np.dot(X, U_reduce)
main.py
from scipy.io import loadmat
from PCA import * # 运行PCA算法
from projectData import * # 投影到低维空间
X = loadmat('ex7data1.mat')['X']
U, S, V = PCA(X)  # U holds the principal components
# NOTE(review): PCA standardises its own copy of X, but the raw X is what gets
# projected here — confirm that this is intended.
Z = projectData(X, U, 1)
print(Z)
[[-4.74689738]
[-7.15889408]
[-4.79563345]
[-4.45754509]
[-4.80263579]
[-7.04081342]
[-4.97025076]
[-8.75934561]
[-6.2232703 ]
[-7.04497331]
[-6.91702866]
[-6.79543508]
[-6.3438312 ]
[-6.99891495]
[-4.54558119]
[-8.31574426]
[-7.16920841]
[-5.08083842]
[-8.54077427]
[-6.94102769]
[-8.5978815 ]
[-5.76620067]
[-8.2020797 ]
[-6.23890078]
[-4.37943868]
[-5.56947441]
[-7.53865023]
[-7.70645413]
[-5.17158343]
[-6.19268884]
[-6.24385246]
[-8.02715303]
[-4.81235176]
[-7.07993347]
[-5.45953289]
[-7.60014707]
[-4.39612191]
[-7.82288033]
[-3.40498213]
[-6.54290343]
[-7.17879573]
[-5.22572421]
[-4.83081168]
[-7.23907851]
[-4.36164051]
[-6.44590096]
[-2.69118076]
[-4.61386195]
[-5.88236227]
[-7.76732508]]
2.3.2 Reconstructing an approximation of the data
内容:由压缩后的低维数据重建原始数据的近似值。
restore.py
def Restore(Z, U, K):
    """Map projected data back to the original feature space.

    Args:
        Z: (m, K) projected data.
        U: (n, n) array or np.matrix whose columns are the components.
        K: number of leading columns of U that were used for the projection.

    Returns:
        The (m, n) matrix product of Z and the transpose of U[:, :K].
    """
    # Imported locally so the function does not depend on a module-level import.
    import numpy as np

    U_reduce = U[:, :K]
    # np.dot is a matrix product for plain arrays and for np.matrix alike,
    # whereas `*` multiplies plain arrays element-wise.
    return np.dot(Z, U_reduce.T)
main.py
from scipy.io import loadmat
from PCA import * # 运行PCA算法
from projectData import * # 投影到低维空间
from restore import * # 压缩重现
X = loadmat('ex7data1.mat')['X']
U, S, V = PCA(X)  # U holds the principal components
# NOTE(review): PCA standardises its own copy of X, but the raw X is what gets
# projected here — confirm that this is intended.
Z = projectData(X, U, 1)
# Map the 1-D projection back into the original two dimensions.
X_recovered = Restore(Z, U, 1)
print(X_recovered)
[[3.76152442 2.89550838]
[5.67283275 4.36677606]
[3.80014373 2.92523637]
[3.53223661 2.71900952]
[3.80569251 2.92950765]
[5.57926356 4.29474931]
[3.93851354 3.03174929]
[6.94105849 5.3430181 ]
[4.93142811 3.79606507]
[5.58255993 4.29728676]
[5.48117436 4.21924319]
[5.38482148 4.14507365]
[5.02696267 3.8696047 ]
[5.54606249 4.26919213]
[3.60199795 2.77270971]
[6.58954104 5.07243054]
[5.681006 4.37306758]
[4.02614513 3.09920545]
[6.76785875 5.20969415]
[5.50019161 4.2338821 ]
[6.81311151 5.24452836]
[4.56923815 3.51726213]
[6.49947125 5.00309752]
[4.94381398 3.80559934]
[3.47034372 2.67136624]
[4.41334883 3.39726321]
[5.97375815 4.59841938]
[6.10672889 4.70077626]
[4.09805306 3.15455801]
[4.90719483 3.77741101]
[4.94773778 3.80861976]
[6.36085631 4.8963959 ]
[3.81339161 2.93543419]
[5.61026298 4.31861173]
[4.32622924 3.33020118]
[6.02248932 4.63593118]
[3.48356381 2.68154267]
[6.19898705 4.77179382]
[2.69816733 2.07696807]
[5.18471099 3.99103461]
[5.68860316 4.37891565]
[4.14095516 3.18758276]
[3.82801958 2.94669436]
[5.73637229 4.41568689]
[3.45624014 2.66050973]
[5.10784454 3.93186513]
[2.13253865 1.64156413]
[3.65610482 2.81435955]
[4.66128664 3.58811828]
[6.1549641 4.73790627]]
2.3.3 Visualizing the projections
main.py(绘图)
from scipy.io import loadmat
import matplotlib.pyplot as plt
from PCA import * # 运行PCA算法
from projectData import * # 投影到低维空间
from restore import * # 压缩重现
X = loadmat('ex7data1.mat')['X']
U, S, V = PCA(X)  # U holds the principal components
Z = projectData(X, U, 1)
X_recovered = Restore(Z, U, 1)
fig, ax = plt.subplots(figsize=(8, 6))
# Original samples in blue, their reconstructions in red.
ax.scatter(X[:, 0], X[:, 1], c='b', label='original data')
# Each column of X_recovered is passed through list() before plotting.
ax.scatter(list(X_recovered[:, 0]), list(X_recovered[:, 1]), c='r', label='projected data')
ax.legend()
plt.show()
2.4 Face Image Dataset
内容:在图像上运用PCA进行降维。
数据:X中每一行代表一个人脸图像,每一行向量中的长度为1024。
main.py
from scipy.io import loadmat
# Load the face matrix stored under key 'X'.
raw_data = loadmat('ex7faces.mat')['X']
print(raw_data.shape)  # (5000, 1024)
显示图像
plotImage.py
import matplotlib.pyplot as plt
from skimage.io import imshow
import numpy as np
def plotNumImage(X, N):
    """Draw the first N rows of X as square images on a square grid.

    Args:
        X: array with one flattened square image per row.
        N: number of leading rows to draw; capped at the number of rows in X.
    """
    n_images = min(N, X.shape[0])
    if n_images <= 0:
        return
    pic_size = int(np.sqrt(X.shape[1]))  # side length of one image
    # Size the grid from the number of images drawn, not from the row count
    # of X. Otherwise the grid has more cells than images and indexing fails.
    grid_size = int(np.ceil(np.sqrt(n_images)))
    # squeeze=False keeps axs two-dimensional even for a 1 x 1 grid.
    fig, axs = plt.subplots(nrows=grid_size, ncols=grid_size, sharey=True, sharex=True,
                            figsize=(10, 10), squeeze=False)
    for pos, ax in enumerate(axs.ravel()):
        if pos < n_images:
            ax.imshow(X[pos, :].reshape(pic_size, pic_size))
        else:
            # Cells beyond the last image stay blank.
            ax.axis('off')
        # Clear the ticks on every panel, not only on the current axes.
        ax.set_xticks([])
        ax.set_yticks([])
在这里我们先只显示一个图像
main.py
from scipy.io import loadmat
from skimage.io import imshow
import matplotlib.pyplot as plt
import numpy as np
raw_data = loadmat('ex7faces.mat')['X']
# Side length of the square image encoded by one row.
n = int(np.sqrt(raw_data.shape[1]))
# Reshape the row at index 1 into an n x n image.
face = np.reshape(raw_data[1, :], (n, n))
imshow(face)
plt.show()
2.4.1 PCA on Faces
内容:使用2.3节的步骤,进行数据降维。
main.py
from scipy.io import loadmat
from skimage.io import imshow
import matplotlib.pyplot as plt
import numpy as np
from PCA import * # 运行PCA
from projectData import * # 进行投影
from restore import * # 压缩重现
X = loadmat('ex7faces.mat')['X']
# Side length of the square image encoded by one row.
n = int(np.sqrt(X.shape[1]))
U, S, V = PCA(X)
# NOTE(review): PCA standardises its own copy of X, but the raw X is what gets
# projected here — confirm that this is intended.
Z = projectData(X, U, 100)  # reduce to 100 dimensions
# Reconstruct from the compressed representation.
X_recovered = Restore(Z, U, 100)
# print(X_recovered.shape) # (5000, 1024)
# Reshape the reconstructed row at index 1 back into an n x n image.
face_recovered = np.array(np.reshape(X_recovered[1, :], (n, n)))
imshow(face_recovered)
plt.show()