2 Principal Component Analysis
PCA extracts the principal components of the data from its covariance matrix (for example, the components retaining 90% of the variance). It is commonly used for data compression and for data visualization, since reducing the dimensionality makes the data easier to plot.
2.1 Importing Modules and Data
This part demonstrates how to use PCA by compressing two-dimensional data down to one dimension.
Import modules and data:
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
from mpl_toolkits.mplot3d import Axes3D
from skimage import io
from skimage import img_as_float
import featureNormalize as fn  # feature normalization function
import runkMeans as rk  # K-means algorithm functions
import displayData as dp  # plotting function; see Part 1 of EX7
import importlib
importlib.reload(rk)  # reload the module so edits to the called functions take effect
plt.ion()
Load the data:
# ===================== Part 1: Load Example Dataset =====================
print('Visualizing example dataset for PCA.')
# The following command loads the dataset.
data = scio.loadmat('ex7data1.mat')
X = data['X']
Visualizing example dataset for PCA.
Check the shape of the data: 50 two-dimensional examples.
X.shape
(50, 2)
Plot the data as a scatter plot:
# Visualize the example dataset
plt.figure()
plt.scatter(X[:, 0], X[:, 1], facecolors='none', edgecolors='b', s=20)
plt.axis('equal')
plt.axis([0.5, 6.5, 2, 8])
[0.5, 6.5, 2, 8]
2.2 Normalization Function (featureNormalize.py)
Normalize the data (zero mean, unit variance per feature) so it is easier to fit.
import numpy as np

def feature_normalize(X):
    mu = np.mean(X, 0)  # per-feature mean
    sigma = np.std(X, 0, ddof=1)  # per-feature standard deviation
    X_norm = (X - mu) / sigma  # normalize
    return X_norm, mu, sigma
Call the function to normalize the data:
# ===================== Part 2: Principal Component Analysis =====================
print('Running PCA on example dataset.')
# Before running PCA, it is important to first normalize X
X_norm, mu, sigma = fn.feature_normalize(X)
Running PCA on example dataset.
Normalized data:
X_norm[:5]
array([[-0.5180535 , -1.57678415],
[ 0.45915361, 0.83189934],
[-1.13685138, -0.57729787],
[-1.04345995, -1.25794647],
[-0.97413176, -0.80837709]])
Original (pre-normalization) data:
X[:5]
array([[3.38156267, 3.38911268],
[4.52787538, 5.8541781 ],
[2.65568187, 4.41199472],
[2.76523467, 3.71541365],
[2.84656011, 4.17550645]])
Means:
mu
array([3.98926528, 5.00280585])
Standard deviations:
sigma
array([1.17304991, 1.02340778])
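As a quick sanity check (not part of the original exercise), the normalization is invertible: scaling by sigma and shifting by mu recovers the original data up to floating-point error. A minimal sketch:
# Invert the normalization; should match X to floating-point precision.
assert np.allclose(X_norm * sigma + mu, X)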
2.3 Principal Component Analysis Function (pca.py)
PCA is implemented with scipy.linalg.svd, where U is the matrix of principal components and S holds the singular values (the diagonal of the S matrix).
The covariance matrix is computed as $\Sigma = \frac{1}{m} X^T X$.
import numpy as np
import scipy.linalg

def pca(X):
    # Useful values
    (m, n) = X.shape
    # You need to return the following variables correctly.
    U = np.zeros(n)
    S = np.zeros(n)
    # ===================== Your Code Here =====================
    sigma = np.dot(X.T, X) / m  # compute the covariance matrix
    U, S, _ = scipy.linalg.svd(sigma, full_matrices=True, compute_uv=True)
    # ==========================================================
    return U, S
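Since the covariance matrix is symmetric positive semi-definite, the SVD above coincides with its eigendecomposition, so an equivalent implementation can use np.linalg.eigh. A minimal alternative sketch (not part of the exercise; the columns of U may differ in sign):
def pca_eig(X):
    m = X.shape[0]
    sigma = np.dot(X.T, X) / m  # covariance matrix
    # eigh returns eigenvalues in ascending order; reverse to descending.
    eigvals, eigvecs = np.linalg.eigh(sigma)
    order = np.argsort(eigvals)[::-1]
    return eigvecs[:, order], eigvals[order]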
Call the function to run PCA:
# Run PCA
U, S = pca(X_norm)
Principal component matrix:
U
array([[-0.70710678, -0.70710678],
[-0.70710678, 0.70710678]])
Singular values (the diagonal of the S matrix):
S
array([1.70081977, 0.25918023])
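The singular values also tell how much variance each component carries: because the covariance matrix is symmetric positive semi-definite, its singular values equal its eigenvalues, so the fraction of variance retained by the top K components is sum(S[:K]) / sum(S). A minimal helper sketch (not part of the original exercise; reused in section 2.7 below):
def retained_variance(S, K):
    # Fraction of the total variance captured by the top K components.
    return np.sum(S[:K]) / np.sum(S)

print(retained_variance(S, 1))  # about 0.868 for this 2-D dataset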
2.4 The runkMeans.py Functions
import numpy as np
import matplotlib.pyplot as plt
import findClosestCentroids as fc
import computeCentroids as cc

def run_kmeans(X, initial_centroids, max_iters, plot):
    if plot:
        plt.figure()
    # Initialize values
    (m, n) = X.shape
    K = initial_centroids.shape[0]  # number of cluster centroids
    centroids = initial_centroids
    previous_centroids = centroids
    idx = np.zeros(m)
    # Run K-Means
    for i in range(max_iters):
        # Output progress
        print('K-Means iteration {}/{}'.format((i + 1), max_iters))
        # For each example in X, assign it to the closest centroid
        idx = fc.find_closest_centroids(X, centroids)
        # Optionally plot progress
        if plot:
            # Plot the training examples and how the centroids have moved
            plot_progress(X, centroids, previous_centroids, idx, K, i)
            previous_centroids = centroids  # keep the previous positions so the movement can be drawn as line segments
            input('Press ENTER to continue')
        # Given the memberships, compute new centroids
        centroids = cc.compute_centroids(X, idx, K)
    return centroids, idx

def plot_progress(X, centroids, previous, idx, K, i):
    # Plot the training examples, colored by the index of the assigned centroid
    plt.scatter(X[:, 0], X[:, 1], c=idx, s=15)
    # Plot the centroids
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='black', s=25)
    for j in range(centroids.shape[0]):
        # Draw a line segment showing how each centroid has moved
        draw_line(centroids[j], previous[j])
    plt.title('Iteration number {}'.format(i + 1))

def draw_line(p1, p2):
    plt.plot(np.array([p1[0], p2[0]]), np.array([p1[1], p2[1]]), c='black', linewidth=1)
2.5 Plotting the Principal Components
For each component, take two points (the mean and an offset along the component) so a line segment can be drawn:
mu, mu + 1.5 * S[0] * U[:, 0]
(array([3.98926528, 5.00280585]), array([2.18527349, 3.19881406]))
mu, mu + 1.5 * S[1] * U[:, 1]
(array([3.98926528, 5.00280585]), array([3.71436313, 5.277708 ]))
Draw the figure:
plt.figure()
plt.scatter(X[:, 0], X[:, 1], facecolors='none', edgecolors='b', s=20)
plt.axis('equal')
plt.axis([0.5, 6.5, 2, 8])
rk.draw_line(mu, mu + 1.5 * S[0] * U[:, 0])
rk.draw_line(mu, mu + 1.5 * S[1] * U[:, 1])
Print the first column of U (the top principal component):
print('Top eigenvector: \nU[:, 0] = {}'.format(U[:, 0]))
print('You should expect to see [-0.707107 -0.707107]')
Top eigenvector:
U[:, 0] = [-0.70710678 -0.70710678]
You should expect to see [-0.707107 -0.707107]
2.6 Data Compression by Projecting the Data
2.6.1 Plotting the Original Data
# ===================== Part 3: Dimension Reduction =====================
print('Dimension reduction on example dataset.')
# Plot the normalized dataset (returned from pca)
plt.figure()
plt.scatter(X_norm[:, 0], X_norm[:, 1], facecolors='none', edgecolors='b', s=20)
plt.axis('equal')
plt.axis([-4, 3, -4, 3])
Dimension reduction on example dataset.
[-4, 3, -4, 3]
2.6.2 Projection Function (projectData.py)
Project the data onto the top K columns of the principal component matrix U.
def project_data(X, U, K):
    # You need to return the following variables correctly.
    Z = np.zeros((X.shape[0], K))
    # ===================== Your Code Here =====================
    UK = U[:, 0:K]
    Z = np.dot(X, UK)
    # ==========================================================
    return Z
Call the function to project the data:
# Project the data onto K = 1 dimension
K = 1
Z = project_data(X_norm, U, K)
print('Projection of the first example: {}'.format(Z[0]))
print('(this value should be about 1.481274)')
print('Z.shape:\n', Z.shape)
Projection of the first example: [1.48127391]
(this value should be about 1.481274)
Z.shape:
(50, 1)
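For a single example the projection is just a dot product with the top eigenvector; a one-line check (not part of the exercise):
# Z[0] is the inner product of the first example with the top component.
print(np.dot(X_norm[0], U[:, 0]))  # about 1.481274, matching Z[0]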
2.6.3 Recovery Function (recoverData.py)
The data is recovered (approximately) as $X_{\mathrm{rec}} = Z\, U_{\mathrm{reduce}}^T$.
def recover_data(Z, U, K):
    # You need to return the following variables correctly.
    X_rec = np.zeros((Z.shape[0], U.shape[0]))
    # ===================== Your Code Here =====================
    Ureduce = U[:, 0:K]
    X_rec = np.dot(Z, Ureduce.T)
    # ==========================================================
    return X_rec
Call the function to recover the data:
X_rec = recover_data(Z, U, K)
print('Approximation of the first example: {}'.format(X_rec[0]))
print('(this value should be about [-1.047419 -1.047419])')
Approximation of the first example: [-1.04741883 -1.04741883]
(this value should be about [-1.047419 -1.047419])
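As an aside (not part of the exercise), the quality of this K = 1 compression can be measured by the squared reconstruction error relative to the total variation of the normalized data; it equals one minus the retained variance fraction. A minimal sketch:
# Relative squared reconstruction error; equals 1 - retained_variance(S, K).
err = np.sum((X_norm - X_rec) ** 2) / np.sum(X_norm ** 2)
print(err)  # about 0.132 for K = 1 on this dataset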
Plot the normalized data, the recovered data projected back into the original space (keeping only the U1 direction), and the projection lines:
# Draw lines connecting the projected points to the original points
plt.figure(figsize=(8, 8))
plt.scatter(X_norm[:, 0], X_norm[:, 1], facecolors='none', edgecolors='b', s=20)
plt.scatter(X_rec[:, 0], X_rec[:, 1], facecolors='none', edgecolors='r', s=20)
for i in range(X_norm.shape[0]):
rk.draw_line(X_norm[i], X_rec[i])
2.7 Applying PCA to Face Image Compression
Load the data:
# ===================== Part 4: Loading and Visualizing Face Data =====================
print('Loading face dataset.')
# Load Face dataset
data = scio.loadmat('ex7faces.mat')
X = data['X']
plt.figure(figsize=(12, 12))
dp.display_data(X[0:100])  # plot the images; see Part 1 of EX7 for details
Loading face dataset.
<Figure size 864x864 with 0 Axes>
The dataset contains 5000 grayscale face images, each 32 × 32 pixels:
X.shape
(5000, 1024)
Use plt.imshow to display a single face image:
plt.imshow(X[1].reshape(32, 32).T, cmap='gray')
<matplotlib.image.AxesImage at 0x8be0b38>
2.7.1 Normalization, PCA, and Visualizing the Principal Components
# ===================== Part 5: PCA on Face Data: Eigenfaces =====================
print('Running PCA on face dataset.\n(this might take a minute or two ...)')
# Before running PCA, it is important to first normalize X by subtracting the mean value from each feature
X_norm, mu, sigma = fn.feature_normalize(X)
# Run PCA
U, S = pca(X_norm)
# Visualize the top 36 eigenvectors found
dp.display_data(U[:, 0:36].T)
Running PCA on face dataset.
(this might take a minute or two ...)
2.7.2 Projecting the Data
Keep the top 100 of the 1024 principal components:
# ===================== Part 6: Dimension Reduction for Faces =====================
# Project images to the eigen space using the top k eigenvectors
# If you are applying a machine learning algorithm
print('Dimension reduction for face dataset.')
K = 100
Z = project_data(X_norm, U, K)
print('The projected data Z has a shape of: {}'.format(Z.shape))
The data is now 5000 × 100:
Dimension reduction for face dataset.
The projected data Z has a shape of: (5000, 100)
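Each face is now described by 100 numbers instead of 1024, roughly a 10x compression. As a side check (not in the original exercise), the retained_variance helper sketched in section 2.3 reports how much of the faces' variance the top 100 components keep:
print('Variance retained with K = 100: {:.3f}'.format(retained_variance(S, 100)))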
2.7.3 Recovering the Data
# =========== Part 7: Visualization of Faces after PCA Dimension Reduction ===========
print('Visualizing the projected (reduced dimension) faces.')
X_rec = recover_data(Z, U, K)
Visualizing the projected (reduced dimension) faces.
The principal component matrix is 1024 × 1024:
U.shape
(1024, 1024)
The projected data is 5000 × 100:
Z.shape
(5000, 100)
The recovered data is 5000 × 1024:
X_rec.shape
(5000, 1024)
2.7.4 Plotting the Original (Normalized) Data and the Recovered Data
# Display normalized data
dp.display_data(X_norm[0:100])
plt.title('Original faces')
plt.axis('equal')
# Display reconstructed data from only k eigenfaces
dp.display_data(X_rec[0:100])
plt.title('Recovered faces')
plt.axis('equal')
(-1.0, 1.0, -1.0, 1.0)
2.8 PCA for Data Visualization
This example uses PCA to compress three-dimensional data down to two dimensions, making it much easier to visualize.
# ===================== Part 8(a): PCA for Visualization =====================
# For this to work, you need to complete the K-Means assignment first
image = io.imread('bird_small.png')  # read the image
image = img_as_float(image)  # convert to floating point
img_shape = image.shape
X = image.reshape((img_shape[0] * img_shape[1], 3))
The original image is 128 × 128 × 3:
img_shape
(128, 128, 3)
Shape after reshaping:
X.shape
(16384, 3)
2.8.1 Compressing the Data with K-Means
Import the modules and run the K-means algorithm:
import kMeansInitCentroids as kmic  # random initialization function; see Part 1 of EX7
import runkMeans as km  # K-means algorithm functions; see Part 1 of EX7
K = 16  # use 16 cluster centroids
max_iters = 10  # run 10 iterations
initial_centroids = kmic.kmeans_init_centroids(X, K)
centroids, idx = km.run_kmeans(X, initial_centroids, max_iters, False)
K-Means iteration 1/10
K-Means iteration 2/10
K-Means iteration 3/10
K-Means iteration 4/10
K-Means iteration 5/10
K-Means iteration 6/10
K-Means iteration 7/10
K-Means iteration 8/10
K-Means iteration 9/10
K-Means iteration 10/10
Select only 1000 examples for the demonstration:
# Sample 1000 random indices (since working with all the data is
# too expensive; if you have a fast computer, you may increase this).
selected = np.random.randint(X.shape[0], size=1000)
selected.shape
(1000,)
2.8.2 Plotting the 1000 Selected Points in 3D
# Visualize the data and centroid memberships in 3D
cm = plt.cm.get_cmap('RdYlBu')
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[selected, 0], X[selected, 1], X[selected, 2], c=idx[selected].astype(np.float64), s=15, cmap=cm, vmin=0, vmax=K)
plt.title('Pixel dataset plotted in 3D. Color shows centroid memberships')
Text(0.5,0.92,'Pixel dataset plotted in 3D. Color shows centroid memberships')
2.8.3 Normalization, PCA, and Plotting in 2D
# ===================== Part 8(b): PCA for Visualization =====================
# Use PCA to project this cloud to 2D for visualization
X_norm, mu, sigma = fn.feature_normalize(X)  # normalize
# PCA and project the data to 2D
U, S = pca(X_norm)  # run PCA
Z = project_data(X_norm, U, 2)  # project the data onto 2 dimensions
# Plot in 2D
plt.figure()
plt.scatter(Z[selected, 0], Z[selected, 1], c=idx[selected].astype(np.float64), s=15, cmap=cm)  # plot
plt.title('Pixel dataset plotted in 2D, using PCA for dimensionality reduction')
Text(0.5,1,'Pixel dataset plotted in 2D, using PCA for dimensionality reduction')
Z.shape
(16384, 2)