Implementing PCA
前言
PCA,即主成分分析,是流行的降维算法(无监督学习算法),其主要应用有
- 减小数据集的特征维度,从而减小内存或者磁盘储存的消耗
- 提升算法效率,加快算法的运行
- 将数据集的特征维度减小到3维及以下,方便可视化
代码分析
首先导入类库
import numpy as np
import matplotlib.pyplot as plt
import scipy.io #Used to load the OCTAVE *.mat files
from random import sample #Used for random initialization
import scipy.misc #Used to show matrix as an image
import matplotlib.cm as cm #Used to display images in a specific colormap
from scipy import linalg #Used for the "SVD" function
import imageio
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
导入数据并可视化
datafile = 'data/ex7data1.mat'
mat = scipy.io.loadmat( datafile )  # load the OCTAVE/MATLAB .mat file as a dict
X = mat['X']  # data matrix, one example per row (2 features per row here)
#Quick plot of the raw dataset
plt.figure(figsize=(7,5))
plot = plt.scatter(X[:,0], X[:,1], s=30, facecolors='none', edgecolors='b')
plt.title("Example Dataset",fontsize=18)
plt.grid(True)
下面开始编写PCA算法
数据集标准化函数
def featureNormalize(myX):
    """Standardize each feature (column) to zero mean and unit variance.

    Parameters
    ----------
    myX : ndarray of shape (m, n)
        Data matrix with one example per row.

    Returns
    -------
    means : ndarray of shape (n,)
        Per-column means of the original data.
    stds : ndarray of shape (n,)
        Per-column standard deviations (as computed; may contain 0 for
        constant columns — see note below).
    myX_norm : ndarray of shape (m, n)
        Standardized data, (myX - means) / stds.
    """
    means = np.mean(myX, axis=0)        # mean of each column
    myX_norm = myX - means
    stds = np.std(myX_norm, axis=0)     # standard deviation of each column
    # Guard against division by zero for constant features: a zero std
    # would turn the whole column into NaN. Dividing by 1 instead leaves
    # an already-zero-mean constant column at exactly 0. The returned
    # `stds` is unchanged so callers can still detect degenerate columns.
    safe_stds = np.where(stds == 0, 1.0, stds)
    myX_norm = myX_norm / safe_stds
    return means, stds, myX_norm
奇异值分解(SVD)函数,得到矩阵U,S,V
# Singular value decomposition of the data's covariance matrix.
def getUSV(myX_norm):
    """Run SVD on the covariance matrix of feature-normalized data.

    Parameters
    ----------
    myX_norm : ndarray of shape (m, n)
        Standardized data matrix (zero-mean columns assumed — TODO confirm
        with `featureNormalize` caller).

    Returns
    -------
    U, S, V : the SVD of Sigma = (X^T X) / m. The columns of U are the
        principal component directions; S holds the singular values.
    """
    m = myX_norm.shape[0]
    # Covariance matrix Sigma = (1/m) * X^T X of the normalized data.
    sigma = myX_norm.T.dot(myX_norm) / m
    # Singular value decomposition yields the principal component matrix U.
    return scipy.linalg.svd(sigma, full_matrices=True, compute_uv=True)
好了,算法编写完毕
调用算法,得到参数
# Feature normalize: standardize each column to zero mean / unit variance
means, stds, X_norm = featureNormalize(X)
# Run SVD on the covariance of the normalized data;
# the columns of U are the principal component directions
U, S, V = getUSV(X_norm)
将主成分(principal component)可视化
print('Top principal component is ',U[:,0])
# Quick plot again, this time to overlay the principal component directions
plt.figure(figsize=(7,5))
plot = plt.scatter(X[:,0], X[:,1], s=30, facecolors='none', edgecolors='b')
plt.title