关注微信公共号:小程在线
关注CSDN博客:程志伟的博客
主要从PCA原理、PCA在手写、人脸实例中的运用
PCA使用的信息量衡量指标,就是样本方差,又称可解释性方差,方差越大,特征所带的信息量越多。
Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.
IPython 7.6.1 -- An enhanced Interactive Python.
1. 调用库和模块
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
2. 提取数据集
# Load the iris dataset: 150 samples x 4 features, 3 classes.
iris = load_iris()
y = iris.target  # class labels (values 0, 1, 2 — used for coloring below)
X = iris.data    # feature matrix
X.shape
Out[3]: (150, 4)
import pandas as pd
# View the feature matrix as a table (REPL echo follows in Out[4]).
pd.DataFrame(X)
Out[4]:
0 1 2 3
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
.. ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
[150 rows x 4 columns]
3. 建模
#调用PCA
# Instantiate a 2-component PCA, fit it on X, and project X onto the
# two new axes (4 features -> 2 features).
pca = PCA(n_components=2).fit(X)
X_dr = pca.transform(X)
X_dr
Out[5]:
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943],
......
[ 1.76434572, 0.07885885],
[ 1.90094161, 0.11662796],
[ 1.39018886, -0.28266094]])
#也可以fit_transform一步到位
#X_dr = PCA(2).fit_transform(X)
#要将三种鸢尾花的数据分布显示在二维平面坐标系中,对应的两个坐标(两个特征向量)应该是三种鸢尾花降维后的两个新特征向量
4. 可视化
plt.figure()
# One scatter per iris species; color and legend label match per class.
for target, color in zip(range(3), ("red", "black", "orange")):
    mask = y == target
    plt.scatter(X_dr[mask, 0], X_dr[mask, 1], c=color, label=iris.target_names[target])
plt.legend()
plt.title('PCA of IRIS dataset')
plt.show()

colors = ['red', 'black', 'orange']
iris.target_names
plt.figure()
# Same plot as above, written as a loop over the three classes.
# NOTE: the loop body below was pasted without indentation in the
# original transcript (invalid Python); indentation restored.
for i in [0, 1, 2]:
    plt.scatter(X_dr[y == i, 0],
                X_dr[y == i, 1],
                alpha=.7,          # slight transparency so overlaps stay visible
                c=colors[i],
                label=iris.target_names[i])
plt.legend()
plt.title('PCA of IRIS dataset')
plt.show()

5. 探索降维后的数据
属性explained_variance_,查看降维后每个新特征向量上所带的信息量大小(可解释性方差的大小)
pca.explained_variance_
Out[8]: array([4.22824171, 0.24267075])
#属性explained_variance_ratio,查看降维后每个新特征向量所占的信息量占原始数据总信息量的百分比
#又叫做可解释方差贡献率
pca.explained_variance_ratio_
Out[9]: array([0.92461872, 0.05306648])
#大部分信息都被有效地集中在了第一个特征上
pca.explained_variance_ratio_.sum()
Out[10]: 0.977685206318795
6. 选择最好的n_components:累积可解释方差贡献率曲线
import numpy as np
# Fit PCA keeping all components, then plot the cumulative
# explained-variance ratio against the number of components kept.
pca_line = PCA().fit(X)
component_counts = [1, 2, 3, 4]
plt.plot(component_counts, np.cumsum(pca_line.explained_variance_ratio_))
plt.xticks(component_counts)  # force integer ticks on the x axis
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance ratio")
plt.show()

2.2.2 最大似然估计自选超参数
pca_mle = PCA(n_components="mle")
pca_mle = pca_mle.fit(X)
X_mle = pca_mle.transform(X)
X_mle
Out[12]:
array([[-2.68412563, 0.31939725, -0.02791483],
[-2.71414169, -0.17700123, -0.21046427],
[-2.88899057, -0.14494943, 0.01790026],
......
[ 1.52716661, -0.37531698, -0.12189817],
[ 1.76434572, 0.07885885, 0.13048163],
[ 1.90094161, 0.11662796, 0.72325156],
[ 1.39018886, -0.28266094, 0.36290965]])
pca_mle.explained_variance_ratio_.sum()
Out[13]: 0.9947878161267247
# Keep just enough components to explain at least 97% of the variance;
# a float n_components requires svd_solver="full".
pca_f = PCA(n_components=0.97, svd_solver="full").fit(X)
X_f = pca_f.transform(X)
pca_f.explained_variance_ratio_
Out[14]: array([0.92461872, 0.05306648])
#得到了比设定2个特征时更高的信息含量,对于鸢尾花这个很小的数据集来说,3个特征对应这么高的信息含量,并不
需要去纠结于只保留2个特征,毕竟三个特征也可以可视化
2.2.3 按信息量占比选超参数
输入[0,1]之间的浮点数,并且让参数svd_solver="full",表示希望降维后的总解释性方差占比大于n_components
指定的百分比,即是说,希望保留百分之多少的信息量
# Retain components until >= 97% of total variance is explained
# (svd_solver="full" is required when n_components is a float).
pca_f = PCA(n_components=0.97, svd_solver="full").fit(X)
X_f = pca_f.transform(X)
pca_f.explained_variance_ratio_
Out[16]: array([0.92461872, 0.05306648])
2.3.3 重要属性components_
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
faces = fetch_lfw_people(min_faces_per_person=60)  # downloads LFW data on first use
faces.images.shape
Out[20]: (1348, 62, 47)
# 1348 is the number of images in the set (the original comment said 1277,
# which does not match the shape echoed above)
# 62 is the number of rows of each image matrix
# 47 is the number of columns of each image matrix
faces.data.shape
Out[21]: (1348, 2914)
X = faces.data  # flattened images: 1348 rows x 2914 (= 62*47) pixel columns
# Build an empty 4x5 grid of subplots with tick marks suppressed.
fig, axes = plt.subplots(4, 5, figsize=(8, 4),
                         subplot_kw={"xticks": [], "yticks": []})
fig
Out[23]:

axes
Out[24]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB098C88>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB1109B0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB142DA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB17F390>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB1AF940>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB1E2EF0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB21F4E0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB24FA58>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB291080>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB2BE630>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB2EFBE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB32F1D0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB35E780>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB390D30>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB3CF320>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB3FF8D0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB433E80>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB46F470>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB4A0A20>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000021ACB4D2FD0>]],
dtype=object)
# Fill the 4x5 grid with the first 20 face images.
# NOTE: the loop body was pasted without indentation in the original
# transcript (invalid Python); indentation restored.
fig, axes = plt.subplots(4, 5, figsize=(8, 4),
                         subplot_kw={"xticks": [], "yticks": []})  # hide tick marks
for i, ax in enumerate(axes.flat):
    ax.imshow(faces.images[i, :, :], cmap="gray")  # grayscale colormap
4. 建模降维,提取新特征空间矩阵
pca = PCA(150).fit(X)  # keep the first 150 principal components
V = pca.components_    # new feature space: each row is one principal axis in pixel space
V.shape
Out[32]: (150, 2914)
5. 将新特征空间矩阵可视化
# Visualize the first 24 principal axes ("eigenfaces"): each row of V
# reshaped back to the 62x47 image grid.
# NOTE: loop body indentation restored (lost in the pasted transcript).
fig, axes = plt.subplots(3, 8, figsize=(8, 4), subplot_kw={"xticks": [], "yticks": []})
for i, ax in enumerate(axes.flat):
    ax.imshow(V[i, :].reshape(62, 47), cmap="gray")
2.4 重要接口inverse_transform
2.4.1 迷你案例:用人脸识别看PCA降维后的信息保存量
1. 导入需要的库和模块(与2.3.3节中步骤一致)
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
2. 导入数据,探索数据
faces = fetch_lfw_people(min_faces_per_person=60)  # re-fetch: same dataset as section 2.3.3
faces.images.shape
Out[35]: (1348, 62, 47)
faces.data.shape
Out[36]: (1348, 2914)
X = faces.data  # flattened images, one row per face
3. 建模降维,获取降维后的特征矩阵X_dr
pca = PCA(150)
X_dr = pca.fit_transform(X)  # fit and project in one step: 2914 -> 150 dims
X_dr.shape
Out[38]: (1348, 150)
4. 将降维后矩阵用inverse_transform返回原空间
X_inverse = pca.inverse_transform(X_dr)  # map back into the 2914-dim pixel space
X_inverse.shape
Out[39]: (1348, 2914)
5. 将特征矩阵X和X_inverse可视化
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={"xticks": [], "yticks": []})
# ax is a 2x10 grid: row 0 shows the original images, row 1 the images
# recovered by inverse_transform.  We loop over the 10 columns and draw
# both rows each iteration, rather than flattening ax as in earlier plots.
# NOTE: loop body indentation restored (lost in the pasted transcript).
for i in range(10):
    ax[0, i].imshow(faces.images[i, :, :], cmap="binary_r")
    ax[1, i].imshow(X_inverse[i].reshape(62, 47), cmap="binary_r")
2.4.2 迷你案例:用PCA做噪音过滤
1. 导入所需要的库和模块
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
2. 导入数据,探索数据
digits = load_digits()  # 8x8 handwritten-digit images, flattened to 64 features
digits.data.shape
Out[2]: (1797, 64)
set(digits.target.tolist())  # the ten digit classes present in the target
Out[3]: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
3. 定义画图函数
def plot_digits(data):
    """Show the first 40 rows of *data* as 8x8 grayscale digit images.

    NOTE: the def body was pasted without indentation in the original
    transcript (invalid Python); indentation restored.
    """
    fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                             subplot_kw={"xticks": [], "yticks": []})
    for i, ax in enumerate(axes.flat):
        ax.imshow(data[i].reshape(8, 8), cmap="binary")


plot_digits(digits.data)

4. 为数据加上噪音
# Seed a dedicated generator so the added noise is reproducible.
# BUG FIX: the original called np.random.RandomState(42) and discarded
# the result, so np.random.normal below was actually unseeded.
rng = np.random.RandomState(42)
# Draw Gaussian noise centered on each pixel value with std-dev 2
# (arguments are: mean array, scale).
noisy = rng.normal(digits.data, 2)
plot_digits(noisy)

5. 降维
pca = PCA(0.5).fit(noisy)  # keep just enough components for 50% of the variance
X_dr = pca.transform(noisy)
X_dr.shape
Out[9]: (1797, 6)
6. 逆转降维结果,实现降噪
# Reconstruction uses only the retained components, discarding the
# low-variance directions where most of the noise lives.
without_noise = pca.inverse_transform(X_dr)
plot_digits(without_noise)

3 案例:PCA对手写数字数据集的降维
1. 导入需要的模块和库
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
2. 导入数据,探索数据
# Load the digit-recognizer CSV: column 0 is the label, columns 1..784
# are the flattened 28x28 pixel values.
data = pd.read_csv(r"H:\程志伟\digit recognizor.csv")
X, y = data.iloc[:, 1:], data.iloc[:, 0]
X.shape
Out[12]: (42000, 784)
3. 画累计方差贡献率曲线,找最佳降维后维度的范围
# Cumulative explained-variance curve over all 784 components, to bound
# the useful range of n_components.
pca_line = PCA().fit(X)
plt.figure(figsize=[20, 5])
cumulative = np.cumsum(pca_line.explained_variance_ratio_)
plt.plot(cumulative)
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance ratio")
plt.show()

4. 降维后维度的学习曲线,继续缩小最佳维度的范围
# Coarse learning curve: cross-validated random-forest accuracy for
# n_components in {1, 11, 21, ..., 91}.
# NOTE: loop body indentation restored (lost in the pasted transcript).
score = []
for i in range(1, 101, 10):
    X_dr = PCA(i).fit_transform(X)
    once = cross_val_score(RFC(n_estimators=10, random_state=0),
                           X_dr, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 101, 10), score)
plt.show()
5. 细化学习曲线,找出降维后的最佳维度
# Refined learning curve over the promising 10-24 component range.
# NOTE: loop body indentation restored (lost in the pasted transcript).
score = []
for i in range(10, 25):
    X_dr = PCA(i).fit_transform(X)
    once = cross_val_score(RFC(n_estimators=10, random_state=0), X_dr, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(10, 25), score)
plt.show()
6. 导入找出的最佳维度进行降维,查看模型效果
X_dr = PCA(23).fit_transform(X)  # best dimensionality found by the curves above
cross_val_score(RFC(n_estimators=10,random_state=0),X_dr,y,cv=5).mean()
Out[17]: 0.9178338210669867
cross_val_score(RFC(n_estimators=100,random_state=0),X_dr,y,cv=5).mean()
Out[18]: 0.9452863087790204
7. 特征数量已经不足原来的3%,换模型
from sklearn.neighbors import KNeighborsClassifier as KNN
cross_val_score(KNN(),X_dr,y,cv=5).mean()  # KNN with its default k
Out[19]: 0.9698566957488104
8. KNN的k值学习曲线
# k-value learning curve for KNN over k = 1..10.
# NOTE: loop body indentation restored (lost in the pasted transcript).
# The PCA projection does not depend on k, so it is computed once here
# instead of being re-fit on every iteration as in the original.
X_dr = PCA(23).fit_transform(X)
score = []
for i in range(10):
    once = cross_val_score(KNN(i + 1), X_dr, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(10), score)
plt.show()
9. 定下超参数后,模型效果
cross_val_score(KNN(5),X_dr,y,cv=5).mean()  # k=5, the best value from the curve above
Out[22]: 0.9698567467712594
可以发现,原本784列的特征被我们缩减到23列之后,用KNN跑出了目前为止这个数据集上最好的结果