1.10 PCA (主成分分析)
以鸢尾花 iris 数据为例，展示 PCA 的使用。
手动实现 PCA
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Load the iris dataset into a DataFrame and append the class label column.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df = df.assign(label=iris.target)
# Use short column names for the four features and the label.
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
# Number of samples per class (recorded notebook output follows).
df['label'].value_counts()
'''
2 50
1 50
0 50
Name: label, dtype: int64
'''
# Inspect the last rows of the table.
df.tail()
# Split into the feature matrix (first four columns) and the label column.
x = df.iloc[:, :4]
y = df.iloc[:, 4]
print("查看第一个数据:\n", x.iloc[0, :4])
print("查看第一个标签:\n", y.iloc[0])
'''
查看第一个数据:
sepal length 5.1
sepal width 3.5
petal length 1.4
petal width 0.2
Name: 0, dtype: float64
查看第一个标签:
0
'''
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix."""

    def __init__(self):
        pass

    def fit(self, X, n_components):
        """Project X onto its leading principal components.

        Args:
            X: Samples in rows, features in columns. A NumPy array or a
                pandas DataFrame is used as-is; any other array-like
                (e.g. a list of lists) is converted with ``np.asarray``.
            n_components: Number of leading components to keep.

        Returns:
            The mean-centered data projected onto the ``n_components``
            eigenvectors with the largest eigenvalues, one row per sample
            and one column per component. The container type is whatever
            ``X.dot`` returns (ndarray for arrays, DataFrame for DataFrames).
        """
        if not hasattr(X, "dot"):
            X = np.asarray(X, dtype=float)

        n_sample = np.shape(X)[0]
        # Center once and reuse: both the covariance and the projection must
        # be expressed relative to the per-feature mean.
        X_centered = X - X.mean(axis=0)
        covariance_matrix = (1 / (n_sample - 1)) * X_centered.T.dot(X_centered)
        # Decompose a plain float ndarray even when X is a DataFrame.
        covariance_matrix = np.asarray(covariance_matrix, dtype=float)

        # The covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues and orthonormal eigenvectors, whereas the general eig
        # routine may return complex values due to round-off.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

        # Order components by decreasing eigenvalue and keep the leading ones.
        idx = eigenvalues.argsort()[::-1]
        eigenvectors = eigenvectors[:, idx][:, :n_components]

        # Low-dimensional representation of the centered data.
        X_transformed = X_centered.dot(eigenvectors)
        return X_transformed
# Reduce the four iris features to two components with the hand-written PCA.
model = PCA()
Y = model.fit(x, 2)

# Wrap the projection in a DataFrame and append the label column so that
# points can be selected per class.
component_names = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(np.array(Y), columns=component_names)
Df = pd.concat([principalDf, y], axis=1)

# Single square axes for the 2-D scatter plot.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# Class ids 0, 1, 2 stand for Iris-setosa, Iris-versicolor, Iris-virginica.
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows of the current class.
    indicesToKeep = Df['label'] == target
    selected = Df.loc[indicesToKeep, component_names]
    ax.scatter(
        selected['principal component 1'],
        selected['principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()
使用 sklearn 包实现 PCA
from sklearn.decomposition import PCA as sklearnPCA

# Reduce the four iris features to two components with scikit-learn's PCA.
sklearn_pca = sklearnPCA(n_components=2)
Y = sklearn_pca.fit_transform(x)

# Wrap the projection in a DataFrame and append the label column so that
# points can be selected per class.
component_names = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(data=np.array(Y), columns=component_names)
Df = pd.concat([principalDf, y], axis=1)

# Single square axes for the 2-D scatter plot.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# Class ids 0, 1, 2 stand for Iris-setosa, Iris-versicolor, Iris-virginica.
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows of the current class.
    indicesToKeep = Df['label'] == target
    selected = Df.loc[indicesToKeep, component_names]
    ax.scatter(
        selected['principal component 1'],
        selected['principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()