Chapter 5 Feature Extraction
5.1 Unsupervised Feature Extraction
Refer to the relevant chapters of 《数据准备和特征工程》 (Data Preparation and Feature Engineering) and work through the code below.

Video course for this section: Unsupervised Feature Extraction
5.1.1 Principal Component Analysis
Basics
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
X[: 4]
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])
from sklearn.decomposition import PCA
import numpy as np
# PCA can be computed from the covariance matrix; by default scikit-learn uses SVD (singular value decomposition)
pca = PCA()  # Create a PCA (principal component analysis) model
X_pca = pca.fit_transform(X)  # Fit the model and transform X
np.round(X_pca[: 4], 2)  # Round to two decimal places
array([[-2.68,  0.32, -0.03, -0.  ],
       [-2.71, -0.18, -0.21, -0.1 ],
       [-2.89, -0.14,  0.02, -0.02],
       [-2.75, -0.32,  0.03,  0.08]])
# Explained variance ratio of each principal component; the higher the ratio, the more important the component
pca.explained_variance_ratio_
array([0.92461872, 0.05306648, 0.01710261, 0.00521218])
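As the comment above notes, PCA can be derived from the covariance matrix directly. A minimal sketch verifying this against scikit-learn's result, reusing X and np from the cells above:
# Eigendecompose the covariance matrix of the centered data
cov = np.cov(X - X.mean(axis=0), rowvar=False)
eigvals = np.linalg.eigvalsh(cov)[::-1]  # eigvalsh returns ascending order; reverse to descending
eigvals / eigvals.sum()  # matches pca.explained_variance_ratio_ above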
# Keep only two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_pca[: 4]
array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898]])
# Total explained variance ratio of the two retained components
pca.explained_variance_ratio_.sum()
0.977685206318795
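To pick the number of components for a given variance target programmatically, the cumulative ratio can be scanned directly. A small sketch, refitting a full PCA on X with an illustrative 0.95 target:
full = PCA().fit(X)
cum = np.cumsum(full.explained_variance_ratio_)
k = np.argmax(cum >= 0.95) + 1  # smallest k whose cumulative ratio reaches 0.95
k, cum[k - 1]  # (2, 0.977685...)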
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, iris.target,
                                                    test_size=0.3,
                                                    random_state=0)
# Create a decision tree classifier clf and use the trained model to generate predictions
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Score the predictions with accuracy_score
accuracy = accuracy_score(y_test, y_pred)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, iris.target,
    test_size=0.3,
    random_state=0)
clf2 = DecisionTreeClassifier()
clf2.fit(X_train_pca, y_train_pca)
y_pred_pca = clf2.predict(X_test_pca)
accuracy2 = accuracy_score(y_test_pca, y_pred_pca)
print("含4个特征的数据集的预测准确率: ", np.round(accuracy, 2))
print("含2个特征的数据集的预测准确率: ", np.round(accuracy2, 2))
含4个特征的数据集的预测准确率: 0.98
含2个特征的数据集的预测准确率: 0.91
Project Case
from scipy.io import loadmat
mnist = loadmat("data/data20537/mnist-original.mat")
mnist
{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Sun Mar 30 03:19:02 2014',
 '__version__': '1.0',
 '__globals__': [],
 'mldata_descr_ordering': array([[array(['label'], dtype='<U5'), array(['data'], dtype='<U4')]],
       dtype=object),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'label': array([[0., 0., 0., ..., 9., 9., 9.]])}
mnist_data = mnist["data"].T
mnist_label = mnist["label"][0]
mnist_data.shape
(70000, 784)
# Keep enough components to explain at least 95% of the variance
pca = PCA(0.95)
lower_dimensional_data = pca.fit_transform(mnist_data)
lower_dimensional_data.shape
(70000, 154)
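When n_components is a float between 0 and 1, PCA chooses the smallest number of components whose cumulative explained variance reaches that fraction; this can be confirmed on the fitted model:
pca.n_components_  # 154
pca.explained_variance_ratio_.sum()  # just above 0.95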
# Copy the font file into matplotlib's font directory
!cp simhei.ttf /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/
# Delete matplotlib's font cache directory
!rm -rf .cache/matplotlib
# Restart the kernel so matplotlib picks up the new font
%matplotlib inline
import matplotlib.pyplot as plt
# Configure matplotlib to display Chinese (CJK) characters
plt.rcParams['font.sans-serif'] = ['SimHei']  # Set the default font
plt.rcParams['axes.unicode_minus'] = False  # Prevent the minus sign '-' from rendering as a box
plt.figure(figsize=(8,4));
# Original image. plt.imshow renders a numeric array as an image;
# cmap=gray maps values from black to white; interpolation sets the resampling method.
plt.subplot(1, 2, 1);
plt.imshow(mnist_data[1].reshape(28, 28),
           cmap=plt.cm.gray,
           interpolation='nearest',
           clim=(0, 255));
plt.xlabel('784 components', fontsize=14)
plt.title('Original image', fontsize=20);
# Reconstruct the images from the 154 retained principal components
inverse_data = pca.inverse_transform(lower_dimensional_data)
plt.subplot(1, 2, 2);
plt.imshow(inverse_data[1].reshape(28, 28),
           cmap=plt.cm.gray,
           interpolation='nearest',
           clim=(0, 255));
plt.xlabel('154 components', fontsize=14)
plt.title('After feature extraction', fontsize=20)
Text(0.5,1,'After feature extraction')
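A rough measure of what the compression discards is the mean squared error between the original pixels and the PCA reconstruction; a minimal sketch using the arrays above:
mse = np.mean((mnist_data - inverse_data) ** 2)  # average squared pixel difference
mse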
import time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from scipy.io import loadmat
import pandas as pd
mnist = loadmat("data/data20537/mnist-original.mat")
mnist_data = mnist["data"].T
mnist_label = mnist["label"][0]
# Split into training and test sets, then standardize the features
# (the scaler is fit on the training set only)
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist_data,
    mnist_label,
    test_size=1/7.0,
    random_state=0)
scaler = StandardScaler()
scaler.fit(train_img)
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)
# logistic_reg: logistic regression; exp_var is the fraction of variance to retain
def logistic_reg(exp_var):
    pca = PCA(exp_var)
    pca.fit(train_img)
    # Fit a logistic regression classifier on the PCA-reduced training data
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(pca.transform(train_img), train_lbl)
    lbl_pred = lr.predict(pca.transform(test_img))
    acc = accuracy_score(test_lbl, lbl_pred)
    # Return the number of retained components and the accuracy
    return pca.n_components_, acc
v, n, a, t = [], [], [], []
for i in [None, 0.99, 0.95, 0.90, 0.85]:
    start = time.time()
    components, accuracy = logistic_reg(i)
    stop = time.time()
    deltat = stop - start
    v.append(i)
    n.append(components)
    a.append(accuracy)
    t.append(deltat)
df = pd.DataFrame({"Var_ratio": v,
                   "N_components": n,
                   "Accuracy": a,
                   "Times": t})
df
   Var_ratio  N_components  Accuracy      Times
0        NaN           784    0.9164  67.382504
1       0.99           541    0.9198  46.709717
2       0.95           330    0.9215  41.099790
3       0.90           236    0.9216  40.692168
4       0.85           184    0.9204  41.502007
Hands-on Exercise
import pandas as pd
# Practice loading data from a URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Load the dataset from the URL
df = pd.read_csv(url, names=['sepal length','sepal width','petal length','petal width','target'])
from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separate the feature set x and the label set y, then standardize x
x = df.loc[:, features].values
y = df.loc[:,['target']].values
x = StandardScaler().fit_transform(x)
from sklearn.decomposition import PCA
# Reduce x to its first 2 principal components, collected in principalDf
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['PC1', 'PC2'])
finalDf = pd.concat([principalDf, df[['target']]], axis=1)
# ax.scatter draws a scatter plot
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('PC1', fontsize=15)
ax.set_ylabel('PC2', fontsize=15)
ax.set_title('2-component PCA', fontsize=20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Boolean mask locating the rows of this class: indicesToKeep
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC1'],
               finalDf.loc[indicesToKeep, 'PC2'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()
pca.explained_variance_ratio_
array([0.72770452, 0.23030523])
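Note that these ratios differ from the unstandardized run at the start of this section, where the first component alone explained about 92% of the variance; standardizing gives every feature equal weight before extraction. A quick check on the raw features, reusing df and features from above:
raw_ratios = PCA(n_components=2).fit(df[features].values).explained_variance_ratio_
raw_ratios  # the first component dominates (about 0.92) without scaling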
5.1.2 Factor Analysis
Basics
from sklearn.decomposition import FactorAnalysis
# FactorAnalysis: factor analysis model
fa = FactorAnalysis()
iris_fa = fa.fit(iris.data)
# components_ is the fitted factor loading matrix: each row gives one latent factor's loadings on the features
fa.components_
array([[ 0.70698856, -0.15800499,  1.65423609,  0.70084996],
       [ 0.115161  ,  0.15963548, -0.04432109, -0.01403039],
       [-0.        ,  0.        ,  0.        ,  0.        ],
       [-0.        ,  0.        ,  0.        , -0.        ]])
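FactorAnalysis models each feature as a linear combination of the latent factors plus feature-specific Gaussian noise; the estimated per-feature noise variances are exposed on the fitted model:
fa.noise_variance_  # one noise-variance estimate per original feature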
# Set the number of factors to extract to 2
fa = FactorAnalysis(n_components=2)
iris_two = fa.fit_transform(iris.data)
iris_two[: 4]
array([[-1.32761727, -0.56131076],
       [-1.33763854, -0.00279765],
       [-1.40281483,  0.30634949],
       [-1.30104274,  0.71882683]])
%matplotlib inline
import matplotlib.pyplot as plt
# Scatter plot of the FA result
f = plt.figure(figsize=(5, 5))
ax = f.add_subplot(111)
ax.scatter(iris_two[:,0], iris_two[:, 1], c=iris.target)
ax.set_title("包含2个因子的FA")
Text(0.5,1,'包含2个因子的FA')
# Scatter plot of the PCA result, for comparison
f = plt.figure(figsize=(5, 5))
ax = f.add_subplot(111)
ax.scatter(X_pca[:,0], X_pca[:, 1], c=iris.target)
ax.set_title("包含2个主成分的PCA")
Text(0.5,1,'包含2个主成分的PCA')
Project Case
import pandas as pd
df = pd.read_csv("/home/aistudio/data/data20537/bfi.csv")
df.columns
Index(['Unnamed: 0', 'A1', 'A2', 'A3', 'A4', 'A5', 'C1', 'C2', 'C3', 'C4',
'C5', 'E1', 'E2', 'E3', 'E4', 'E5', 'N1', 'N2', 'N3', 'N4', 'N5', 'O1',
'O2', 'O3', 'O4', 'O5', 'gender', 'education', 'age'],
dtype='object')
# Drop columns not needed for the analysis
df.drop(['Unnamed: 0', 'gender', 'education', 'age'],axis=1,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      2784 non-null   float64
 1   A2      2773 non-null   float64
 2   A3      2774 non-null   float64
 3   A4      2781 non-null   float64
 4   A5      2784 non-null   float64
 5   C1      2779 non-null   float64
 6   C2      2776 non-null   float64
 7   C3      2780 non-null   float64
 8   C4      2774 non-null   float64
 9   C5      2784 non-null   float64
 10  E1      2777 non-null   float64
 11  E2      2784 non-null   float64
 12  E3      2775 non-null   float64
 13  E4      2791 non-null   float64
 14  E5      2779 non-null   float64
 15  N1      2778 non-null   float64
 16  N2      2779 non-null   float64
 17  N3      2789 non-null   float64
 18  N4      2764 non-null   float64
 19  N5      2771 non-null   float64
 20  O1      2778 non-null   float64
 21  O2      2800 non-null   int64
 22  O3      2772 non-null   float64
 23  O4      2786 non-null   float64
 24  O5      2780 non-null   float64
dtypes: float64(24), int64(1)
memory usage: 547.0 KB
df.dropna(inplace=True)
# !mkdir /home/aistudio/external-libraries
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple factor-analyzer -t /home/aistudio/external-libraries
import sys
sys.path.append('/home/aistudio/external-libraries')
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Bartlett's test of sphericity: a p_value of 0 means the null hypothesis is rejected, so factor analysis can proceed
# There are other feature-correlation tests, e.g. the KMO test; the goal is to confirm the features share some underlying structure
chi_square_value, p_value = calculate_bartlett_sphericity(df)
chi_square_value, p_value
(18170.9663508689, 0.0)
from factor_analyzer.factor_analyzer import calculate_kmo
# KMO test: the statistic lies between 0 and 1; the closer to 1, the stronger the correlations among the variables and the better factor analysis works
kmo_all, kmo_model = calculate_kmo(df)
kmo_model
0.8485397221949231
from factor_analyzer import FactorAnalyzer
# Create the factor analysis model fa and fit it on all 25 variables
# rotation: the factor rotation method; None means no rotation, 'varimax' is the maximum-variance rotation
fa = FactorAnalyzer(rotation=None)
fa.fit(df)
# get_eigenvalues returns the original eigenvalues (ev) and the common-factor eigenvalues (v)
ev, v = fa.get_eigenvalues()
ev
array([5.13431118, 2.75188667, 2.14270195, 1.85232761, 1.54816285,
       1.07358247, 0.83953893, 0.79920618, 0.71898919, 0.68808879,
       0.67637336, 0.65179984, 0.62325295, 0.59656284, 0.56309083,
       0.54330533, 0.51451752, 0.49450315, 0.48263952, 0.448921  ,
       0.42336611, 0.40067145, 0.38780448, 0.38185679, 0.26253902])
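A common rule of thumb for reading this list is the Kaiser criterion: retain only the factors whose eigenvalue exceeds 1. A one-line check on ev:
(ev > 1).sum()  # 6 factors have an eigenvalue above 1 here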
# Scree plot: eigenvalue of each factor
plt.scatter(range(1, df.shape[1] + 1), ev)
plt.plot(range(1, df.shape[1] + 1), ev)
plt.title('Eigenvalues by factor')
plt.xlabel('Factor')
plt.ylabel('Eigenvalue')
plt.grid()
Hands-on Exercise
# rotation="varimax":因子旋转的方法为:方差最大旋转法
fa = FactorAnalyzer(rotation="varimax", n_factors=5)
fa.fit(df)
fa.transform(df)
array([[-0.43983041,  0.10389656, -1.21671279, -0.69621532, -1.51944901],
       [ 0.08436922,  0.55931085, -0.60152092, -0.18934143, -0.24838425],
       [ 0.5205342 ,  0.3182346 ,  0.02564572, -0.75113554,  0.20450298],
       ...,
       [-0.22326681,  0.72116924,  0.78710981, -1.05847976,  0.92510304],
       [ 0.92052049,  0.7493374 ,  0.07095309, -2.13514505,  0.67346015],
       [-1.68998315, -1.35240274, -0.08143349, -1.85488924, -0.16387988]])
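To interpret the five rotated factors, the fitted model also exposes the loading matrix and the variance explained per factor (attribute and method names as in the factor_analyzer package):
fa.loadings_  # loading of each of the 25 variables on each factor
fa.get_factor_variance()  # (variance, proportional variance, cumulative variance)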