Preparation
Download the fetch_lfw_people dataset and place it in a scikit_learn_data folder created under the local user's home directory.
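If the archive has been downloaded by hand, it may help to confirm where scikit-learn expects to find it. A quick check, assuming no SCIKIT_LEARN_DATA environment variable overrides the default location, could look like this:

from sklearn.datasets import get_data_home

# Default data directory, typically ~/scikit_learn_data
print(get_data_home())

# With the LFW files already in place, the loader can be told not to download:
# faces = fetch_lfw_people(min_faces_per_person=70, resize=1, download_if_missing=False)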
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
# Load the face data
faces = fetch_lfw_people(min_faces_per_person=70, resize=1)
# Extract features and labels
data = faces.data
target = faces.target
images = faces.images
target_names = faces.target_names
# Display one of the photos
plt.imshow(images[0], cmap='gray')
# Print each person's name and the number of samples for that label
for i, name in enumerate(target_names):
    size = (target == i).sum()
    print(name, size)
Ariel Sharon 77
Colin Powell 236
Donald Rumsfeld 121
George W Bush 530
Gerhard Schroeder 109
Hugo Chavez 71
Tony Blair 144
# Split the data into training and test sets (hold out 88 samples for testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=88)
# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Try SVC at different numbers of PCA components and record the test score for each
for n in range(20, 201, 10):
    pca = PCA(n, whiten=True)
    X_train_pca = pca.fit_transform(X_train)
    svc = SVC()
    %time svc.fit(X_train_pca, y_train)
    X_test_pca = pca.transform(X_test)
    score = svc.score(X_test_pca, y_test)
    print(n, score)
 n    svc.fit wall time   test accuracy
 20   53.8 ms             0.6818181818181818
 30   61.8 ms             0.7954545454545454
 40   74.8 ms             0.7840909090909091
 50   88.8 ms             0.8181818181818182
 60   99.7 ms             0.8068181818181818
 70   119 ms              0.8068181818181818
 80   131 ms              0.8295454545454546
 90   154 ms              0.7840909090909091
100   150 ms              0.7727272727272727
110   168 ms              0.7727272727272727
120   198 ms              0.75
130   189 ms              0.7613636363636364
140   197 ms              0.7045454545454546
150   216 ms              0.7045454545454546
160   232 ms              0.7159090909090909
170   238 ms              0.7045454545454546
180   268 ms              0.6931818181818182
190   253 ms              0.6818181818181818
200   261 ms              0.6818181818181818
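The trend is easier to see as a curve. The following is a quick sketch, not part of the original notebook, that re-runs the same loop, collects the scores, and plots accuracy against the number of components:

# Plot test accuracy vs. number of PCA components
ns = list(range(20, 201, 10))
scores = []
for n in ns:
    pca = PCA(n, whiten=True)
    X_train_pca = pca.fit_transform(X_train)
    svc = SVC().fit(X_train_pca, y_train)
    scores.append(svc.score(pca.transform(X_test), y_test))
plt.plot(ns, scores, marker='o')
plt.xlabel('n_components')
plt.ylabel('test accuracy')
plt.show()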
# Take n = 80 and tune the SVC hyperparameters with a grid search
from sklearn.model_selection import GridSearchCV
svc = SVC()
param_grid = {
    'C': [0.1, 1, 10, 20, 30],
    'gamma': [1/800, 1/80, 1/8, 1.25, 12.5],
    'kernel': ['linear', 'rbf', 'poly'],
}
gv = GridSearchCV(svc, param_grid, cv=5, n_jobs=-1, verbose=1)
# Reduce to 80 dimensions
pca = PCA(80, whiten=True)
# Fit PCA on the training set and transform it
X_train_pca = pca.fit_transform(X_train)
# Transform the test set with the same PCA (needed for scoring and prediction later)
X_test_pca = pca.transform(X_test)
# Shape of the training data after PCA
X_train_pca.shape
# (1200, 80)
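As a sanity check on the choice of 80 components, the fitted PCA's explained_variance_ratio_ shows how much of the total pixel variance those components keep (a small addition, not in the original run):

# Fraction of the total variance retained by the 80 components
pca.explained_variance_ratio_.sum()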
# Run the grid search
gv.fit(X_train_pca, y_train)
# Best cross-validation score
gv.best_score_
0.8441666666666666
# Best parameters
gv.best_params_
{'C': 10, 'gamma': 0.0125, 'kernel': 'rbf'}
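To see how the other parameter combinations compared, the full grid-search results can be put into a DataFrame. This is a small sketch using the cv_results_ attribute, not part of the original walkthrough:

# All grid-search results, best combinations first
results = pd.DataFrame(gv.cv_results_)
results.sort_values('rank_test_score')[
    ['param_C', 'param_gamma', 'param_kernel', 'mean_test_score']
].head()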
# Score on the PCA-transformed test set
gv.score(X_test_pca, y_test)
0.8409090909090909
# Predict on the test set
y_ = gv.predict(X_test_pca)
# Show the results: 8 rows x 6 columns, 48 images
plt.figure(figsize=(6 * 2.5, 8 * 3))
for i in range(48):
    axes = plt.subplot(8, 6, i + 1)
    axes.imshow(X_test[i].reshape(125, 94), cmap='gray')
    axes.axis('off')
    # Compare the prediction with the ground truth; label mismatches in red
    if y_[i] != y_test[i]:
        axes.set_title('True:%s\nPredict:%s' % (target_names[y_test[i]].split()[-1],
                                                target_names[y_[i]].split()[-1]),
                       fontdict=dict(fontsize=10, color='r'))
Bush has the most samples, so other people are most easily misclassified as Bush.
Chavez has the fewest samples, so he is the one misclassified most often.
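A per-class report makes these observations concrete. The sketch below, an addition to the original code, uses sklearn's classification_report to print precision and recall for each person:

from sklearn.metrics import classification_report

# Precision/recall per person; small classes such as Chavez tend to have lower recall
print(classification_report(
    y_test, y_,
    labels=np.arange(len(target_names)),
    target_names=[name.split()[-1] for name in target_names]))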
# Compare predictions with the ground truth using a cross table
# (margins=True adds the totals row/column renamed to 'all' below)
df = pd.crosstab(y_test, y_, margins=True)
df.columns = [name.split()[-1] for name in target_names.tolist()] + ['all']
df.index = [name.split()[-1] for name in target_names.tolist()] + ['all']
df
# Predict on a picture found online
# The test image must be converted to the same format and size as the training images
bush = plt.imread('./bush.jpeg')
plt.imshow(bush)
# Convert to grayscale with a weighted RGB sum (note: the extra /3 further scales the values down)
bush_gray = np.dot(bush, [0.299, 0.587, 0.114]) / 3
plt.imshow(bush_gray, cmap='gray')
# Crop out the face region
bush_face = bush_gray[30:320, 105:300]
plt.imshow(bush_face, cmap='gray')
# Size of the image to predict
bush_face.shape
# (290, 195)
# Size of the original training images
images[0].shape
# (125, 94)
# The sizes differ, so the image cannot be fed to the model directly;
# resample it to the training size with scipy's ndimage.zoom (spline interpolation)
from scipy import ndimage
bush_zoomed = ndimage.zoom(bush_face, (125/290, 94/195))
plt.imshow(bush_zoomed, cmap='gray')
# After zooming, the size matches the training images
bush_zoomed.shape
# (125, 94)
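The zoom factors are simply the ratios of the target shape to the current shape. A small hypothetical helper (resize_to, not part of the original code) makes the same step reusable for any input image:

def resize_to(img, target_shape):
    """Resize a 2D grayscale image to target_shape with ndimage.zoom."""
    factors = (target_shape[0] / img.shape[0], target_shape[1] / img.shape[1])
    return ndimage.zoom(img, factors)

resize_to(bush_face, images[0].shape).shape  # -> (125, 94)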
# Project into the same 80-dimensional PCA space
bush_pca = pca.transform(bush_zoomed.reshape(1, -1))
# Predict
gv.predict(bush_pca)
# Prediction result
target_names[1]
#'Colin Powell'
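predict only returns the winning label. Since the best estimator is an SVC fitted without probability=True, predict_proba is not available, but the one-vs-rest decision function gives a rough view of how each class scored for this image (a sketch using the fitted gv from above):

# One-vs-rest decision values for the zoomed face; the largest value wins
decision = gv.decision_function(bush_pca)
for name, value in zip(target_names, decision[0]):
    print(name, round(value, 3))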