sklearn----pca

"""用pca降维后一个人脸识别的实列"""

from sklearn.datasets import fetch_olivetti_faces
import time
import numpy as np
import logging
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Load the dataset
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
data_home = 'datasets/'
logging.info('Start to load dataset')
faces = fetch_olivetti_faces(data_home=data_home)
logging.info('Done loading dataset')

X = faces.data
y = faces.target

"""将数据画出来"""
# targets = np.unique(faces.target)
# targets_names = np.array(["c%d" % t for t in targets])
# n_targets = targets_names.shape[0]
# n_samples, h, w = faces.images.shape
# print(n_samples)
# print(n_targets)
# print(w, h, X.shape)
# 
# 
# def plot_gallery(images, titles, h, w, n_row=2, n_col=5):
#     """显示图片阵列"""
#     plt.figure(figsize=(2 * n_col, 2.2 * n_row), dpi=144)
#     plt.subplots_adjust(bottom=0, left=0.01, right=.99, top=.90, hspace=.01)
#     for i in range(n_row * n_col):
#         plt.subplot(n_row, n_col, i+1)
#         plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
#         # plt.title(titles[i])
#         plt.axis('off')
#     plt.show()
# 
# 
# n_row = 2
# n_col = 6
# 
# sample_images = None
# sample_titles = []
# 
# for i in range(n_targets):
#     people_images = X[y == i]
#     people_sample_index = np.random.randint(0, people_images.shape[0], 1)
#     people_sample_image = people_images[people_sample_index, :]
# 
#     if sample_images is not None:
#         sample_images = np.concatenate((sample_images, people_sample_image), axis=0)
#     else:
#         sample_images = people_sample_image
#     sample_titles.append(targets_names[i])

# plot_gallery(sample_images, sample_titles, h, w, n_row, n_col)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

"""找出k个特征属性,并呼画出数据还原率和k个属性之间的关系"""
# print("Exploring explained variance ratio for dataset")
# candidate_components = range(10, 300, 30)
# explained_ratios = []
# start = time.perf_counter()
# for c in candidate_components:
#     pca = PCA(n_components=c)
#     X_pca = pca.fit_transform(X)
#     explained_ratios.append(np.sum(pca.explained_variance_ratio_))
# print('Done in {0:.2f}s'.format(time.perf_counter() - start))

# Plot explained variance ratio against the number of components
# plt.figure(figsize=(10, 6), dpi=144)
# plt.grid()
# plt.plot(candidate_components, explained_ratios)
# plt.xlabel('Number of PCA Components')
# plt.ylabel('Explained Variance Ratio')
# plt.title('Explained variance ratio for PCA')
# plt.yticks(np.arange(0.5, 1.05, .05))
# plt.xticks(np.arange(0, 300, 20))
# plt.show()
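# Alternatively (a sketch): PCA can also choose the number of components from a
# target explained-variance fraction when n_components is a float in (0, 1):
# pca_auto = PCA(n_components=0.95, svd_solver='full')
# X_pca_auto = pca_auto.fit_transform(X)
# print(pca_auto.n_components_)  # components needed to retain ~95% of the variance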

# Compare images reconstructed with different numbers of components against the originals
# n_row = 5
# n_col = 1
#
# sample_images = sample_images[0:5]
# sample_titles = sample_titles[0:5]
#
# plotting_images = sample_images
# plotting_titles = sample_titles
# candidate_components = [140, 75, 37, 19, 8]
# for c in candidate_components:
#     print("Fitting and projecting on PCA(n_components={}) ...".format(c))
#     start = time.perf_counter()
#     pca = PCA(n_components=c)
#     pca.fit(X)
#     X_sample_pca = pca.transform(sample_images)
#     X_sample_inv = pca.inverse_transform(X_sample_pca)
#     plotting_images = np.concatenate((plotting_images, X_sample_inv), axis=0)
#     # sample_title_pca = sample_titles
#     plotting_titles = np.concatenate((plotting_titles, sample_titles), axis=0)
#     print("Done in {0:.2f}s".format(time.perf_counter()-start))
#
#
# print("Plotting sample image with different number of PCA components ...")
# plot_gallery(plotting_images, plotting_titles, h, w, 6, n_row)

# Reduce dimensionality with PCA, keeping 140 components
n_component = 140
print("Fitting PCA on the training data...")
start = time.perf_counter()
pca = PCA(n_components=n_component, svd_solver='randomized', whiten=True).fit(X_train)
print('Done in {0:.2f}s'.format(time.perf_counter() - start))
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
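
# Sanity check (sketch): fraction of the training variance retained by the 140 components
print('Explained variance retained: {0:.3f}'.format(pca.explained_variance_ratio_.sum()))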

# Search for the best SVC hyperparameters with grid search
print('Searching the best parameters for SVC...')
param_grid = {'C': [1, 5, 10, 50, 100], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid, verbose=2, n_jobs=4)
clf.fit(X_train_pca, y_train)
print("Best  parameters found by grid search: %s" % clf.best_params_)
# Evaluate the model on the test set
print("Predicting on the test dataset...")
test_score = clf.score(X_test_pca, y_test)
print('Test accuracy: {0:.4f}'.format(test_score))
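
# Optional (sketch): per-class precision/recall with classification_report
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred))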