# %% md
# # Dimensionality-reduction analysis on the MNIST dataset
# Image data is high-dimensional and its features (pixels) are strongly
# correlated, so we expect a small number of dimensions to retain most of
# the information.
#
# Dimensionality-reduction technique:
# PCA: PCA
#
# About the MNIST data:
# The data comes from the Kaggle competition "Digit Recognizer"
# (https://www.kaggle.com/c/digit-recognizer/data).
# The training set contains 42,000 samples and the test set 28,000 samples.
# Each sample is a 28 * 28 grayscale image with pixel values in [0, 255].
# Pixel layout of one flattened sample:
# 000 001 002 003 ... 026 027
# 028 029 030 031 ... 054 055
# 056 057 058 059 ... 082 083
#  |   |   |   |  ...  |   |
# 728 729 730 731 ... 754 755
# 756 757 758 759 ... 782 783

# %%
# Import the required packages
import time

import pandas as pd
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# % matplotlib inline
import seaborn as sns

# Select a font that can render the Chinese axis labels used below
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# %%
# Load the training data; the training set holds 42,000 samples
train = pd.read_csv('MNIST_train.csv')

y_train = train.label.values
X_train = train.drop("label", axis=1).values

# Scale pixel values from [0, 255] to [0, 1]
X_train = X_train / 255.0

# Number of samples and original feature dimensionality
print('the shape of train_image: {}'.format(X_train.shape))

# %% md
# ### Show the original images (64 of them)

# %%
plt.figure(figsize=(14, 12))
for digit_num in range(0, 64):
    plt.subplot(8, 8, digit_num + 1)
    grid_data = X_train[digit_num].reshape(28, 28)  # reshape from 1d to 2d pixel array
    plt.imshow(grid_data, interpolation="none", cmap='gray')
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()

# %%
# For computationally heavy techniques such as Isomap, 40k+ samples are too
# many, so randomly draw 20% of them for the experiments
from sklearn.model_selection import train_test_split

X_selected, X_val, y_selected, y_val = train_test_split(X_train, y_train, train_size=0.2, random_state=0)

# Number of samples and original feature dimensionality
print('the shape of input: {}'.format(X_selected.shape))

# %% md
# ## PCA

# %%
start = time.time()
# PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)
# With the default parameters every component is kept; a suitable
# n_components can then be chosen by hand from the variance each one explains.
pca = PCA()
print("PCA begin...")
pca.fit(X_selected)

# Project the samples onto the principal components
X_pca = pca.transform(X_selected)
end = time.time()
print("PCA time elaps:{}".format(int(end - start)))

# %%
# explained_variance_: amount of variance explained by each component,
#   i.e. the eigenvalues of the covariance matrix of X
# explained_variance_ratio_: fraction of the total variance explained by
#   each component
# Plot the explained-variance ratio of every principal component
plt.plot(pca.explained_variance_ratio_, 'b-')
plt.xlabel(u"主成分数目", fontsize=14)
plt.ylabel(u"能解释的方差的比例", fontsize=14)

# %%
# The number of principal components is usually chosen in one of three ways:
# 1. Fix the number directly, e.g. 30.
# 2. Use the cumulative contribution, e.g. keep components until the
#    cumulative explained-variance ratio exceeds 85%.
# 3. The elbow rule: look at the eigenvalue (explained-variance ratio) of
#    each component and cut where it drops off sharply.
# Judging from the plot above, 50-100 components is a reasonable choice.

# %%
# Running total of the explained-variance ratios. np.cumsum sizes itself from
# the ratios PCA actually produced, so it also works when PCA keeps fewer
# components than there are features.
sum_explained_variance_ratio_ = np.cumsum(pca.explained_variance_ratio_)

plt.plot(sum_explained_variance_ratio_, 'b-')
plt.xlabel(u"主成分数目", fontsize=14)
plt.ylabel(u"能解释的累计方差的比例", fontsize=14)


# %%
def _print_components_needed(cumulative_ratio, threshold):
    """Print the smallest number of components whose cumulative ratio exceeds *threshold*.

    Args:
        cumulative_ratio: 1-D sequence of cumulative explained-variance ratios,
            ordered by component.
        threshold: ratio that must be strictly exceeded.

    Prints nothing when no prefix of components exceeds the threshold.
    """
    for count, ratio in enumerate(cumulative_ratio, start=1):
        if ratio > threshold:
            # The values must be interpolated with `%`; passing them as extra
            # print() arguments would emit the raw "%d" placeholders.
            print("%d number of components explain %d variance" % (count, int(ratio * 100)))
            break


_print_components_needed(sum_explained_variance_ratio_, 0.85)

# %%
_print_components_needed(sum_explained_variance_ratio_, 0.90)

# %% md
# ### Show the principal components

# %%
n_components = 32
# NOTE(review): pca.components_ holds the principal axes (eigenvectors), not
# eigenvalues. The variable name and the 'Eigenvalue' plot titles are kept
# unchanged here; consider renaming both to "eigenvector".
eigenvalues = pca.components_

n_row = 4
n_col = 8

# Plot the first n_row * n_col principal axes as 28 x 28 images
plt.figure(figsize=(13, 12))
for i in range(n_row * n_col):
    plt.subplot(n_row, n_col, i + 1)
    plt.imshow(eigenvalues[i].reshape(28, 28), cmap='gray')
    title_text = 'Eigenvalue ' + str(i + 1)
    plt.title(title_text, size=6.5)
    plt.xticks(())
    plt.yticks(())
plt.show()


# %%
def plot_embedding(data, label, title):
    """Draw a 2-D embedding with each sample rendered as its label text.

    Every column of *data* is min-max scaled to [0, 1]; the first two columns
    are then used as the x / y position of each sample's label.

    Args:
        data: 2-D array of shape (n_samples, >= 2) holding the embedding.
        label: per-sample labels, indexable in step with the rows of *data*.
            The colour is taken from ``plt.cm.Set1(label / 10.)``, so digit
            labels 0-9 are presumably expected.
        title: title of the figure.
    """
    x_min, x_max = np.min(data, 0), np.max(data, 0)
    span = x_max - x_min
    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than producing NaN coordinates.
    span = np.where(span == 0, 1, span)
    data = (data - x_min) / span

    plt.figure()
    for point, digit in zip(data, label):
        plt.text(point[0], point[1], str(digit),
                 color=plt.cm.Set1(digit / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([])
    plt.yticks([])
    plt.title(title, fontsize=14)
    plt.xlabel(u'成分1', fontsize=14)
    plt.ylabel(u'成分2', fontsize=14)


# %%
plot_embedding(X_pca[:, 0:2], y_selected, u'PCA')

# %% md
# # Isomap

# %%
# 手写测试集
# 最新推荐文章于 2024-07-29 14:37:54 发布