# 手写测试集
#
# 最新推荐文章于 2024-07-29 14:37:54 发布
# 菜鸟一枚cnk
# 最新推荐文章于 2024-07-29 14:37:54 发布
# 阅读量109
# 点赞数
# 文章标签：机器学习 python sklearn
# 本文链接：https://blog.csdn.net/m0_63122871/article/details/125210100
# 版权
# %% md

# # 在MNIST数据集上进行降维分析
# 图像数据维数高，而且特征之间（像素之间）相关性很高，因此我们预计用很少的维数就能保留足够多的信息
#
# 降维技术：
# PCA：主成分分析（Principal Component Analysis）
# MNIST数据集介绍：
# 本数据来源于Kaggle竞赛提供的数据：Digit Recognizer（https://www.kaggle.com/c/digit-recognizer/data）
# 训练集包含42,000个样本，测试集包含28,000个样本。
# 每个样本为28 * 28的灰度图像，像素值在[0, 255]。
# 数据排列形式（28 * 28的图像按行展开为784个像素，编号000-783）：
# 000 001 002 003 ... 026 027
# 028 029 030 031 ... 054 055
# 056 057 058 059 ... 082 083
#  |   |   |   |  ...  |   |
# 728 729 730 731 ... 754 755
# 756 757 758 759 ... 782 783
#
# # %%

# 导入必要的工具包
import pandas as pd
import numpy as np

import time

from matplotlib import cm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# %matplotlib inline

import seaborn as sns

# Render Chinese text in figure labels. Matplotlib tries these families in
# order, so the original macOS font stays first; the remaining entries are
# fallbacks for Windows/Linux machines where 'Arial Unicode MS' is missing.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',
    'SimHei',
    'Microsoft YaHei',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
]

# %%

# Load the Kaggle training split (42,000 samples according to the notes above).
train = pd.read_csv('MNIST_train.csv')

# The "label" column is the target; every remaining column is a pixel.
y_train = train["label"].values

# Rescale pixel intensities from [0, 255] to [0, 1].
X_train = train.drop(columns="label").values / 255.0

# Report (n_samples, n_features) of the training matrix.
print('the shape of train_image: {}'.format(X_train.shape))

# %% md

### 显示原始图像(64个)

# %%

# Show the first 64 training digits on an 8 x 8 grid, without axis ticks.
grid_side = 8
plt.figure(figsize=(14, 12))
for cell in range(1, grid_side * grid_side + 1):
    plt.subplot(grid_side, grid_side, cell)
    # Each sample is a flattened image; restore the 28 x 28 pixel grid.
    image = X_train[cell - 1].reshape(28, 28)
    plt.imshow(image, interpolation="none", cmap='gray')
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()

# %%

# Expensive techniques such as Isomap cannot cope with 40k+ samples, so the
# experiments run on a random 20% of the training set.
from sklearn.model_selection import train_test_split

X_selected, X_val, y_selected, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.2,
    random_state=0,  # fixed seed: the same subset is drawn on every run
)

# Report (n_samples, n_features) of the subsample.
print('the shape of input: {}'.format(X_selected.shape))

# %% md

## PCA

# %%



# Time the run with a monotonic clock so that a system clock adjustment
# cannot distort the reported duration.
start = time.perf_counter()

# Default arguments keep every component; the number to retain is picked by
# hand afterwards from the variance each component explains.
pca = PCA()
print("PCA begin...")

# Fit the model and project the same samples in a single call instead of a
# separate fit() followed by transform().
X_pca = pca.fit_transform(X_selected)

end = time.perf_counter()
# Elapsed time is reported in whole seconds (truncated).
print("PCA time elaps:{}".format(int(end - start)))

# %%

# explained_variance_: amount of variance explained by each principal
#   component, i.e. the eigenvalues of the covariance matrix of X.
# explained_variance_ratio_: the same quantity as a fraction of the total.
# Plot the per-component ratio to see how quickly it decays.
variance_ratio = pca.explained_variance_ratio_
plt.plot(variance_ratio, 'b-')

plt.xlabel(u"主成分数目", fontsize=14)
plt.ylabel(u"能解释的方差的比例", fontsize=14)

# %%

# 确定主成分数目通常有三种方式：
# 1. 直接确定主成分数目：如30
# 2. 根据主成分的累计贡献率确定主成分数目，如累计贡献率大于85%。
# 3. 肘部法（the elbow rule）：根据每个成分对应的特征值（解释的方差比例），寻找一个截止点（elbow），在该截止点，特征值显著下降。
# 根据上图，主成分数目可以取50-100.

# %%

# Running total of the explained-variance ratios: entry k is the share of the
# total variance retained by the first k + 1 principal components.
# np.cumsum replaces the hand-written accumulation loop and takes its length
# from the ratios themselves. The old loop ran up to X_selected.shape[1],
# which overruns explained_variance_ratio_ whenever PCA keeps fewer
# components than there are input features.
sum_explained_variance_ratio_ = np.cumsum(pca.explained_variance_ratio_)
plt.plot(sum_explained_variance_ratio_, 'b-')

plt.xlabel(u"主成分数目", fontsize=14)
plt.ylabel(u"能解释的累计方差的比例", fontsize=14)

# %%

# Report the smallest number of leading components whose cumulative
# explained-variance ratio exceeds 85%.
for i, cumulative_ratio in enumerate(sum_explained_variance_ratio_):
    if cumulative_ratio > 0.85:
        # The values used to be passed as extra print() arguments, so the %d
        # placeholders were printed literally; apply the format explicitly.
        print("%d number of components explain %d%% variance"
              % (i + 1, int(cumulative_ratio * 100)))
        break

# %%

# Report the smallest number of leading components whose cumulative
# explained-variance ratio exceeds 90%.
for i, cumulative_ratio in enumerate(sum_explained_variance_ratio_):
    if cumulative_ratio > 0.90:
        # The values used to be passed as extra print() arguments, so the %d
        # placeholders were printed literally; apply the format explicitly.
        print("%d number of components explain %d%% variance"
              % (i + 1, int(cumulative_ratio * 100)))
        break

# %% md

### 显示主成分

# %%

n_row = 4
n_col = 8
# One image per grid cell (4 * 8 = 32, the value previously hard-coded).
n_components = n_row * n_col

# Each row of pca.components_ is a principal axis (an eigenvector direction
# in pixel space), not an eigenvalue.
eigenvectors = pca.components_
# Old, misleading name kept as an alias in case later cells still refer to it.
eigenvalues = eigenvectors

# Plot the leading principal axes as 28 x 28 images.
plt.figure(figsize=(13, 12))
for i in range(n_components):
    plt.subplot(n_row, n_col, i + 1)
    plt.imshow(eigenvectors[i].reshape(28, 28), cmap='gray')
    plt.title('Eigenvector ' + str(i + 1), size=6.5)
    plt.xticks(())
    plt.yticks(())
plt.show()


# %%

def plot_embedding(data, label, title):
    """Draw a 2-D embedding with every sample rendered as its label text.

    Each column of ``data`` is min-max scaled to [0, 1]. Every sample is then
    drawn with ``plt.text`` at its first two coordinates, coloured with the
    ``Set1`` colormap evaluated at ``label / 10``.

    Args:
        data: 2-D array of embedded samples; columns 0 and 1 are the x and y
            coordinates.
        label: Per-sample labels, indexed in step with the rows of ``data``
            (presumably digits 0-9, given the division by 10 -- confirm).
        title: Title placed above the figure.
    """
    x_min, x_max = np.min(data, 0), np.max(data, 0)
    span = x_max - x_min
    # A constant column has zero span; dividing by it would turn that
    # coordinate into NaN. Use 1 instead so such a column maps to 0.
    span = np.where(span == 0, 1, span)
    data = (data - x_min) / span

    plt.figure()
    for i, point in enumerate(data):
        plt.text(point[0], point[1], str(label[i]),
                 color=plt.cm.Set1(label[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([])
    plt.yticks([])
    plt.title(title, fontsize=14)

    plt.xlabel(u'成分1', fontsize=14)
    plt.ylabel(u'成分2', fontsize=14)


# %%


# Visualise the subsample on its first two principal components.
first_two_components = X_pca[:, :2]
plot_embedding(first_two_components, y_selected, u'PCA')

# %% md

# Isomap

# %%