下载 MNIST 数据集
kaggle competitions download -c digit-recognizer
包含两个文件:
- train.csv
- test.csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the Kaggle digit-recognizer CSVs and report each frame's shape
# right after it is read.
data_dir = '../../../datasets/digit-recognizer'

train = pd.read_csv(f'{data_dir}/train.csv')
print(train.shape)

test = pd.read_csv(f'{data_dir}/test.csv')
print(test.shape)
(42000, 785)
(28000, 784)
# DataFrame.describe is a method: it has to be called to obtain the summary
# statistics. Printing the bare attribute only shows the bound-method repr.
print(train.describe())
<bound method NDFrame.describe of label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 \
0 1 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ...
41995 0 0 0 0 0 0 0 0 0
41996 1 0 0 0 0 0 0 0 0
41997 7 0 0 0 0 0 0 0 0
41998 6 0 0 0 0 0 0 0 0
41999 9 0 0 0 0 0 0 0 0
pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 \
0 0 ... 0 0 0 0 0
1 0 ... 0 0 0 0 0
2 0 ... 0 0 0 0 0
3 0 ... 0 0 0 0 0
4 0 ... 0 0 0 0 0
... ... ... ... ... ... ... ...
41995 0 ... 0 0 0 0 0
41996 0 ... 0 0 0 0 0
41997 0 ... 0 0 0 0 0
41998 0 ... 0 0 0 0 0
41999 0 ... 0 0 0 0 0
pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
... ... ... ... ... ...
41995 0 0 0 0 0
41996 0 0 0 0 0
41997 0 0 0 0 0
41998 0 0 0 0 0
41999 0 0 0 0 0
[42000 rows x 785 columns]>
# Target column, kept both as a Series and as an integer array.
y_train = train['label']
train_label = y_train.values.astype(int)

# Feature matrices as integer arrays; the training frame loses its label
# column first, the test frame is converted as-is.
X_train = train.drop(columns='label').values.astype(int)
X_test = test.values.astype(int)

# First training row laid out as a 2-D array with 28 rows (the column
# count is inferred by reshape) so it can be displayed as an image.
X_example = X_train[0].reshape((28, -1))
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display the reshaped example digit using a grayscale colormap.
plt.imshow(X_example, cmap='gray')
<matplotlib.image.AxesImage at 0x26e59836e48>
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
PCA+SVM
# Reduce dimensionality with whitened PCA. A float n_components in (0, 1)
# asks PCA to keep as many components as needed to explain that fraction
# of the variance (here 95%).
pca = PCA(whiten=True, n_components=0.95)
pca.fit(X_train)
train_data = pca.transform(X_train)

# SVC with the library defaults.
# Alternative configuration kept for reference:
#   SVC(kernel='rbf', gamma=5, C=0.001)
clf_svm = SVC()
clf_svm.fit(train_data, train_label)

# Project the test set with the PCA that was fitted on the training data,
# then predict a label for every test row.
test_data = pca.transform(X_test)
test_predict = clf_svm.predict(test_data)

# ImageId is 1-based. Its length is derived from the number of predictions
# instead of a hard-coded 28000 so the frame stays consistent if the test
# set size changes.
pca_svm_submission = pd.DataFrame(
    {
        'ImageId': range(1, len(test_predict) + 1),
        'Label': test_predict,
    }
)
pca_svm_submission.to_csv('pca_svm_submission.csv', index=False)
print(pca_svm_submission)
ImageId Label
0 1 2
1 2 0
2 3 9
3 4 7
4 5 3
... ... ...
27995 27996 9
27996 27997 7
27997 27998 3
27998 27999 9
27999 28000 2
[28000 rows x 2 columns]