import pandas as pd
df = pd.read_csv('F:/MNIST/mnist.csv', header=None)
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 785 columns
import numpy as np
df = df.reindex(np.random.permutation(df.index)) # 打乱顺序的方法
x = df.drop(0, axis=1)
y = df[0]
print('X的概况:', x.shape)
print('Y的概况:', y.shape)
x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
X的概况: (70000, 784)
Y的概况: (70000,)
"""
data.irow(0) #取data的第一行
data.icol(0) #取data的第一列
data.ix[1:2] #返回第2行的第三种方法,返回的是DataFrame,跟data[1:2]同
data[1:2] #返回第2行,从0计,返回的是单行,通过有前后值的索引形式,
#如果采用data[1]则报错
"""
print(y[36000])
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = x.loc[[36000]].values
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation='nearest')
plt.axis('off')
plt.show()
9
print(y_train.head())
y_test
41266 1
719 5
69105 0
58965 1
61484 1
Name: 0, dtype: int64
18193 9
8664 1
24326 3
59176 1
28477 5
40056 1
44939 5
32688 3
30398 3
1460 4
64379 4
66269 7
26930 0
33416 1
5443 1
61529 4
54202 5
64308 1
51409 7
10533 7
10417 2
54426 7
55812 4
65474 4
51762 5
8364 7
8601 9
69603 2
1043 7
28068 2
..
48235 4
30295 6
20432 2
12970 7
9458 0
65275 5
8466 4
62936 4
21244 5
3176 2
34733 4
2974 0
66701 0
12519 5
46747 1
34662 0
50322 5
39379 6
33823 0
60482 7
48927 9
7176 1
49476 5
33086 1
58856 8
1062 5
63524 6
65577 4
26155 9
55798 6
Name: 0, Length: 10000, dtype: int64
y_train_9 = (y_train==9)
y_test_9 = (y_test==9)
#print(y_test[[2797]])
#y_test_9
# 使用随机梯度下降(SGD)训练模型
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train, y_train_9)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='hinge',
max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
power_t=0.5, random_state=42, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
sgd_clf.predict(x.loc[[30069,42882,167,33531,11688,26571,16734,14324]])
# sgd_clf.predict([some_digit])
# 这种语法错误为:Found array with dim 3. Estimator expected <= 2.
array([ True, True, True, True, True, True, True, True])
# 使用cross_val_score模块的3折交叉验证方法改进SGD模型
from sklearn.model_selection import cross_val_score # 采用K-fold交叉验证发,3个折叠
cross_val_score(sgd_clf, x_train, y_train_9, cv=3, scoring='accuracy')
array([0.92680366, 0.9418 , 0.9439472 ])
# 使用混淆矩阵(cross_val_predict)方法获取预测的真OR假结果,以便计算精确率和召回率
from sklearn.model_selection import cross_val_predict # 先返回每个折叠的预测,每个预测都是在训练期间从未见过的
y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_9, cv=3)
y_train_pred
array([False, False, False, ..., False, False, False])
# 使用生成的y_train_pred(预测结果正确与否的列表)与真实值计算精确率和召回率
# 手工算法
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train_9, y_train_pred))
"""
真负类TN 假正类FP
假负类FN 真正类TP
"""
jingdu = 3637 / (3637+2281)
zhaohuilv = 3637 / (3637+1468)
print('精度:', jingdu)
print('召回率:', zhaohuilv)
[[52614 1468]
[ 2281 3637]]
精度: 0.6145657316661034
召回率: 0.7124387855044074
# 使用sklearn.metrics中的precision_score和recall_score自动计算精确率和召回率
from sklearn.metrics import precision_score,recall_score
precision_score(y_train_9, y_train_pred)
0.7124387855044074
recall_score(y_train_9, y_train_pred)
0.6145657316661034
# 使用sklearn.metrics中的f1_score模块,自动计算F1分数(2*精度*召回率 / (精度 + 召回率))
from sklearn.metrics import f1_score
f1_score(y_train_9, y_train_pred)
0.6598929511022407
# 设定阈值来改变精度和召回率,主意参数:method='decision_function'
# method='decision_function'这个参数返回的是计算得出的分数。可以用来设定阈值
#from sklearn.model_selection import cross_val_predict # 先返回每个折叠的预测,每个预测都是在训练期间从未见过的
#y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_9, cv=3)
y_scores = cross_val_predict(sgd_clf, x_train, y_train_9, method='decision_function')
y_scores
D:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
warnings.warn(CV_WARNING, FutureWarning)
array([-42620.62981929, -51693.91890067, -39176.69145394, ...,
-22254.03321689, -53437.33278592, -33654.56993053])
# 画出精确率和召回率的图
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores)
def plot_pre_recall_vs_thre(precisions, recalls, thresholds):
plt.plot(thresholds, precisions[:-1], 'b--',label='Precision')
plt.plot(thresholds, recalls[:-1],'g--', label='recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plot_pre_recall_vs_thre(precisions, recalls, thresholds)
plt.show()
y_train_pred_90 = (y_scores > 10000)
precision_score(y_train_9, y_train_pred_90)
0.8898305084745762
recall_score(y_train_9, y_train_pred_90)
0.01774248056775938
# ROC曲线:特异度和灵敏度的关系
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_9, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
plt.plot(fpr, tpr, linewidth=2, label=label)
plt.plot([0,1], [0,1], 'k--')
plt.axis([0,1,0,1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plot_roc_curve(fpr, tpr)
plt.show()
# 训练一个随机森林,与SGD分类在ROC曲线上进行对比
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_9, cv=10, method='predict_proba')
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
# 随机森林中,使用参数method='predict_proba',返回的是概率值
y_probas_forest
array([[1., 0.],
[1., 0.],
[1., 0.],
...,
[1., 0.],
[1., 0.],
[1., 0.]])
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_9, y_scores_forest)
plt.plot(fpr, tpr, 'b', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='bottom right')
plt.show()
D:\Anaconda3\lib\site-packages\matplotlib\legend.py:497: UserWarning: Unrecognized location "bottom right". Falling back on "best"; valid locations are
best
upper right
upper left
lower left
lower right
right
center left
center right
lower center
upper center
center
% (loc, '\n\t'.join(self.codes)))
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_train_9, y_scores))
print(roc_auc_score(y_train_9, y_scores_forest))
0.9401824847125175
0.9883210685077506
# 多类别分类器
# OvA——OvO
# 先使用SGD对所有标签进行训练试试
sgd_clf.fit(x_train, y_train)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='hinge',
max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
power_t=0.5, random_state=42, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
sgd_clf.predict(x.loc[[36000]])
array([4], dtype=int64)
# 看看sklearn中二分类器是如何进行多类别分类的,需要看decision_function()分数
# 很明显,分类为9的分数为-2712,是最大的,所以分类为9,结果也是正确的。
some_digit_scores = sgd_clf.decision_function(x.loc[[36000]])
print(some_digit_scores)
print(np.argmax(some_digit_scores)) # np.argmax() 求最大的nampy方法
print(sgd_clf.classes_) # 查看原始结果方法
[[-39761.98821912 -22916.54645981 -18184.28472608 1016.42192678
2300.61774756 -6673.18271756 -40432.41849423 -13508.94695046
-2635.88432119 -2399.9387372 ]]
4
[0 1 2 3 4 5 6 7 8 9]
# 使用交叉验证方法评估SGD的准确率
cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring='accuracy')
array([0.85907114, 0.87885 , 0.8819823 ])
# 使用多分类器“随机森林”算法再试试看
forest_clf.fit(x_train, y_train)
D:\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=42, verbose=0,
warm_start=False)
# 返回概率值,发现结果是9的概率最大,所以返回结果是9
forest_clf.predict_proba(x.loc[[36000]])
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])
# 错误分析
# 使用cross_val_predict()函数进行预测,然后调用confusion_matrix()函数
y_train_pred_final = cross_val_predict(forest_clf, x_train, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred_final)
conf_mx
array([[5853, 2, 18, 6, 5, 15, 30, 2, 28, 2],
[ 1, 6618, 39, 19, 15, 10, 7, 11, 11, 5],
[ 54, 20, 5686, 39, 40, 7, 23, 57, 47, 5],
[ 36, 21, 147, 5646, 7, 119, 10, 52, 84, 36],
[ 18, 13, 34, 10, 5572, 10, 40, 19, 23, 117],
[ 50, 21, 31, 206, 35, 4948, 56, 12, 54, 30],
[ 57, 15, 26, 6, 31, 69, 5643, 1, 24, 2],
[ 13, 28, 96, 27, 82, 4, 2, 5861, 13, 79],
[ 29, 47, 104, 133, 61, 106, 44, 29, 5240, 78],
[ 33, 20, 31, 85, 193, 41, 6, 89, 65, 5355]],
dtype=int64)
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
# 多标签分类
# 希望分类器为每个样本产生多个类别(如一张照片里识别出多个人)
# 分类器可以输出多个二元标签。
# 这种分类叫做,多标签分类系统
from sklearn.neighbors import KNeighborsClassifier
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_multilabel)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
knn_clf.predict(x.loc[[36000]])
array([[ True, True]])
# 多标签分类的评估
y_train_knn_pred = cross_val_predict(knn_clf, x_train, y_train, cv=3)
f1_score(y_train, y_train_knn_pred, average='macro')