3.1 MNIST
下载数据集
from sklearn.datasets import fetch_openml
# Download the MNIST dataset from OpenML and look at what the returned bunch contains.
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

# Features live under 'data', labels under 'target'.
X = mnist['data']
y = mnist['target']
print(X.shape)
print(y.shape)
将样本重塑为一个28×28的数组,然后用imshow()将图片显示出来
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Take the first sample as a flat NumPy vector; it is reused later as the demo input.
some_digit = np.array(X.iloc[0])

# Reshape the 784 values into a 28x28 grid and render it without axes.
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap='binary')
plt.axis('off')
plt.show()
把y转化为整数
# Convert the labels to unsigned 8-bit integers so they can be compared with numbers.
y = y.astype(np.uint8)
划分训练集和测试集
# Positional split: the first 60000 rows form the training set, the rest the test set.
x_train, x_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
3.2 训练二元分类器
先简化问题,只尝试识别一个数字,只能区分两个类别:5和非5.
# Boolean targets for the "5 vs. not-5" binary task: True where the label is 5.
y_train_5 = y_train == 5
y_test_5 = y_test == 5
一个好的初始选择是随机梯度下降(SGD)分类器。
from sklearn.linear_model import SGDClassifier

# Fixed random_state keeps the stochastic training reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train, y_train_5)

# Classify the sample digit picked earlier (predict expects a 2-D batch).
sgd_clf.predict([some_digit])
3.3 性能测量
1、使用交叉验证测量准确率
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# Hand-rolled 3-fold stratified cross-validation that prints the accuracy of each fold.
skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(x_train, y_train_5):
    # Train a fresh, unfitted copy of the classifier on every fold.
    clone_clf = clone(sgd_clf)

    # split() yields positional indices, so select rows with .iloc on both the
    # feature frame and the label series (plain [] on a Series is label-based).
    x_train_folds = x_train.iloc[train_index]
    y_train_folds = y_train_5.iloc[train_index]
    x_test_fold = x_train.iloc[test_index]
    y_test_fold = y_train_5.iloc[test_index]

    clone_clf.fit(x_train_folds, y_train_folds)
    y_pred = clone_clf.predict(x_test_fold)

    # Fraction of correct predictions on the held-out fold.
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
和上面功能类似的代码:
from sklearn.model_selection import cross_val_score
# Same measurement with the library helper: 3-fold accuracy of the SGD classifier.
cross_val_score(sgd_clf, x_train, y_train_5, scoring='accuracy', cv=3)
将每张图都分类为非5
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline classifier that labels every sample as "not 5"."""

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator itself."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)
# Score the always-"not 5" baseline with the same 3-fold accuracy measurement.
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, x_train, y_train_5, scoring='accuracy', cv=3)
说明准确率通常无法成为分类器的首要性能指标,特别是当我们处理有偏数据集时
2、混淆矩阵
cross_val_predict()函数同样执行K-折交叉验证,但返回的不是评估分数,而是每个折叠的预测
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Collect the 3-fold cross-validated prediction for every training instance.
y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)

# Compare the true labels with those predictions.
confusion_matrix(y_train_5, y_train_pred)
3、精度和召回率
# Precision and recall of the cross-validated predictions.
from sklearn.metrics import precision_score, recall_score

precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)
F1分数对那些具有相近的精度和召回率的分类器更为有利
from sklearn.metrics import f1_score

# F1 score of the same cross-validated predictions.
f1_score(y_train_5, y_train_pred)
4、精度/召回率权衡
对于每个实例,它会基于决策函数计算出一个分值,如果该值大于阈值,则将该实例判为正类,否则便将其判为负类
Scikit-Learn不允许直接设置阈值,但可以访问它用于预测的决策分数:调用decision_function()方法即可得到每个实例的分数,然后自行选择阈值进行预测
# Decision score of the single sample digit.
y_score = sgd_clf.decision_function([some_digit])

# Apply a threshold by hand: the sample is positive when its score exceeds it.
threshold = 0
y_some_digit_pred = y_score > threshold
y_some_digit_pred
使用cross_val_predict()函数获取训练集中所有实例的分数,然后使用precision_recall_curve()函数来计算所有可能的阈值的精度和召回率
from sklearn.metrics import precision_recall_curve

# Cross-validated decision scores (not class predictions) for the whole training set.
y_scores = cross_val_predict(
    sgd_clf, x_train, y_train_5, cv=3, method='decision_function'
)

# Precision and recall for the candidate thresholds derived from those scores.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
使用matplotlib绘制精度和召回率相对于阈值的函数图
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting; this assumes both are one element longer than ``thresholds``,
    as returned by ``precision_recall_curve``.
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label="Precision")
    plt.plot(thresholds, recalls[:-1], 'g-', label="Recall")
    # The curves carry labels, so draw the legend that displays them.
    plt.xlabel("Threshold")
    plt.legend(loc="center right")
    plt.grid(True)


plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
将精度设置为90%
# Aim for 90% precision.
# np.argmax on a boolean array gives the index of the first True, i.e. the first
# threshold (in the order returned by precision_recall_curve) with precision >= 0.90.
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

# Threshold the cross-validated scores of the whole training set (y_scores);
# y_score only holds the score of the single sample digit.
y_train_pred_90 = (y_scores >= threshold_90_precision)

print(precision_score(y_train_5, y_train_pred_90))
print(recall_score(y_train_5, y_train_pred_90))
5、ROC曲线
真正类率(TPR,即召回率)和假正类率(FPR)。FPR是被错误分为正类的负类实例比率。
ROC曲线绘制的是灵敏度(召回率)和(1-特异度)的关系
首先需要使用roc_curve()函数计算多种阈值的TPR和FPR
from sklearn.metrics import roc_curve

# False/true positive rates over the thresholds derived from the cross-validated scores.
# NOTE: this rebinds `thresholds`, previously set by precision_recall_curve.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)


def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) with a dashed diagonal for reference.

    ``label`` is attached to the curve; the caller is responsible for calling
    ``plt.legend()`` if the label should be displayed.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('假正率')
    plt.ylabel('真正率(召回率)')


plot_roc_curve(fpr, tpr)
plt.show()
虚线表示纯随机分类器的ROC曲线,一个优秀的分类器应该离这条线越远越好。
有一种比较分类器的方法是测量曲线下面积(AUC),完美分类器的ROC AUC等于1,而纯随机分类器的ROC AUC等于0.5
from sklearn.metrics import roc_auc_score

# Use the cross-validated scores of the whole training set (y_scores);
# y_score holds only the single sample digit's score and does not match y_train_5 in length.
roc_auc_score(y_train_5, y_scores)
训练一个RandomForestClassifier分类器,并比较它和SGDClassifier分类器的ROC曲线和ROC AUC分数
RandomForestClassifier类没有decision_function()方法,它有predict_proba()方法。predict_proba()会返回一个数组,每行代表一个实例,每列代表一个类别,每个值表示该实例属于该类别的概率。
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)

# Cross-validated class probabilities for every training instance.
y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_5, cv=3, method='predict_proba')

# Column 1 (probability of the positive class) serves as the score.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Overlay both ROC curves; the dotted line is the SGD classifier from above.
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc='lower right')
plt.show()
3.4 多类分类器
一种方法是训练10个二元分类器,称为一对剩余(OvR)策略
另一种是为每一对数字训练一个二元分类器,一个用于区分0和1,一个区分0和2,一个区分1和2以此类推。称为一对一(OvO)策略,优点在于只需要用到部分训练集对其必须区分的两个类进行训练。
在较小训练集上分别训练多个分类器比在大型数据集上训练少数分类器要快的多
from sklearn.svm import SVC

# Train a support vector classifier on the original 10-class labels.
svm_clf = SVC()
svm_clf.fit(x_train, y_train)
svm_clf.predict([some_digit])

# Inspect the decision scores for the sample digit and the index of the largest one.
some_digit_scores = svm_clf.decision_function([some_digit])
print(some_digit_scores)
print(np.argmax(some_digit_scores))

# classes_ lists the known target classes; look up the entry at index 5.
print(svm_clf.classes_)
print(svm_clf.classes_[5])
想要强制Scikit-Learn使用一对一或者一对剩余策略,可以使用OneVsOneClassifier或OneVsRestClassifier类
from sklearn.multiclass import OneVsRestClassifier

# Wrap SVC so that the one-versus-rest strategy is used explicitly.
over_clf = OneVsRestClassifier(SVC())
over_clf.fit(x_train, y_train)
over_clf.predict([some_digit])
3.5 误差分析
首先看混淆矩阵
# `x_train_scaled` was used here without ever being defined in this file.
# Standardize the pixel features first; SGD-based linear models generally
# behave better on scaled inputs.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))

# Cross-validated multiclass predictions and the resulting confusion matrix.
# NOTE: this rebinds y_train_pred, which previously held the binary (5 / not-5) predictions.
y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
混淆矩阵可视化
# Render the confusion matrix as a grayscale image.
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
将混淆矩阵中的每个值除以相应类中的图片的数量,这样比较的就是错误率而不是错误的绝对值(后者对图片较多的类不公平)
# Divide every row by its total so cells hold per-row rates instead of raw counts.
# keepdims=True keeps the sums as a column so the division broadcasts row-wise.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
用0填充对角线,只保留错误,重新绘制结果
# Zero the diagonal in place so that only the off-diagonal (error) cells remain visible.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
3.6 多标签分类
多标签分类是希望分类器为每个实例输出多个类,例如:人脸识别的分类器,如果在一张照片里识别多个人怎么办?
看一简单实例
from sklearn.neighbors import KNeighborsClassifier

# Two boolean labels per digit: "is large" (7, 8 or 9) and "is odd".
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)

# Stack the two label vectors column-wise into one multilabel target array.
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_multilabel)
knn_clf.predict([some_digit])
创建一个y_multilabel 数组,其中包含两个数字图片的目标标签:第一个表示数字是否是大数,第二个表示是否是奇数
不是所有的分类器都支持多标签分类
3.7 多输出分类
多输出-多类分类:是多标签分类的泛化,其标签也可以是多类的
构建一个系统去除图片中的噪声。给它输入一张有噪声的图片,它将输出一张干净的数字图片。
# Build noisy inputs by adding random integer noise in [0, 100) to every pixel.
noise = np.random.randint(0, 100, (len(x_train), 784))
x_train_mod = x_train + noise
noise = np.random.randint(0, 100, (len(x_test), 784))
x_test_mod = x_test + noise

# The targets are the original, clean images.
y_train_mod = x_train
y_test_mod = x_test

# Refit the KNN classifier to map noisy images to clean ones, then clean one test image.
knn_clf.fit(x_train_mod, y_train_mod)
clean_digit = knn_clf.predict([x_test_mod.iloc[1]])