# The code below is the Chapter 3 code from the book "Scikit-Learn & TensorFlow". For the MNIST dataset, see the previous blog post: https://blog.csdn.net/RivenDong/article/details/100163831
from sklearn.datasets import fetch_mldata
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.base import clone, BaseEstimator
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
class Never5Classifier(BaseEstimator):
    """Trivial classifier that predicts ``False`` for every sample.

    Nothing is learned from the training data; the prediction is a constant
    negative answer regardless of the input.
    """

    def fit(self, X, y=None):
        """Accept the training data without using it.

        Returns:
            The estimator itself. The original returned ``None``; returning
            ``self`` follows the scikit-learn estimator convention and allows
            chained calls such as ``clf.fit(X).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return a boolean array of shape ``(len(X), 1)`` filled with ``False``."""
        return np.zeros((len(X), 1), dtype=bool)
# Plot precision and recall as functions of the threshold
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw the precision and recall curves against ``thresholds``.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so each curve uses all values but the final one. Both curves
    are drawn on the current matplotlib axes; the figure is not shown here.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g--", "Recall"),
    )
    for values, line_format, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=curve_label)

    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# Draw a ROC curve
def plot_roc_curve(fpr, tpr, label=None):
    """Plot ``tpr`` against ``fpr`` together with a dashed diagonal.

    The curve is drawn with line width 2 and the optional ``label``; a black
    dashed line from (0, 0) to (1, 1) is added, both axes are limited to
    [0, 1], and the axes are labelled. The figure is not shown here.
    """
    axis_limits = [0, 1, 0, 1]
    x_caption = 'False Positive Rate'
    y_caption = 'True Positive Rate'

    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    plt.axis(axis_limits)
    plt.xlabel(x_caption)
    plt.ylabel(y_caption)
if __name__ == '__main__':
    # ---- Load the data and display one sample --------------------------------
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed
    # in 0.22. Its replacement, sklearn.datasets.fetch_openml('mnist_784'),
    # presumably returns string labels and a different sample order, so the
    # index 36000 and the label comparisons below would need re-checking
    # before switching loaders.
    mnist = fetch_mldata('MNIST original', data_home='./datasets/')
    X, y = mnist["data"], mnist["target"]

    some_digit = X[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()

    # ---- Train/test split -----------------------------------------------------
    # The first 60000 rows are used for training, the remaining rows for testing.
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    shuffle_index = np.random.permutation(60000)  # shuffle the training set
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

    # ---- Binary classifier: "5" versus "not 5" --------------------------------
    y_train_5 = (y_train == 5)  # boolean targets for the binary task
    y_test_5 = (y_test == 5)
    sgd_clf = SGDClassifier(random_state=42)  # train a binary classifier
    sgd_clf.fit(X_train, y_train_5)
    print(sgd_clf.predict([some_digit]))

    # ---- Measure accuracy with cross-validation -------------------------------
    # random_state is intentionally not passed: without shuffle=True it has no
    # effect on the folds, and recent scikit-learn versions raise an error for
    # that combination.
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        clone_clf = clone(sgd_clf)  # fresh, unfitted copy for every fold
        X_train_folds = X_train[train_index]
        y_train_folds = y_train_5[train_index]
        X_test_fold = X_train[test_index]
        y_test_fold = y_train_5[test_index]
        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        n_correct = np.sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))  # fraction of correct predictions
    print(cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

    never_5_clf = Never5Classifier()  # always answers "not 5"
    print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

    # ---- Confusion matrix, precision, recall, F1 ------------------------------
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    print(confusion_matrix(y_train_5, y_train_pred))  # confusion matrix
    print(precision_score(y_train_5, y_train_pred))
    print(recall_score(y_train_5, y_train_pred))
    print(f1_score(y_train_5, y_train_pred))

    # ---- Precision/recall versus threshold ------------------------------------
    # Decision scores (instead of class predictions) for every training instance.
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    # Precision and recall for all possible thresholds.
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # Show this figure now; otherwise its curves would end up on the same axes
    # as the ROC curves plotted below.
    plt.show()

    # ---- ROC curves and AUC ---------------------------------------------------
    # The SGD ROC curve is drawn further down, next to the random forest one.
    fpr, tpr, thresholds2 = roc_curve(y_train_5, y_scores)
    print(roc_auc_score(y_train_5, y_scores))  # AUC

    forest_clf = RandomForestClassifier(random_state=42)
    y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1]  # second probability column used as the score
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right")
    plt.show()

    # ---- Multiclass classification --------------------------------------------
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))  # one-vs-one strategy
    ovo_clf.fit(X_train, y_train)
    print(ovo_clf.predict([some_digit]))
    print(len(ovo_clf.estimators_))  # number of underlying binary classifiers

    forest_clf.fit(X_train, y_train)
    # The original computed this prediction but discarded the result.
    print(forest_clf.predict([some_digit]))
    print(forest_clf.predict_proba([some_digit]))  # list of per-class probabilities

    # ---- Error analysis on standardized inputs --------------------------------
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    plt.matshow(conf_mx, cmap=plt.cm.gray)  # plot the confusion matrix
    plt.show()

    # Divide every row by its total so the cells hold rates instead of counts.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)  # zero the diagonal so only the errors remain
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)  # inspect the classification error rates
    plt.show()

    # ---- Multilabel classification --------------------------------------------
    y_train_large = (y_train >= 7)
    y_train_odd = (y_train % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]  # two boolean labels per sample
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_multilabel)
    # Per the original note, some_digit is expected to be a 5: less than 7 and odd.
    print(knn_clf.predict([some_digit]))