Scikit-Learn & TensorFlow 第三章源代码

以下代码为《Hands-On Machine Learning with Scikit-Learn & TensorFlow》原书第三章的源代码;关于 MNIST 数据集的获取与介绍,可以参见上一篇博客:https://blog.csdn.net/RivenDong/article/details/100163831

from sklearn.datasets import fetch_mldata
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.base import clone, BaseEstimator
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

class Never5Classifier(BaseEstimator):
    """Baseline binary classifier that always predicts ``False``.

    It learns nothing from the data; it exists to show what accuracy a
    trivial "never positive" model reaches on the same labels.
    """

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Args:
            X: Training samples (ignored).
            y: Training labels (ignored).

        Returns:
            This estimator, following the scikit-learn convention that
            ``fit`` returns ``self`` so calls can be chained.
        """
        return self

    def predict(self, X):
        """Predict ``False`` for every sample.

        Args:
            X: Samples to classify; only ``len(X)`` is used.

        Returns:
            A boolean array of zeros with shape ``(len(X), 1)``.
        """
        return np.zeros((len(X), 1), dtype=bool)

# Plot precision and recall as functions of the decision threshold.
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the threshold on the current axes.

    Args:
        precisions: Sequence of precision values; its last element is dropped
            before plotting.
        recalls: Sequence of recall values; its last element is dropped
            before plotting.
        thresholds: Threshold values used as the x coordinates.

    The function only draws; the caller decides when to call ``plt.show()``.
    """
    # NOTE(review): the trailing element is dropped presumably because the
    # precision/recall arrays hold one more entry than thresholds -- confirm
    # against the function that produced them.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g--", "Recall"),
    )
    for values, line_style, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_name)

    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])

# Plot a ROC curve.
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the dashed diagonal on the current axes.

    Args:
        fpr: False positive rates (x coordinates).
        tpr: True positive rates (y coordinates).
        label: Optional legend label for the curve.

    The function only draws; the caller decides when to call ``plt.show()``.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black line from (0, 0) to (1, 1) as a visual reference.
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


if __name__ == '__main__':
    # Load the MNIST data set, using ./datasets/ as the data directory.
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and
    # removed in 0.22; newer versions need fetch_openml instead -- confirm
    # which scikit-learn version this script targets.
    mnist = fetch_mldata('MNIST original', data_home='./datasets/')
    X, y = mnist["data"], mnist["target"]

    # Display one sample as a 28x28 image.
    some_digit = X[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()

    # The first 60000 rows form the training set, the remainder the test set.
    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    shuffle_index = np.random.permutation(60000)  # shuffle the training set
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
    y_train_5 = (y_train == 5)  # binary target: "is this digit a 5?"
    y_test_5 = (y_test == 5)

    # Train a binary classifier on the "5 vs. not 5" target.
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train_5)
    print(sgd_clf.predict([some_digit]))

    # Measure accuracy with a hand-written stratified cross-validation loop.
    # random_state is deliberately not passed: without shuffle=True it has no
    # effect, and newer scikit-learn versions reject that combination.
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        clone_clf = clone(sgd_clf)
        X_train_folds = X_train[train_index]
        y_train_folds = y_train_5[train_index]
        X_test_fold = X_train[test_index]
        y_test_fold = y_train_5[test_index]

        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        # np.sum counts the matches in one vectorised pass.
        n_correct = np.sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))

    # The same measurement using the library helper.
    print(cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

    # Baseline that predicts "not 5" for everything.
    never_5_clf = Never5Classifier()
    print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

    # Confusion matrix from out-of-fold predictions.
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    print(confusion_matrix(y_train_5, y_train_pred))

    print(precision_score(y_train_5, y_train_pred))
    print(recall_score(y_train_5, y_train_pred))
    print(f1_score(y_train_5, y_train_pred))

    # Decision scores for every training instance, obtained out-of-fold.
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    # Precision and recall for all candidate thresholds.
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    # Show this figure now; otherwise its lines stay on the current axes and
    # end up mixed into the ROC comparison plot drawn further down.
    plt.show()

    fpr, tpr, thresholds2 = roc_curve(y_train_5, y_scores)
    print(roc_auc_score(y_train_5, y_scores))  # area under the ROC curve

    # Compare the SGD classifier's ROC curve with a random forest's.
    forest_clf = RandomForestClassifier(random_state=42)
    y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1]  # second column used as the score
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right")
    plt.show()

    # Multiclass classification with the one-vs-one strategy.
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(X_train, y_train)
    print(ovo_clf.predict([some_digit]))
    print(len(ovo_clf.estimators_))

    # Random forest trained directly on the multiclass labels.
    forest_clf.fit(X_train, y_train)
    print(forest_clf.predict([some_digit]))
    print(forest_clf.predict_proba([some_digit]))  # per-class probabilities

    # Error analysis on standardised inputs.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    plt.matshow(conf_mx, cmap=plt.cm.gray)  # plot the confusion matrix
    plt.show()

    # Divide each row by its total so rows become rates rather than counts.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    np.fill_diagonal(norm_conf_mx, 0)  # zero the diagonal so only errors remain
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)  # view the error rates
    plt.show()

    # Multilabel classification: two boolean labels per digit.
    y_train_large = (y_train >= 7)
    y_train_odd = (y_train % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_multilabel)
    # 5 is less than 7 and odd -- assumes some_digit is a 5; confirm.
    print(knn_clf.predict([some_digit]))
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值