Machine Learning Code: Model Evaluation and Selection

2. Model Evaluation and Selection

Data source: https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

2.1 Empirical Error and Overfitting
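This section contrasts the empirical (training) error with the generalization error. As a quick illustration, here is a minimal sketch on synthetic data (not part of the wdbc workflow below): fitting polynomials of increasing degree to noisy samples of sin(x) shows the training error shrinking steadily while the test error eventually rises again, which is the signature of overfitting.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
x = np.sort(rng.uniform(-3, 3, 60)).reshape(-1, 1)
y_noisy = np.sin(x).ravel() + rng.normal(0, 0.3, x.shape[0])  # noisy samples of sin(x)
x_train, x_test, y_tr, y_te = train_test_split(x, y_noisy, test_size=0.3, random_state=0)

train_err, test_err, degrees = [], [], range(1, 15)
for d in degrees:
    model = make_pipeline(PolynomialFeatures(degree=d), LinearRegression())
    model.fit(x_train, y_tr)
    train_err.append(mean_squared_error(y_tr, model.predict(x_train)))  # empirical error
    test_err.append(mean_squared_error(y_te, model.predict(x_test)))  # estimate of generalization error

plt.plot(degrees, train_err, "ks--", label="Train MSE")
plt.plot(degrees, test_err, "ro-", label="Test MSE")
plt.xlabel("Polynomial degree", fontsize=12)
plt.ylabel("MSE", fontsize=12)
plt.legend(frameon=False)
plt.grid(ls=":")
plt.show()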

2.2 Evaluation Methods

2.2.1 Hold-out Method
import pandas as pd  # data analysis toolkit
import numpy as np  # numerical computing
from sklearn.preprocessing import LabelEncoder, StandardScaler  # label encoding, standardization
from sklearn.decomposition import PCA  # principal component analysis
import matplotlib.pyplot as plt
wdbc = pd.read_csv(r"C:\Users\???\Desktop\机器学习\数据\wdbc.data", header=None)
wdbc.head()
X, y = wdbc.loc[:, 2:].values, wdbc.loc[:, 1]  # extract the feature matrix and the label column
X = StandardScaler().fit_transform(X)  # standardize the feature data
lab_en = LabelEncoder()  # create an encoder for the target labels
y = lab_en.fit_transform(y)  # fit and transform
lab_en.classes_, lab_en.transform(["B", "M"])
pca = PCA(n_components=6).fit(X)  # keep 6 principal components
evr = pca.explained_variance_ratio_
print("Explained variance ratio per component:", evr, "\nCumulative:", np.cumsum(evr))
X_pca = pca.transform(X)  # project the data onto the principal components
print(X_pca[:5, :])
plt.figure(figsize=(21, 5))
X_b, X_m = X_pca[y == 0], X_pca[y == 1]  # split the reduced data by class
for i in range(3):
    plt.subplot(131 + i)
    plt.plot(X_b[:, i * 2], X_b[:, i * 2 + 1], "ro-", markersize=3, label="benign")
    plt.plot(X_m[:, i * 2], X_m[:, i * 2 + 1], "bx--", markersize=5, label="malignant")
    plt.legend(frameon=False)
    plt.grid(ls=":")
    plt.xlabel(str(i * 2 + 1) + "th principal component", fontsize=12)
    plt.ylabel(str(i * 2 + 2) + "th principal component", fontsize=12)
    plt.title("Each category of data dim reduction by PCA", fontsize=12)
plt.show()
from sklearn.model_selection import train_test_split  # dataset splitting
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import classification_report, accuracy_score  # classification report, accuracy
acc_test_score, acc_train_score = [], []  # train and test scores for each random split
for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.25, random_state=i, shuffle=True, stratify=y)
    log_reg = LogisticRegression()  # all parameters left at their defaults for now
    log_reg.fit(X_train, y_train)  # train the model on the training set
    y_test_pred = log_reg.predict(X_test)  # after training, predict the test samples
    acc_test_score.append(accuracy_score(y_test, y_test_pred))
    acc_train_score.append(accuracy_score(y_train, log_reg.predict(X_train)))

plt.figure(figsize=(7, 5))
plt.plot(acc_test_score, "ro", lw=1.5, markersize=4, label="Test")
plt.plot(acc_train_score, "ks--", lw=1, markersize=4, label="Train")
plt.legend(frameon=False)
plt.grid(ls=":")
plt.xlabel("Random division times", fontsize=12)
plt.ylabel("Accuracy score of test vs train", fontsize=12)
plt.title("Test samples accuracy score Mean = %.5f(+/-%.5f)" % (np.mean(acc_test_score), np.std(acc_test_score)), fontsize=12)
plt.show()
from sklearn.pipeline import make_pipeline  # chain preprocessing and model into one estimator
pipe_lr = make_pipeline(StandardScaler(),  # standardization
                        PCA(n_components=6),  # PCA dimensionality reduction
                        LogisticRegression())  # logistic regression
# split the data into training and test sets at an 8:2 ratio
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20,
                     stratify=y, random_state=1, shuffle=True)
pipe_lr.fit(X_train, y_train)  # train the model
y_pred = pipe_lr.predict(X_test)  # predict
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
2.2.2 K-Fold Cross-Validation
from sklearn.model_selection import StratifiedKFold  # stratified cross-validation

kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)  # 10-fold split of the training data
scores = []  # validation score of each of the 10 folds
for i, (train_idx, test_idx) in enumerate(kfold):
    pipe_lr.fit(X_train[train_idx], y_train[train_idx])  # train on 9 subsets
    score = pipe_lr.score(X_train[test_idx], y_train[test_idx])  # validate on the remaining subset
    scores.append(score)
    print("Fold: %d, Class dist: %s, Acc: %.3f" % (i + 1, np.bincount(y_train[train_idx]), score))
print("CV acc: %.3f (+/-%.3f)" % (np.mean(scores), np.std(scores)))
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10)
print("CV accuracy scores: %s" % scores)
print("CV acc: %.3f (+/-%.3f)" % (np.mean(scores), np.std(scores)))
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
pipe_knn = make_pipeline(StandardScaler(),  # standardization
                         PCA(n_components=10),  # keep 10 principal components
                         KNeighborsClassifier())  # k-nearest neighbors classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y)
k_range = range(1, 31)  # candidate values of k; other parameters left at defaults
cv_scores = []  # mean 10-fold score for each value of k
for k in k_range:
    pipe_knn.set_params(kneighborsclassifier__n_neighbors=k)  # set the n_neighbors parameter
    scores = cross_val_score(estimator=pipe_knn, X=X_train,
                             y=y_train, cv=10)
    cv_scores.append(np.mean(scores))

plt.figure(figsize=(7, 5))
plt.plot(k_range, cv_scores, "ko-", lw=1, markeredgecolor="r")
plt.grid(ls=":")
plt.ylabel("Accuracy score of test", fontsize=12)
plt.xlabel("N_neighbors", fontsize=12)
plt.title("Test samples accuracy of different n_neighbors", fontsize=14)
plt.show()
# take the best parameter, retrain the model, and predict
idx = np.argmax(cv_scores)  # index of the best score
pipe_knn.set_params(kneighborsclassifier__n_neighbors=idx + 1)  # set the best number of neighbors
pipe_knn.fit(X_train, y_train)
y_test_pred = pipe_knn.predict(X_test)
print("Test score is %.5f with KNN n_neighbors=%d" % (accuracy_score(y_test, y_test_pred), idx + 1))
2.2.3 Bootstrapping
def bootstrapping(m):
    """
    Bootstrap sampling: m is the sample size, i.e. the number of draws.
    :param m: number of samples to draw
    :return: array of m sampled indices (drawn with replacement)
    """
    bootstrap = []  # indices drawn in each round of sampling
    for i in range(m):
        bootstrap.append(np.random.randint(0, m))  # draw one sample index at random
    return np.asarray(bootstrap)
print("样本比例正例与反例的比:%d : %d = %.5f"%(len(y[y ==0]), len(y[y == 1]),len(y[y ==0])/len(y[y == 1])))

n_samples = X_pca.shape[0]  # sample size
ratio_bs = []  # fraction of samples left out of the training set in each round
for i in range(15000):
    train_idx = bootstrapping(n_samples)  # one bootstrap round yields the training indices
    idx_all = np.linspace(0, n_samples - 1, n_samples, dtype=int)  # indices of the full sample
    test_idx = np.setdiff1d(idx_all, train_idx)  # indices of the test samples
    ratio_bs.append(len(test_idx) / n_samples)  # fraction of all samples used for testing

y_train = y[train_idx]  # training targets from the last bootstrap round
print("After sampling, ratio of positive to negative examples: %.5f" % (len(y_train[y_train == 0]) / len(y_train[y_train == 1])))
print("Mean fraction not appearing in the training set: %.5f" % np.mean(ratio_bs))  # mean over 15000 bootstrap rounds

import seaborn as sns
sns.displot(ratio_bs,kind="hist",color="purple")
plt.show()

X_train, y_train = X_pca[train_idx, :], y[train_idx]
X_test, y_test = X_pca[test_idx], y[test_idx]

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
print("Test score is %.5f" % accuracy_score(y_test, y_test_pred))
2.2.4 Parameter Tuning and the Final Model
from sklearn.model_selection import GridSearchCV  # grid search
from sklearn.svm import SVC
pipe_svc = make_pipeline(StandardScaler(), PCA(n_components=4), SVC())
X, y = wdbc.loc[:, 2:].values, wdbc.loc[:, 1]  # re-extract the feature matrix and the label column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, shuffle=True, stratify=y)
param_range = [0.001, 0.01, 0.1, 1, 10, 100]  # candidate values of C and gamma
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]  # parameter grid: all combinations
gs_cv = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring="accuracy", cv=10, refit=True)
gs_result = gs_cv.fit(X_train, y_train)  # run the grid search
print("Best: %f, using %s" % (gs_result.best_score_, gs_result.best_params_))

test_means = gs_result.cv_results_['mean_test_score']
params = gs_result.cv_results_['params']
for tm, param in zip(test_means, params):
    print("%f with: %s" % (tm, param))
clf = gs_result.best_estimator_
print("Test accuracy is %.5f" % clf.score(X_test, y_test))
2.2.5 Nested Cross-Validation
# support vector machine
pipe_svc = make_pipeline(StandardScaler(), PCA(n_components=6), SVC())
param_range = [0.001, 0.01, 0.1, 1, 10, 100]  # candidate values of C and gamma
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]  # parameter grid
gs_svc = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring="accuracy", cv=2, refit=True)  # inner loop, 2-fold
score_svc = cross_val_score(gs_svc, X_train, y_train, scoring="accuracy", cv=5)  # outer loop, 5-fold
print("SVC CV accuracy: %.5f +/-%.5f" % (np.mean(score_svc), np.std(score_svc)))

# k-nearest neighbors
pipe_knn = make_pipeline(StandardScaler(), PCA(n_components=6), KNeighborsClassifier())
param_grid = [{'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10]},
              {'kneighborsclassifier__algorithm': ['ball_tree', 'kd_tree', 'brute']}]  # parameter grid
gs_knn = GridSearchCV(estimator=pipe_knn, param_grid=param_grid, scoring="accuracy", cv=2, refit=True)  # inner loop, 2-fold
score_knn = cross_val_score(gs_knn, X_train, y_train, scoring="accuracy", cv=5)  # outer loop, 5-fold
print("KNN CV accuracy: %.3f +/-%.3f" % (np.mean(score_knn), np.std(score_knn)))

# decision tree
from sklearn.tree import DecisionTreeClassifier
pipe_dtc = make_pipeline(StandardScaler(), PCA(n_components=6), DecisionTreeClassifier())
param_grid = [{'decisiontreeclassifier__max_depth': [1, 2, 3, 4, 5, 6, 7, None]},
              {'decisiontreeclassifier__criterion': ['gini', 'entropy']}]  # parameter grid (make_pipeline lowercases step names)
gs_dtc = GridSearchCV(estimator=pipe_dtc, param_grid=param_grid, scoring="accuracy", cv=2, refit=True)  # inner loop, 2-fold
score_dtc = cross_val_score(gs_dtc, X_train, y_train, scoring="accuracy", cv=5)  # outer loop, 5-fold
print("DTC CV accuracy: %.5f +/-%.5f" % (np.mean(score_dtc), np.std(score_dtc)))

2.2.6 Automated Hyperparameter Tuning with Hyperopt

from hyperopt import fmin, tpe, hp, rand  # hyperparameter optimization
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm  # support vector machine
from sklearn.model_selection import train_test_split  # split into training and test sets
from sklearn.datasets import load_digits  # handwritten digits dataset
from sklearn.decomposition import PCA  # principal component analysis
from sklearn.preprocessing import StandardScaler  # standardization
from sklearn.pipeline import make_pipeline  # pipeline assembly
from sklearn.model_selection import cross_val_score  # cross-validation scores

# the handwritten digits dataset built into sklearn.datasets
digits = load_digits()  # load the data into digits
X, y = digits.data, digits.target

fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)  # adjust the subplot layout
for i in range(24):
    ax = fig.add_subplot(4, 6, i + 1, xticks=[], yticks=[])  # add a subplot at position i+1 of a 4x6 grid
    ax.imshow(digits.images[i])  # show the i-th image
plt.show()

parameter_space_svc = {
    'C': hp.loguniform("C", np.log(1), np.log(100)),  # loguniform: the log of the parameter is uniformly distributed
    'kernel': hp.choice('kernel', ['rbf', 'poly']),
    'gamma': hp.loguniform("gamma", np.log(0.001), np.log(0.1))
}
pipe_svc = make_pipeline(StandardScaler(), PCA(n_components=20), svm.SVC())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, shuffle=True, stratify=y)

count = 0  # counter incremented on every parameter combination, used to number the output
cv_scores = []
def hyperopt_train_val(args):
    clf = svm.SVC(**args)  # ** unpacks the dict into keyword arguments
    score = cross_val_score(clf, X_train, y_train).mean()
    cv_scores.append(score)
    global count
    count = count + 1
    print("[%d], %s, Validate acc: %.5f" % (count, args, score))
    return -score

best = fmin(hyperopt_train_val, parameter_space_svc, algo=tpe.suggest, max_evals=100)

kernel_list = ['rbf', 'poly']  # hp.choice returns an index; map it back to the kernel name
best["kernel"] = kernel_list[best["kernel"]]
print("best params:", best)
clf = svm.SVC(**best)  # refit with the best parameters and predict the test data
clf.fit(X_train, y_train)
print("The best params of svc, test score = %.5f" % clf.score(X_test, y_test))

plt.figure(figsize=(8, 6))
plt.plot(cv_scores, "ko--", markersize=5, label="Validation", markeredgecolor="r")
plt.grid(ls=":")
plt.xlabel("eval times", fontsize=12)
plt.ylabel("scores", fontsize=12)
plt.title("Super parameter tuning of SVM by Hyperopt", fontsize=14)
plt.show()


2.3 Performance Metrics

import numpy as np  # numerical computing
import pandas as pd  # data analysis
import matplotlib.pyplot as plt  # visualization


class ModelPerformanceMetrics:
    """
    Model performance metrics for binary and multiclass classification,
    i.e. measures of a model's generalization performance:
    1. Compute the confusion matrix.
    2. Compute the classification report, following the sklearn classification_report layout.
    3. Compute precision (P) and recall (R), visualize the P-R curve, and compute AP.
    4. Compute the ROC metrics (true positive rate, false positive rate), visualize the ROC curve, and compute AUC.
    5. Compute the cost curve: normalized metrics, positive probability cost, visualization, and the expected overall cost.
    """

    def __init__(self, y_true, y_prob):
        """
        Initialize the parameters.
        :param y_true: true class labels of the samples
        :param y_prob: predicted class probabilities of the samples
        """
        self.y_true = np.asarray(y_true, dtype=int)  # convert the input to an array
        self.y_prob = np.asarray(y_prob, dtype=float)
        self.n_samples, self.n_class = self.y_prob.shape  # sample size and number of classes
        if self.n_class > 2:  # multiclass
            self.y_true = self.label_one_hot()
        else:
            self.y_true = self.y_true.reshape(-1)
        self.cm = self.cal_confusion_matrix()  # compute the confusion matrix

    def label_one_hot(self):
        """
        One-hot encode the true labels so their dimension matches the predicted probabilities.
        :return:
        """
        y_true_lab = np.zeros((self.n_samples, self.n_class))  # new array of the given shape, zero-filled
        for i in range(self.n_samples):
            y_true_lab[i, self.y_true[i]] = 1
        return y_true_lab

    def cal_confusion_matrix(self):
        """
        Compute and build the confusion matrix.
        :return:
        """
        confusion_matrix = np.zeros((self.n_class, self.n_class))
        for i in range(self.n_samples):
            idx = np.argmax(self.y_prob[i, :])  # index of the largest probability, i.e. the predicted class
            if self.n_class == 2:
                idx_true = self.y_true[i]
            else:
                idx_true = np.argmax(self.y_true[i, :])  # true class of the i-th sample
            if idx_true == idx:
                confusion_matrix[idx, idx] += 1  # correct prediction: increment the diagonal element
            else:
                confusion_matrix[idx_true, idx] += 1  # wrong prediction: increment row = true class, column = predicted class
        return confusion_matrix

    def cal_classification_report(self, target_names=None):
        """
        Compute and build the classification report.
        :return:
        """
        precision = np.diag(self.cm) / np.sum(self.cm, axis=0)  # precision
        recall = np.diag(self.cm) / np.sum(self.cm, axis=1)  # recall
        f1_score = 2 * precision * recall / (precision + recall)  # harmonic mean F1
        support = np.sum(self.cm, axis=1)  # number of supporting samples per class
        support_all = np.sum(support)  # total sample size
        accuracy = np.sum(np.diag(self.cm)) / support_all  # accuracy
        p_m, r_m = precision.mean(), recall.mean()
        macro_avg = [p_m, r_m, 2 * p_m * r_m / (p_m + r_m)]  # macro averages
        weight = support / support_all  # weight each class by its share of the total sample size
        weight_avg = [np.sum(weight * precision), np.sum(weight * recall), np.sum(weight * f1_score)]

        # build the classification report
        metrics_1 = pd.DataFrame(np.array([precision, recall, f1_score, support]).T,
                                 columns=["precision", "recall", "f1_score", "support"])
        metrics_2 = pd.DataFrame([["", "", "", ""], ["", "", accuracy, support_all],
                                  np.hstack([macro_avg, support_all]),
                                  np.hstack([weight_avg, support_all])],
                                 columns=["precision", "recall", "f1_score", "support"])
        c_report = pd.concat([metrics_1, metrics_2], ignore_index=False)
        if target_names is None:
            target_names = [str(i) for i in range(self.n_class)]
        else:
            target_names = list(target_names)
        target_names.extend(["", "accuracy", "macro_avg", "weighted avg"])
        c_report.index = target_names
        return c_report

    @staticmethod
    def __sort_positive__(y_prob):
        """
        Sort by the predicted positive-class probability in descending order
        and return the index vector of the sort.
        :param y_prob: 1-D array of predicted positive-class probabilities
        :return:
        """
        idx = np.argsort(y_prob)[::-1]  # descending order
        return idx

    def precision_recall_curve(self):
        """
        Precision-recall curve: compute each coordinate point for visualizing the P-R curve.
        :return:
        """
        pr_array = np.zeros((self.n_samples, 2))  # P and R with each sample's predicted probability as the threshold
        if self.n_class == 2:  # binary
            idx = self.__sort_positive__(self.y_prob[:, 0])  # descending order
            y_true = self.y_true[idx]  # reorder the true labels by the sort index
            # use each sample's predicted probability as the threshold and compute the metrics
            for i in range(self.n_samples):
                tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                pr_array[i, :] = tp / (tp + fn), tp / (tp + fp)
        else:  # multiclass
            precision = np.zeros((self.n_samples, self.n_class))  # precision
            recall = np.zeros((self.n_samples, self.n_class))  # recall
            for k in range(self.n_class):  # treat the k-th class as positive
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # k-th column of the true labels
                y_true = y_true_k[idx]  # sort the true labels of the k-th class
                # use each sample's predicted probability as the threshold and compute the metrics
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                    precision[i, k] = tp / (tp + fp)  # precision
                    recall[i, k] = tp / (tp + fn)  # recall
            # macro precision and macro recall
            pr_array = np.array([np.mean(recall, axis=1), np.mean(precision, axis=1)]).T
        return pr_array

    def roc_metrics_curve(self):
        """
        ROC curve: compute the true positive rate and false positive rate for visualization.
        :return:
        """
        roc_array = np.zeros((self.n_samples, 2))  # FPR and TPR with each sample's predicted probability as the threshold
        if self.n_class == 2:  # binary
            idx = self.__sort_positive__(self.y_prob[:, 0])  # descending order
            y_true = self.y_true[idx]  # reorder the true labels by the sort index
            # use each sample's predicted probability as the threshold and compute the metrics
            n_nums, p_nums = len(y_true[y_true == 1]), len(y_true[y_true == 0])  # numbers of negatives (1) and positives (0)
            tp, fn, tn, fp = self.__call_sub_metrics__(y_true, 1)
            roc_array[0, :] = fp / (tn + fp), tp / (tp + fn)
            for i in range(1, self.n_samples):
                if y_true[i] == 1:  # one more negative crosses the threshold: FPR increases
                    roc_array[i, :] = roc_array[i - 1, 0] + 1 / n_nums, roc_array[i - 1, 1]
                else:  # one more positive crosses the threshold: TPR increases
                    roc_array[i, :] = roc_array[i - 1, 0], roc_array[i - 1, 1] + 1 / p_nums
        else:  # multiclass
            fpr = np.zeros((self.n_samples, self.n_class))  # false positive rate
            tpr = np.zeros((self.n_samples, self.n_class))  # true positive rate
            for k in range(self.n_class):  # compute TPR and FPR per class, then average
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # k-th column of the true labels
                y_true = y_true_k[idx]  # sort the true labels of the k-th class
                # use each sample's predicted probability as the threshold and compute the metrics
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                    fpr[i, k] = fp / (tn + fp)  # false positive rate
                    tpr[i, k] = tp / (tp + fn)  # true positive rate
            # macro FPR and macro TPR
            roc_array = np.array([np.mean(fpr, axis=1), np.mean(tpr, axis=1)]).T
        return roc_array

    def __call_sub_metrics__(self, y_true_sort, n):
        """
        Compute TP, FN, TN, and FP.
        :param y_true_sort: true labels after sorting
        :param n: use the n-th sample's predicted probability as the threshold
        :return:
        """
        if self.n_class == 2:
            pre_label = np.r_[np.zeros(n, dtype=int), np.ones(self.n_samples - n, dtype=int)]
            tp = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # true positives (class 0 is positive)
            tn = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # true negatives
            fp = np.sum(y_true_sort) - tn  # false positives
            fn = self.n_samples - tp - tn - fp  # false negatives
        else:
            pre_label = np.r_[np.ones(n, dtype=int), np.zeros(self.n_samples - n, dtype=int)]
            tp = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # true positives
            tn = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # true negatives
            fn = np.sum(y_true_sort) - tp  # false negatives
            fp = self.n_samples - tp - tn - fn  # false positives

        return tp, fn, tn, fp

    @staticmethod
    def __cal_ap__(pr_val):
        """
        Compute the average precision (AP).
        :param pr_val:
        :return:
        """
        return (pr_val[1:, 0] - pr_val[0:-1, 0]).dot(pr_val[1:, 1])

    @staticmethod
    def __cal_auc__(roc_val):
        """
        Compute the area under the ROC curve (AUC) by the trapezoidal rule.
        :param roc_val:
        :return:
        """
        return (roc_val[1:, 0] - roc_val[0:-1, 0]).dot(roc_val[:-1, 1] + roc_val[1:, 1]) / 2

    def plt_pr_curve(self, pr_val, label=None, is_show=True):
        """
        Visualize the P-R curve.
        :param is_show:
        :param pr_val: array of P-R coordinate values
        :param label:
        :return:
        """
        ap = self.__cal_ap__(pr_val)
        plt.figure(figsize=(7, 5))
        if label:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post",
                     label=label + ", AP = %.3f" % ap)
        else:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post")
        plt.title("Precision Recall Curve of Test Samples and AP = %.3f" % ap)
        plt.xlabel("Recall", fontdict={"fontsize": 12})
        plt.ylabel("Precision", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()

    def plt_roc_curve(self, roc_val, label=None, is_show=True):
        """
        Visualize the ROC curve.
        :param is_show:
        :param roc_val: array of ROC coordinate values
        :param label:
        :return:
        """
        auc = self.__cal_auc__(roc_val)
        plt.figure(figsize=(7, 5))
        if label:
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post",
                     label=label + ", AUC = %.3f" % auc)
        else:
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post")
        plt.title("ROC Curve of Test Samples and AUC = %.3f" % auc)
        plt.xlabel("False Positive Rate", fontdict={"fontsize": 12})
        plt.ylabel("True Positive Rate", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()
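A minimal usage sketch of the class above, assuming the wdbc hold-out split from Section 2.2.1 (X_train, X_test, y_train, y_test with encoded labels) is still in scope; note that the class treats class 0 (benign) as the positive class:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_test)  # shape (n_samples, 2); column 0 is P(class 0)

pm = ModelPerformanceMetrics(y_test, y_prob)
print(pm.cal_classification_report(target_names=["benign", "malignant"]))
pm.plt_pr_curve(pm.precision_recall_curve(), label="LogisticRegression")
pm.plt_roc_curve(pm.roc_metrics_curve(), label="LogisticRegression")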
