二、模型评估与选择
数据来源:https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
2.1经验误差与过拟合
2.2评估方法
2.2.1留出法(hold-out)
import pandas as pd # 数据科学计算工具
import numpy as np # 数值计算工具
from sklearn.preprocessing import LabelEncoder,StandardScaler # 类别标签编码,标准化处理
from sklearn.decomposition import PCA # 主成分分析
import matplotlib.pyplot as plt
# Load the WDBC data file; header=None because the first row is read as data.
# A raw string is used for the Windows path: the previous literal only worked
# because Python currently tolerates unknown escapes such as "\?" and "\D"
# (they trigger a warning and are scheduled to become errors).
wdbc = pd.read_csv(r"C:\Users\???\Desktop\机器学习\数据\wdbc.data", header=None)
wdbc.head()

# Columns 2.. are used as features, column 1 as the class label.
X, y = wdbc.loc[:, 2:].values, wdbc.loc[:, 1]
X = StandardScaler().fit_transform(X)  # standardise every feature column
lab_en = LabelEncoder()  # maps the label strings to integer codes
y = lab_en.fit_transform(y)
lab_en.classes_, lab_en.transform(["B", "M"])  # notebook display: label -> code mapping

# Principal component analysis keeping the first 6 components.
pca = PCA(n_components=6).fit(X)
evr = pca.explained_variance_ratio_
print("各主成分贡献率",evr, "\n累计贡献率",np.cumsum(evr))
X_pca = pca.transform(X)  # project the samples onto the components
print(X_pca[:5, :])
# Scatter the two classes in three panels: components (1, 2), (3, 4) and (5, 6).
plt.figure(figsize=(21, 5))
X_b, X_m = X_pca[y == 0], X_pca[y == 1]  # split the projected samples by encoded class
for i in range(3):
    plt.subplot(131 + i)
    plt.plot(X_b[:, i * 2], X_b[:, i * 2 + 1], "ro-", markersize=3, label="benign")
    plt.plot(X_m[:, i * 2], X_m[:, i * 2 + 1], "bx--", markersize=5, label="malignant")
    plt.legend(frameon=False)
    plt.grid(ls=":")
    plt.xlabel(str(i * 2 + 1) + "th principal component", fontsize=12)
    # The vertical axis shows the second component of the pair; this used to be
    # a second xlabel call that silently replaced the x-axis label.
    plt.ylabel(str(i * 2 + 2) + "th principal component", fontsize=12)
    plt.title("Each category of data dim reduction by PCA", fontsize=12)
plt.show()
from sklearn.model_selection import train_test_split #划分数据集
from sklearn.linear_model import LogisticRegression #逻辑回归
from sklearn.metrics import classification_report, accuracy_score #分类报告,正确率
# Hold-out evaluation repeated over 50 different random stratified splits.
acc_test_score, acc_train_score = [], []  # accuracy per split on test / train data
for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y, test_size=0.25, random_state=i, shuffle=True, stratify=y)
    log_red = LogisticRegression()  # all hyper-parameters left at their defaults
    log_red.fit(X_train, y_train)
    y_test_pred = log_red.predict(X_test)
    acc_test_score.append(accuracy_score(y_test, y_test_pred))
    acc_train_score.append(accuracy_score(y_train, log_red.predict(X_train)))

plt.figure(figsize=(7,5))
plt.plot(acc_test_score, "ro", lw=1.5, markersize=4, label="Test")
plt.plot(acc_train_score, "ks--", lw=1, markersize=4, label="Train")
plt.legend(frameon=False)
plt.grid(ls=":")
plt.xlabel("Random division times", fontsize=12)
# This is the y-axis label; it used to be a second xlabel call that replaced
# the x-axis label set on the previous line.
plt.ylabel("Accuracy score of test vs train", fontsize=12)
plt.title("Test samples accuracy score Mean = %.5f(+/-%.5f)" % (np.mean(acc_test_score), np.std(acc_test_score)), fontsize=12)
plt.show()
from sklearn.pipeline import make_pipeline # 标准化
# Chain the preprocessing steps and the classifier into one estimator.
pipe_lr = make_pipeline(
    StandardScaler(),      # standardise the features
    PCA(n_components=6),   # keep 6 principal components
    LogisticRegression(),  # logistic-regression classifier
)
# Stratified 80/20 split into training and test sets.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20,
                     stratify=y, random_state=1, shuffle=True)
pipe_lr.fit(X_train, y_train)  # fit all steps on the training split
y_pred = pipe_lr.predict(X_test)  # predicted labels for the test split
# Label fixed from "Text" to "Test": the value is the accuracy on the test split.
print('Test Accuracy:%.3f' % pipe_lr.score(X_test,y_test))
2.2.2K折交叉验证法
from sklearn.model_selection import StratifiedKFold #交叉验证
# Stratified 10-fold cross-validation written out by hand on the training split.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []  # validation accuracy of every fold
for i, (train_idx, test_idx) in enumerate(kfold):
    fold_X, fold_y = X_train[train_idx], y_train[train_idx]
    pipe_lr.fit(fold_X, fold_y)  # nine folds are used for fitting
    score = pipe_lr.score(X_train[test_idx], y_train[test_idx])  # the remaining fold validates
    scores.append(score)
    class_dist = np.bincount(fold_y)  # per-class sample counts of the fitting part
    print("Fold: %d,Class dist: %s, Acc: %.3f" % (i + 1, class_dist, score))
print("CV acc: %.3f (+/-%.3f)" % (np.mean(scores), np.std(scores)))

# The same evaluation through the cross_val_score helper.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
)
print("CV accuracy scores: %s" % scores)
print("CV acc: %.3f (+/-%.3f)" % (np.mean(scores), np.std(scores)))
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
pipe_knn = make_pipeline(
    StandardScaler(),        # standardise the features
    PCA(n_components=10),    # keep 10 principal components
    KNeighborsClassifier(),  # k-nearest-neighbours classifier
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

k_range = range(1, 31)  # candidate values for n_neighbors; other parameters stay default
cv_scores = []  # mean 10-fold accuracy for every candidate k
for k in k_range:
    pipe_knn.set_params(kneighborsclassifier__n_neighbors=k)
    scores = cross_val_score(estimator=pipe_knn, X=X_train,
                             y=y_train, cv=10)
    cv_scores.append(np.mean(scores))

plt.figure(figsize=(7, 5))
plt.plot(k_range, cv_scores, "ko-", lw=1, markeredgecolor="r")
plt.grid(ls=":")
plt.ylabel("Accuracy score of test", fontsize=12)
plt.xlabel("N_neighbors", fontsize=12)
plt.title("Test samples accuracy of different n_neighbors", fontsize=14)
plt.show()

# Refit with the best k and evaluate on the held-out test split.
idx = np.argmax(cv_scores)  # position of the best mean score
# Read k from k_range instead of assuming "index + 1", so the result stays
# correct if the candidate range is ever changed.
best_k = k_range[idx]
pipe_knn.set_params(kneighborsclassifier__n_neighbors=best_k)
pipe_knn.fit(X_train, y_train)
y_test_pred = pipe_knn.predict(X_test)
print("Test score is %.5f with KNN n_neighbors=%d" % (accuracy_score(y_test, y_test_pred), best_k))
2.2.3自助法
def bootstrapping(m):
    """
    Draw one bootstrap sample of row indices.

    ``m`` indices are drawn uniformly with replacement from ``0 .. m - 1``
    using NumPy's global random state.

    :param m: number of samples, which is also the number of draws
    :return: 1-D integer array of length ``m`` with the sampled indices
             (empty when ``m`` is not positive)
    """
    if m <= 0:
        # Nothing to draw from; randint would reject an empty range.
        return np.empty(0, dtype=int)
    # One vectorised draw replaces m separate single-element randint calls.
    return np.random.randint(0, m, size=m)
# Class balance of the full data set (count of label 0 versus label 1).
print("样本比例正例与反例的比:%d : %d = %.5f"%(len(y[y ==0]), len(y[y == 1]),len(y[y ==0])/len(y[y == 1])))

n_samples = X_pca.shape[0]  # number of samples
idx_all = np.arange(n_samples)  # every sample index; loop-invariant, so built once
ratio_bs = []  # share of samples left out of the bootstrap sample, per repetition
for i in range(15000):
    train_idx = bootstrapping(n_samples)  # indices drawn with replacement
    test_idx = np.setdiff1d(idx_all, train_idx)  # indices that were never drawn
    ratio_bs.append(len(test_idx) / n_samples)

# The last bootstrap sample of the loop is used as the example from here on.
y_train = y[train_idx]
print("抽样后,正例与反例得比例:%.5f" % (len(y_train[y_train == 0])/len(y_train[y_train == 1])))
print("自助采样后,未出现在训练集中的比例:%.5f" % np.mean(ratio_bs))  # mean over the 15000 repetitions

import seaborn as sns
sns.displot(ratio_bs,kind="hist",color="purple")
plt.show()

# Train on the bootstrap sample and test on the samples it left out.
X_train, y_train = X_pca[train_idx, :], y[train_idx]
# Must be X_test (capital X): the old lowercase name left the prediction below
# running on the stale X_test of an earlier train/test split.
X_test, y_test = X_pca[test_idx], y[test_idx]
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
print("Test score is %.5f" % accuracy_score(y_test,y_test_pred))
2.2.4调参与最终模型
from sklearn.model_selection import GridSearchCV # 网格搜索
from sklearn.svm import SVC
# Grid search over an SVC pipeline: standardise -> 4 principal components -> SVC.
pipe_svc = make_pipeline(
    StandardScaler(),
    PCA(n_components=4),
    SVC(),
)

# Take the raw features (columns 2..) and the label column (column 1) again.
X = wdbc.loc[:, 2:].values
y = wdbc.loc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, shuffle=True, stratify=y
)

# Candidate values shared by C and gamma.
param_range = [0.001, 0.01, 0.1, 1, 10, 100]
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

gs_cv = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
    refit=True,
)
gs_result = gs_cv.fit(X_train, y_train)  # run the search
print("Best: %f, using %s" % (gs_result.best_score_, gs_result.best_params_))

# Mean cross-validated score of every parameter combination that was tried.
cv_results = gs_result.cv_results_
test_means = cv_results['mean_test_score']
params = cv_results['params']
for tm, param in zip(test_means, params):
    print("%f with: %s" % (tm, param))

# refit=True, so best_estimator_ is already fitted on the whole training split.
clf = gs_result.best_estimator_
print("Test accuracy is %.5f" % clf.score(X_test, y_test))
2.2.5嵌套交叉验证
# Nested cross-validation: support vector machine
pipe_svc = make_pipeline(
    StandardScaler(),
    PCA(n_components=6),
    SVC(),
)
param_range = [0.001, 0.01, 0.1, 1, 10, 100]  # candidate values for C and gamma
param_grid = [
    {'svc__C': param_range, 'svc__kernel': ['linear']},
    {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']},
]
# Inner loop: 2-fold grid search that picks the hyper-parameters.
gs_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=2,
    refit=True,
)
# Outer loop: 5-fold cross-validation of the whole search procedure.
score_svc = cross_val_score(gs_svc, X_train, y_train, scoring="accuracy", cv=5)
print("SVC CV accuracy:%.5f +/-%.5f" % (np.mean(score_svc), np.std(score_svc)))
# Nested cross-validation: k-nearest neighbours
pipe_knn = make_pipeline(
    StandardScaler(),
    PCA(n_components=6),
    KNeighborsClassifier(),
)
# Two separate grids are listed: one varies n_neighbors, the other the algorithm.
param_grid = [
    {'kneighborsclassifier__n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10]},
    {'kneighborsclassifier__algorithm': ['ball_tree', 'kd_tree', 'brute']},
]
# Inner loop: 2-fold grid search that picks the hyper-parameters.
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=2,
    refit=True,
)
# Outer loop: 5-fold cross-validation of the whole search procedure.
score_knn = cross_val_score(gs_knn, X_train, y_train, scoring="accuracy", cv=5)
print("KNN CV accuracy:%.3f +/-%.3f" % (np.mean(score_knn), np.std(score_knn)))
# Nested cross-validation: decision tree
# DecisionTreeClassifier is not imported anywhere above, so import it here.
from sklearn.tree import DecisionTreeClassifier

pipe_dtc = make_pipeline(StandardScaler(), PCA(n_components=6), DecisionTreeClassifier())
# make_pipeline names each step after the lower-cased class name, so the
# parameter prefix has to be "decisiontreeclassifier__"; the mixed-case keys
# used before are rejected as invalid parameters.
# Two separate grids are listed: one varies max_depth, the other the criterion.
param_grid = [{'decisiontreeclassifier__max_depth': [1, 2, 3, 4, 5, 6, 7, None]},
              {'decisiontreeclassifier__criterion': ['gini', 'entropy']}]
# Inner loop: 2-fold grid search that picks the hyper-parameters.
gs_dtc = GridSearchCV(estimator=pipe_dtc, param_grid=param_grid, scoring="accuracy", cv=2, refit=True)
# Outer loop: 5-fold cross-validation. It must score gs_dtc; the previous code
# passed gs_svc and therefore reported the SVC result a second time.
score_dtc = cross_val_score(gs_dtc, X_train, y_train, scoring="accuracy", cv=5)
print("DTC CV accuracy:%.5f +/-%.5f" % (np.mean(score_dtc), np.std(score_dtc)))
2.2.6Hyperopt自动化超参数调优
from hyperopt import fmin, tpe, hp, rand # 超参数优化技术
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm # 支持向量机
from sklearn.model_selection import train_test_split # 分割训练集和测试集
from sklearn.datasets import load_digits # 加载手写数据集
from sklearn.decomposition import PCA # 主成分分析
from sklearn.preprocessing import StandardScaler # 标准化处理
from sklearn.pipeline import make_pipeline # 标准化
from sklearn.model_selection import cross_val_score # 交叉验证评分
# Hand-written digits data set bundled with sklearn.datasets
digits = load_digits()
X, y = digits.data, digits.target

# Preview the first 24 images on a 4 x 6 grid without axis ticks.
fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(
    left=0, right=1, bottom=0, top=1,
    hspace=0.05, wspace=0.05,
)
for i, image in enumerate(digits.images[:24]):
    ax = fig.add_subplot(4, 6, i + 1, xticks=[], yticks=[])  # grid cell number i + 1
    ax.imshow(image)
plt.show()
# Search space handed to hyperopt.
parameter_space_svc = {
    # loguniform: the logarithm of the sampled value is uniformly distributed
    'C': hp.loguniform("C", np.log(1), np.log(100)),
    'kernel': hp.choice('kernel', ['rbf', 'poly']),
    'gamma': hp.loguniform("gamma", np.log(0.001), np.log(0.1)),
}
pipe_svc = make_pipeline(
    StandardScaler(),
    PCA(n_components=20),
    svm.SVC(),
)
# Stratified 80/20 split of the digits data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=0,
    shuffle=True,
    stratify=y,
)
count = 0  # number of evaluations so far; only used to number the log lines
cv_scores = []  # mean cross-validation accuracy of every evaluation, in call order


def hyperopt_train_val(args):
    """
    Objective function for hyperopt: score one SVC parameter combination.

    The mean cross-validation accuracy on the training split is appended to
    ``cv_scores`` and printed together with a running evaluation number.

    :param args: dict of keyword arguments forwarded to ``svm.SVC``
    :return: negated mean accuracy, because the optimiser minimises
    """
    global count
    model = svm.SVC(**args)  # ** unpacks the dict into keyword arguments
    mean_acc = cross_val_score(model, X_train, y_train).mean()
    cv_scores.append(mean_acc)
    count += 1
    print("[%d], %s, Validate acc: %5f" % (count, args, mean_acc))
    return -mean_acc
# Run 100 evaluations of the objective with the TPE algorithm.
best = fmin(hyperopt_train_val, parameter_space_svc, algo=tpe.suggest, max_evals=100)
# best["kernel"] is used as an index into the hp.choice option list, so this
# list must match the search space ['rbf', 'poly'] exactly. It used to contain
# the misspelling 'ploy', which is not a valid SVC kernel.
kernel_list = ['rbf', 'poly']
best["kernel"] = kernel_list[best["kernel"]]
print("best params:", best)

clf = svm.SVC(**best)  # refit with the best parameters on the training split
clf.fit(X_train, y_train)
# Report the score on the held-out test split; the previous code scored the
# training split while labelling the number as a test score.
print("The best params of svc, test scores = %.5f" % clf.score(X_test, y_test))

# Cross-validation accuracy of every evaluation, in the order they were run.
plt.figure(figsize=(8, 6))
plt.plot(cv_scores, "ko--", markersize=5, label="Test", markeredgecolor="r")
plt.grid(ls=":")
plt.xlabel("eval times", fontsize=12)
plt.ylabel("scores", fontsize=12)
plt.title("Super parameter tuning of SVM by Hyperopt", fontsize=14)
plt.show()
2.3性能度量
import numpy as np # 数值计算
import pandas as pd # 数据分析
import matplotlib.pyplot as plt # 可视化
class ModelperformanceMetrices:
    """
    Generalisation-performance metrics for binary and multi-class classifiers.

    1. confusion matrix
    2. classification report, laid out like sklearn's ``classification_report``
    3. precision / recall points, P-R curve plot and AP
    4. ROC points (true / false positive rate), ROC curve plot and AUC
    5. cost curve: normalised cost, positive-probability cost, plot and
       expected total cost
       NOTE(review): item 5 comes from the original description; no cost-curve
       method is defined among the methods directly below - confirm.

    Conventions used by the code below:
    * binary case: label ``0`` / column 0 of ``y_prob`` is the positive class;
    * multi-class case: every class is scored one-vs-rest on its own
      probability column and the per-class values are averaged.
    """

    def __init__(self, y_true, y_prob):
        """
        Store the inputs and pre-compute the confusion matrix.

        :param y_true: true class labels as integers ``0 .. n_class - 1``
        :param y_prob: predicted class probabilities, shape ``(n_samples, n_class)``
        :raises ValueError: if ``y_prob`` is not two-dimensional
        """
        self.y_true = np.asarray(y_true, dtype=int)
        self.y_prob = np.asarray(y_prob, dtype=float)
        if self.y_prob.ndim != 2:
            raise ValueError("y_prob must be a 2-D array of shape (n_samples, n_class)")
        self.n_samples, self.n_class = self.y_prob.shape  # sample count and class count
        if self.n_class > 2:  # multi-class: work with one-hot labels
            self.y_true = self.label_one_hot()
        else:
            self.y_true = self.y_true.reshape(-1)
        self.cm = self.cal_confusion_matrix()

    def label_one_hot(self):
        """
        One-hot encode the true labels so they have the same shape as ``y_prob``.

        :return: float array of shape ``(n_samples, n_class)``
        """
        y_true_lab = np.zeros((self.n_samples, self.n_class))
        # reshape(-1) so that labels given as a column vector are accepted too
        y_true_lab[np.arange(self.n_samples), self.y_true.reshape(-1)] = 1
        return y_true_lab

    def cal_confusion_matrix(self):
        """
        Build the confusion matrix (rows: true class, columns: predicted class).

        The predicted class of a sample is the column with the largest probability.

        :return: float array of shape ``(n_class, n_class)``
        """
        confusion_matrix = np.zeros((self.n_class, self.n_class))
        for i in range(self.n_samples):
            idx_pred = np.argmax(self.y_prob[i, :])
            if self.n_class == 2:
                idx_true = self.y_true[i]
            else:
                idx_true = np.argmax(self.y_true[i, :])  # position of the 1 in the one-hot row
            # Correct predictions land on the diagonal, errors off it.
            confusion_matrix[idx_true, idx_pred] += 1
        return confusion_matrix

    def cal_classification_report(self, target_names=None):
        """
        Build the classification report as a DataFrame.

        Per-class rows hold precision, recall, F1 and support; they are followed
        by a blank row and the rows "accuracy", "macro_avg" and "weighted avg".
        The macro F1 is computed from the macro precision and macro recall.

        :param target_names: optional display names for the classes; defaults to
                             ``"0" .. str(n_class - 1)``
        :return: ``pandas.DataFrame`` indexed by class name / summary row name
        """
        precision = np.diag(self.cm) / np.sum(self.cm, axis=0)
        recall = np.diag(self.cm) / np.sum(self.cm, axis=1)
        f1_score = 2 * precision * recall / (precision + recall)  # harmonic mean of P and R
        support = np.sum(self.cm, axis=1)  # number of true samples per class
        support_all = np.sum(support)  # total number of samples
        accuracy = np.sum(np.diag(self.cm)) / support_all
        p_m, r_m = precision.mean(), recall.mean()
        macro_avg = [p_m, r_m, 2 * p_m * r_m / (p_m + r_m)]
        weight = support / support_all  # class share of all samples, used as weight
        weight_avg = [np.sum(weight * precision), np.sum(weight * recall), np.sum(weight * f1_score)]
        # Assemble the report: per-class block followed by the summary block.
        metrics_1 = pd.DataFrame(np.array([precision, recall, f1_score, support]).T,
                                 columns=["precision", "recall", "f1_score", "support"])
        metrics_2 = pd.DataFrame([["", "", "", ""], ["", "", accuracy, support_all],
                                  np.hstack([macro_avg, support_all]),
                                  np.hstack([weight_avg, support_all])],
                                 columns=["precision", "recall", "f1_score", "support"])
        c_report = pd.concat([metrics_1, metrics_2], ignore_index=False)
        if target_names is None:
            target_names = [str(i) for i in range(self.n_class)]
        else:
            target_names = list(target_names)  # copy, so the caller's list is not extended
        target_names.extend(["", "accuracy", "macro_avg", "weighted avg"])
        c_report.index = target_names
        return c_report

    @staticmethod
    def __sort_positive__(y_prob):
        """
        Return the indices that sort the given probabilities in descending order.

        :param y_prob: 1-D array with the predicted probability of the positive class
        :return: index array
        """
        idx = np.argsort(y_prob)[::-1]
        return idx

    def precision_recall_curve(self):
        """
        Compute the points of the precision-recall curve.

        Every sample's predicted probability is used as a threshold in turn.

        :return: array of shape ``(n_samples, 2)``; column 0 is recall and
                 column 1 is precision (averaged over the classes in the
                 multi-class case)
        """
        pr_array = np.zeros((self.n_samples, 2))
        if self.n_class == 2:  # binary
            idx = self.__sort_positive__(self.y_prob[:, 0])
            y_true = self.y_true[idx]  # true labels in descending-probability order
            for i in range(self.n_samples):
                tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                pr_array[i, :] = tp / (tp + fn), tp / (tp + fp)
        else:  # multi-class, one-vs-rest per class
            precision = np.zeros((self.n_samples, self.n_class))
            recall = np.zeros((self.n_samples, self.n_class))
            for k in range(self.n_class):
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # one-hot column of class k
                y_true = y_true_k[idx]
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                    precision[i, k] = tp / (tp + fp)
                    recall[i, k] = tp / (tp + fn)
            # macro recall and macro precision
            pr_array = np.array([np.mean(recall, axis=1), np.mean(precision, axis=1)]).T
        return pr_array

    def roc_metrics_curve(self):
        """
        Compute the points of the ROC curve.

        :return: array of shape ``(n_samples, 2)``; column 0 is the false
                 positive rate and column 1 the true positive rate (averaged
                 over the classes in the multi-class case)
        """
        roc_array = np.zeros((self.n_samples, 2))
        if self.n_class == 2:  # binary
            idx = self.__sort_positive__(self.y_prob[:, 0])
            y_true = self.y_true[idx]  # true labels in descending-probability order
            # number of true negatives (label 1) and true positives (label 0)
            n_nums, p_nums = len(y_true[y_true == 1]), len(y_true[y_true == 0])
            tp, fn, tn, fp = self.__call_sub_metrics__(y_true, 1)
            roc_array[0, :] = fp / (tn + fp), tp / (tp + fn)
            # Walk down the ranking: a negative sample moves the curve right by
            # 1 / n_nums, a positive sample moves it up by 1 / p_nums.
            for i in range(1, self.n_samples):
                if y_true[i] == 1:
                    roc_array[i, :] = roc_array[i - 1, 0] + 1 / n_nums, roc_array[i - 1, 1]
                else:
                    roc_array[i, :] = roc_array[i - 1, 0], roc_array[i - 1, 1] + 1 / p_nums
        else:  # multi-class, one-vs-rest per class
            fpr = np.zeros((self.n_samples, self.n_class))
            tpr = np.zeros((self.n_samples, self.n_class))
            for k in range(self.n_class):
                idx = self.__sort_positive__(self.y_prob[:, k])
                y_true_k = self.y_true[:, k]  # one-hot column of class k
                y_true = y_true_k[idx]
                for i in range(self.n_samples):
                    tp, fn, tn, fp = self.__call_sub_metrics__(y_true, i + 1)
                    # FPR = FP / (TN + FP), the same formula as the binary
                    # branch; the former fp / (tp + fp) is 1 - precision.
                    fpr[i, k] = fp / (tn + fp)
                    tpr[i, k] = tp / (tp + fn)
            # macro-averaged FPR and TPR
            roc_array = np.array([np.mean(fpr, axis=1), np.mean(tpr, axis=1)]).T
        return roc_array

    def __call_sub_metrics__(self, y_true_sort, n):
        """
        Count TP, FN, TN and FP when the top ``n`` ranked samples are predicted positive.

        :param y_true_sort: true labels ordered by descending positive probability
        :param n: number of leading samples predicted as positive
        :return: tuple ``(tp, fn, tn, fp)``
        """
        if self.n_class == 2:
            # binary: label 0 is positive, label 1 is negative
            pre_label = np.r_[np.zeros(n, dtype=int), np.ones(self.n_samples - n, dtype=int)]
            tp = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # true positives
            tn = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # true negatives
            fp = np.sum(y_true_sort) - tn  # actual negatives minus TN = false positives
            fn = self.n_samples - tp - tn - fp  # false negatives
        else:
            # one-vs-rest: 1 marks the current class (positive), 0 the rest
            pre_label = np.r_[np.ones(n, dtype=int), np.zeros(self.n_samples - n, dtype=int)]
            tp = len(pre_label[(pre_label == 1) & (pre_label == y_true_sort)])  # true positives
            tn = len(pre_label[(pre_label == 0) & (pre_label == y_true_sort)])  # true negatives
            fn = np.sum(y_true_sort) - tp  # actual positives minus TP = false negatives
            fp = self.n_samples - tp - tn - fn  # false positives
        return tp, fn, tn, fp

    @staticmethod
    def __cal_ap__(pr_val):
        """
        Compute AP as the sum of recall increments times the precision at each point.

        :param pr_val: array of (recall, precision) points
        :return: AP value
        """
        return (pr_val[1:, 0] - pr_val[0:-1, 0]).dot(pr_val[1:, 1])

    @staticmethod
    def __cal_auc__(roc_val):
        """
        Compute the area under the ROC curve with the trapezoidal rule.

        :param roc_val: array of (FPR, TPR) points
        :return: AUC value
        """
        return (roc_val[1:, 0] - roc_val[0:-1, 0]).dot(roc_val[:-1, 1] + roc_val[1:, 1]) / 2

    def plt_pr_curve(self, pr_val, label=None, is_show=True):
        """
        Plot the P-R curve in a new figure.

        :param pr_val: array of (recall, precision) points
        :param label: optional legend label; the AP value is appended to it
        :param is_show: call ``plt.show()`` when True
        :return: None
        """
        ap = self.__cal_ap__(pr_val)
        plt.figure(figsize=(7, 5))
        if label:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post",
                     label=label + ", AP = %.3f" % ap)
        else:
            plt.step(pr_val[:, 0], pr_val[:, 1], "-", lw=2, where="post")
        plt.title("Precision Recall Curve of Test Samples and AP = %.3f" % ap)
        plt.xlabel("Recall", fontdict={"fontsize": 12})
        plt.ylabel("Precision", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()

    def plt_roc_curve(self, roc_val, label=None, is_show=True):
        """
        Plot the ROC curve in a new figure.

        :param roc_val: array of (FPR, TPR) points
        :param label: optional legend label; the AUC value is appended to it
        :param is_show: call ``plt.show()`` when True
        :return: None
        """
        auc = self.__cal_auc__(roc_val)
        plt.figure(figsize=(7, 5))
        if label:
            # The appended value is the AUC; the legend text used to call it "AP".
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post",
                     label=label + ", AUC = %.3f" % auc)
        else:
            plt.step(roc_val[:, 0], roc_val[:, 1], "-", lw=2, where="post")
        plt.title("ROC Curve of Test Samples and AUC = %.3f" % auc)
        plt.xlabel("False Positive Rate", fontdict={"fontsize": 12})
        plt.ylabel("True Positive Rate", fontdict={"fontsize": 12})
        plt.grid(ls=":")
        plt.legend(frameon=False)
        if is_show:
            plt.show()