分析并使用sklearn进行模型性能度量

最新推荐文章于 2022-11-21 01:31:22 发布

Amelia0911

最新推荐文章于 2022-11-21 01:31:22 发布

阅读量495

点赞数 2

分类专栏：机器学习

本文链接：https://blog.csdn.net/liuzhuomei0911/article/details/114115760

版权

机器学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

一、准确率计算：(混淆矩阵)

#混淆矩阵用于模型预测时：
from torchnet import meter
confusion_matrix = meter.ConfusionMeter(class_num)#指定类别数目
outputs = model(images) #每个batch的输出
confusion_matrix.add(outputs.data.squeeze(), labels.type(torch.LongTensor)) #添加每个batch输出
cm_value = confusion_matrix.value()#输出整个测试集的结果
accuracy = 100. * (cm_value[0][0] + cm_value[1][1]) / (cm_value.sum()) #根据混淆矩阵的tp+tn及测试集数目，计算准确率

 #模型预测，求准确率的常用方式：
 outputs = model(images) #一个batch的输出
 _, predicts = torch.max(outputs, 1) #取每个样本的预测的最可能的类别
 corrects += torch.sum(predicts.view(-1) == labels.view(-1)).item() #预测值与真实值对比，统计正确预测的数目
 accuracy = corrects / len(test_loader.dataset) #准确率
 
#混淆矩阵用在numpy数组统计时：
二分类：正例数->15，反例数->18
y_ture = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y_pred= np.array([1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_ture, y_pred)
accuracy = (cm[0][0]+cm[1][1])/np.sum(cm) #即 accuracy = (tn+tp)/样本总数
cm: #混淆矩阵
 [[ 12  6]  [[tn, fp]
  [ 6   9]]  [fn, tp]]

二、fps假正例数与tps真正例数：

真实值与预测值(类别或概率)进行比较。

预测值是类别时，以类别id作为阈值列表，进行统计。
预测值是概率时，将预测概率从大到小排序，取其中不重复的值作为阈值，统计每个阈值下对应的TP和FP，组成tps和fps；后续再据此计算对应的fpr和tpr，绘制roc曲线，并求得相应的auc。

#源码：(roc_curve函数中调用)
fps, tps, thresholds = _binary_clf_curve( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) 
#添加一个额外的阈值位置，以确保曲线从（0，0）开始。
#thresholds = np.r_[thresholds[0] + 1, thresholds]

#例：
from sklearn.metrics import roc_curve
y_ture = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y_pred= np.array([1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
fpr, tpr, thresholds = roc_curve(y_ture, y_pred, pos_label=1)
# thresholds: [2 1 0]
print('fpr: ', fpr, '\n', 'tpr: ', tpr)
#tps和fps统计直观实现：
pos_label = 1
tps = []
fps = []
for threshold in thresholds: #将预测概率进行排序，并选取不重复概率值作为阈值。同时添加一个额外阈值
    # 阈值化, 预测为正例：tp+fp
    positive = [1 if i >= threshold else 0 for i in y_pred]
    # 真正例：tp
    tp = [i == pos_label and j == 1 for i, j in zip(y_ture, positive)]
    # 假正例：fp
    fp = [i != pos_label and j == 1 for i, j in zip(y_ture, positive)]
    tps.append(tp.count(True))
    fps.append(fp.count(True))
print('fps: ', fps, '\n', 'tps: ', tps)
#输出：
fpr:  [0.  0.33333333 1. ] 
tpr:  [0.  0.6        1. ]
fps:  [0, 6, 18] 
tps:  [0, 9, 15]

三、fpr假正例率和tpr真正例率：

假正例率：fpr = fp / 反例数
真正例率：tpr = tp / 正例数

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_ture, y_pred, pos_label=1)

#预测值是类别
二分类：正例数->15，反例数->18
1. 先计算出真实值与预测值的fps和tps
fps:  [6. 18.] ->[fp, 反例数]
tps:  [9. 15.] ->[tp, 正例数]
2. 计算对应的fpr和tpr（添加一个额外的位置，确保曲线从（0，0）开始）
fps:  [ 0.  6. 18.] # fps = np.r_[0, fps]
tps:  [ 0.  9. 15.] # tps = np.r_[0, tps]
fpr: [0.  0.33333333 1. ] # fpr = fps / fps[-1]
tpr: [0.  0.6        1. ] # tpr = tps / tps[-1]

#预测值是概率：
y_ture = np.array([1, 1, 2, 2])
y_pred = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = roc_curve(y_ture, y_pred, pos_label=2)
# 阈值从大到小进行统计
TH:   [1.8 0.8 0.4 0.35 0.1] #np.r_[thresholds[0] + 1, thresholds]
fps:  [0.  0.  1.  1.  2.]  # fps = np.r_[0, fps]
tps:  [0.  1.  1.  2.  2.]  # tps = np.r_[0, tps]
fpr:  [0.  0.  0.5 0.5 1. ] # fpr = fps / fps[-1]
tpr:  [0.  0.5 0.5 1.  1. ] # tpr = tps / tps[-1]

绘制roc曲线：

fpr - x轴
tpr - y轴

plt.figure()
lw = 2
#plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) #面积表示auc
plt.plot(fpr, tpr, color='darkorange', lw=lw)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

四、AUC计算：

即roc曲线下方的面积。

from sklearn.metrics import auc
roc_auc = auc(fpr, tpr) #fpr(假正例率)和tpr(真正例率)
fpr: [0.  0.33333333 1. ] 
tpr: [0.  0.6        1. ]
auc:  0.6333333333333334 #面积
#np函数
aa = np.trapz([0, 0.6, 1], x=[0, 0.33333333, 1])
#具体计算(梯形面积：(上底+下底)*高/2)
bb = (0+0.6)*(0.33333333-0)*0.5+(0.6+1)*(1-0.33333333)*0.5

五、查准率P与查全率R计算：

查准率：P = tp / (tp+fp)
查全率：R = tp / (tp+fn)

from sklearn.metrics import precision_recall_curve
p, r, _ = precision_recall_curve(y_ture, y_pred, pos_label=2)
# p的计算过程
1. precision=tps/(tps+fps) 
2. last_ind = tps.searchsorted(tps[-1])  #首次达到全召回的位置索引
3. 切片方式：根据last_ind反转:sl = slice(last_ind, None, -1)  #取起始位置到last_ind(含)的元素并反转
4. 矩阵连接：补1：np.r_[p[sl],1] #反转之后补1 

# r的计算过程
1. recall = tps/tps[-1] 
2. 补0：np.r_[r[sl], 0]

# 预测值是类别
二分类：正例数->15，反例数->18
fps:  [ 6. 18.] 
tps:  [ 9. 15.]
# precision = tps/(tps+fps) = [9/(9+6), 15/(15+18)] = [0.6, 0.45454545]
# recall = tps/tps[-1] =[9/15, 15/15] = [0.6, 1]
#last_ind=1, sl=slice(1, None, -1), 将起始位置至last_ind位置元素反转
p:  [0.45454545 0.6  1. ] # 反转后补1:np.r_[p[sl],1]
r:  [1.         0.6  0. ] # 反转后补0:np.r_[r[sl], 0] 

# 预测值是概率
y_ture = np.array([1, 1, 2, 2])
y_pred = np.array([0.1, 0.4, 0.35, 0.8])
fps:  [0. 1. 1. 2.]
tps:  [1. 1. 2. 2.]
p:  [0.66666667 0.5  1.  1.] 
r:  [1.         0.5  0.5 0. ]

绘制p-r曲线：

r - x轴
p - y轴

plt.figure()
plt.step(r, p, color='b', alpha=0.2, where='post')
plt.fill_between(r, p, step='post', alpha=0.2, color='b')
plt.xlabel('R')
plt.ylabel('P')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('p_r')
plt.show()