bootstrap方法绘制带有95%置信区间的ROC曲线(python实现)

最新推荐文章于 2024-07-24 19:56:38 发布

matthewsyc

最新推荐文章于 2024-07-24 19:56:38 发布

阅读量249

点赞数 2

文章标签： bootstrap python 算法机器学习

本文链接：https://blog.csdn.net/weixin_42404713/article/details/139826887

版权

无置信区间

from sklearn import metrics
import pylab as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
from sklearn.svm import SVC #以SVC为例
from sklearn.model_selection import train_test_split

# Plot train/test ROC curves for an SVC classifier (no confidence interval).

# Placeholder path: point this at the real CSV file before running.
datafile = '.csv'
data = pd.read_csv(datafile)
# Every column except the last is used as a feature
# (assumes 'label' is the last column -- TODO confirm for your dataset).
x = data.iloc[:, 0:-1]
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

# SVC only exposes predict_proba when it is constructed with probability=True;
# the default SVC() raises an AttributeError on the predict_proba calls below.
svc = SVC(probability=True)
svc.fit(x_train, y_train)
# Column 1 holds the probability of the second class in svc.classes_.
y_train_prob = svc.predict_proba(x_train)[:, 1]
y_test_prob = svc.predict_proba(x_test)[:, 1]

# ROC curves and AUCs, without confidence intervals.
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_test_prob)
fpr2, tpr2, thresholds2 = roc_curve(y_train, y_train_prob)
roc_auc1 = metrics.auc(fpr1, tpr1)
roc_auc2 = metrics.auc(fpr2, tpr2)

plt.plot(fpr1, tpr1, label='test cohort AUC = %0.3f' % roc_auc1)
plt.plot(fpr2, tpr2, label='training cohort AUC = %0.3f' % roc_auc2)
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('1-Specificity ')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()

带有置信区间

采用 bootstrap 方法对测试集进行有放回重采样，每次重采样后重新计算 ROC 曲线。
由于每次重采样得到的 fpr 数组长度不同，需要把 tpr 插值到统一的 fpr 网格（mean_fpr）上，以保证各次结果维度一致、可以逐点统计置信区间。

#带有置信区间
from sklearn import metrics
import pylab as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
from sklearn.svm import SVC #以SVC为例
from sklearn.model_selection import train_test_split

# Load the data, fit the SVC and compute class probabilities for both splits.

# Placeholder path: point this at the real CSV file before running.
datafile = '.csv'
data = pd.read_csv(datafile)
# Every column except the last is used as a feature
# (assumes 'label' is the last column -- TODO confirm for your dataset).
x = data.iloc[:, 0:-1]
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

# SVC only exposes predict_proba when it is constructed with probability=True;
# the default SVC() raises an AttributeError on the predict_proba calls below.
svc = SVC(probability=True)
svc.fit(x_train, y_train)
# Column 1 holds the probability of the second class in svc.classes_.
y_train_prob = svc.predict_proba(x_train)[:, 1]
y_test_prob = svc.predict_proba(x_test)[:, 1]
# ROC curve of the full test set (the point estimate that gets plotted).
# Convert to a plain array so the integer index arrays drawn below select rows
# by position rather than by pandas index label.
y_test = np.array(y_test)
y_score = y_test_prob

fpr1, tpr1, _ = roc_curve(y_test, y_score)

# AUC of the full test set. It is stored under both names: `roc_auc` is the
# original name, and `roc_auc1` is the name the plotting code reads. The
# bootstrap loop below uses its own variable so this value is not overwritten.
roc_auc = metrics.auc(fpr1, tpr1)
roc_auc1 = roc_auc

# Bootstrap: resample the test set with replacement and recompute the metrics.
n_bootstraps = 1000
rng = np.random.RandomState(42)  # fixed seed so the intervals are reproducible
tprs = []
aucs = []
accs = []
# Common FPR grid; every resampled curve is interpolated onto it because
# roc_curve returns a different number of points for each resample.
mean_fpr = np.linspace(0, 1, 100)

for _ in range(n_bootstraps):
    sample = rng.choice(len(y_test), len(y_test), replace=True)
    y_boot = y_test[sample]
    score_boot = y_score[sample]

    # A resample containing a single class has no defined ROC curve/AUC
    # (scikit-learn yields NaN), which would turn the percentiles into NaN.
    if len(np.unique(y_boot)) < 2:
        continue

    fpr, tpr, _ = roc_curve(y_boot, score_boot)
    boot_auc = metrics.auc(fpr, tpr)
    # Thresholds the probability at 0.5 (assumes labels are encoded as 0/1).
    acc = accuracy_score(y_boot, (score_boot > 0.5).astype(int))

    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0  # force every curve to start at the origin
    tprs.append(interp_tpr)
    aucs.append(boot_auc)
    accs.append(acc)


# Percentile-based 95% intervals for the scalar metrics.
auc_lower, auc_upper = np.percentile(aucs, [2.5, 97.5])
accuracy_lower, accuracy_upper = np.percentile(accs, [2.5, 97.5])

# Point-wise 95% band for the interpolated TPR curves
# (normal approximation: mean +/- 1.96 * std), clipped to the valid [0, 1] range.
tprs = np.array(tprs)
mean_tpr = tprs.mean(axis=0)
std_tpr = tprs.std(axis=0)
band_half_width = 1.96 * std_tpr
tprs_upper = np.clip(mean_tpr + band_half_width, None, 1)
tprs_lower = np.clip(mean_tpr - band_half_width, 0, None)
  
# Draw the test-set ROC curve together with its bootstrap confidence band.
plt.figure()
roc_label = f'test AUC = {roc_auc1:.3f} (95% CI: {auc_lower:.3f} - {auc_upper:.3f})'
plt.plot(fpr1, tpr1, color='darkorange', linewidth=2, label=roc_label)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')

# Axis labels and title.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')

# Legend, then render the figure.
plt.legend(loc="lower right")
plt.show()

# Report the bootstrap intervals on stdout.
print(f"AUC 95% CI: ({auc_lower:.3f}, {auc_upper:.3f})")
print(f"Accuracy 95% CI: ({accuracy_lower:.3f}, {accuracy_upper:.3f})")

# One histogram per metric, with the interval bounds drawn as dashed red lines.
# Each entry: (values, bar colour, histogram label, lower, upper, title, x-axis label).
histogram_specs = [
    (aucs, 'blue', 'AUC distribution', auc_lower, auc_upper,
     'AUC Distribution with 95% Confidence Interval', 'AUC Score'),
    (accs, 'green', 'Accuracy distribution', accuracy_lower, accuracy_upper,
     'Accuracy Distribution with 95% Confidence Interval', 'Accuracy Score'),
]
for values, bar_color, hist_label, lower, upper, title, x_label in histogram_specs:
    plt.hist(values, bins=30, alpha=0.7, color=bar_color, label=hist_label)
    plt.axvline(x=lower, color='r', linestyle='--', label=f'95% CI Lower: {lower:.3f}')
    plt.axvline(x=upper, color='r', linestyle='--', label=f'95% CI Upper: {upper:.3f}')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()