无置信区间
from sklearn import metrics
import pylab as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Fit an SVC on a CSV dataset and plot the ROC curves of the training and
# test cohorts (no confidence intervals).

# Path of the input CSV. The value is a placeholder and must be set to a real
# file before the script is run.
datafile = '.csv'
data = pd.read_csv(datafile)

# Features: every column except the last one. Target: the 'label' column.
# NOTE(review): this assumes 'label' is the last column; if it is not, the
# label stays inside x and leaks into the features -- confirm.
x = data.iloc[:, 0:-1]
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

# predict_proba is only available when the estimator is built with
# probability=True; with the default (False) the calls below raise
# AttributeError.
svc = SVC(probability=True)
svc.fit(x_train, y_train)

# Column 1 is the probability of svc.classes_[1].
# NOTE(review): presumably the positive class of a binary label -- confirm.
y_train_prob = svc.predict_proba(x_train)[:, 1]
y_test_prob = svc.predict_proba(x_test)[:, 1]

# Suffix 1 = test cohort, suffix 2 = training cohort.
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_test_prob)
fpr2, tpr2, thresholds2 = roc_curve(y_train, y_train_prob)
roc_auc1 = metrics.auc(fpr1, tpr1)
roc_auc2 = metrics.auc(fpr2, tpr2)

plt.plot(fpr1, tpr1, label='test cohort AUC = %0.3f' % roc_auc1)
plt.plot(fpr2, tpr2, label='training cohort AUC = %0.3f' % roc_auc2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('1-Specificity ')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()
带有置信区间
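- 采用 bootstrap 方法:对测试集做有放回的重抽样(共 1000 次),每次重新计算 ROC、AUC 和准确率,再取 2.5% 和 97.5% 分位数作为 95% 置信区间。
- 每次抽样得到的 fpr 数组长度并不固定,因此把 tpr 插值到统一的 fpr 网格(mean_fpr,0 到 1 共 100 个点)上,以保证各次结果维度一致、可以逐点求均值和标准差。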
from sklearn import metrics
import pylab as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Load the data, fit the SVC and compute the point-estimate ROC curve of the
# test cohort.

# Path of the input CSV. The value is a placeholder and must be set to a real
# file before the script is run.
datafile = '.csv'
data = pd.read_csv(datafile)

# Features: every column except the last one. Target: the 'label' column.
# NOTE(review): this assumes 'label' is the last column; if it is not, the
# label stays inside x and leaks into the features -- confirm.
x = data.iloc[:, 0:-1]
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

# predict_proba is only available when the estimator is built with
# probability=True; with the default (False) the calls below raise
# AttributeError.
svc = SVC(probability=True)
svc.fit(x_train, y_train)

# Column 1 is the probability of svc.classes_[1].
# NOTE(review): presumably the positive class of a binary label -- confirm.
y_train_prob = svc.predict_proba(x_train)[:, 1]
y_test_prob = svc.predict_proba(x_test)[:, 1]

# Convert the test labels to a plain ndarray so they can be indexed by
# position (a pandas Series would be indexed by label instead).
y_test = np.array(y_test)
y_score = y_test_prob

# ROC curve and AUC of the full (non-resampled) test cohort.
fpr1, tpr1, _ = roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr1, tpr1)
# Bootstrap the test cohort to get 95% confidence intervals for AUC and
# accuracy, plus a pointwise band around the ROC curve.
n_bootstraps = 1000
rng = np.random.RandomState(42)  # fixed seed so the resamples are reproducible
tprs = []
aucs = []
accs = []
# Common FPR grid: each resample yields an ROC curve with a different number
# of points, so every curve is interpolated onto this grid before averaging.
mean_fpr = np.linspace(0, 1, 100)

for _ in range(n_bootstraps):
    # Draw len(y_test) positions with replacement.
    sample = rng.choice(len(y_test), len(y_test), replace=True)

    # A resample that contains a single class has no defined ROC curve / AUC
    # and would put NaN into the percentiles below, so it is skipped.
    if np.unique(y_test[sample]).size < 2:
        continue

    fpr, tpr, _ = roc_curve(y_test[sample], y_score[sample])
    # Separate name so the point-estimate AUC computed earlier is not clobbered.
    boot_auc = metrics.auc(fpr, tpr)
    # Accuracy at a fixed 0.5 cut-off.
    # NOTE(review): the int cast assumes the labels are encoded as 0/1 -- confirm.
    acc = accuracy_score(y_test[sample], (y_score[sample] > 0.5).astype(int))

    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0  # force the curve through (0, 0)
    tprs.append(interp_tpr)
    aucs.append(boot_auc)
    accs.append(acc)

if not aucs:
    raise ValueError(
        "No bootstrap resample contained both classes; "
        "cannot estimate confidence intervals."
    )

# Percentile bootstrap intervals (2.5th and 97.5th percentiles).
auc_lower = np.percentile(aucs, 2.5)
auc_upper = np.percentile(aucs, 97.5)
accuracy_lower = np.percentile(accs, 2.5)
accuracy_upper = np.percentile(accs, 97.5)

# Pointwise ROC band: mean +/- 1.96 * std of the interpolated TPRs, clipped
# to the valid range [0, 1].
tprs = np.array(tprs)
mean_tpr = tprs.mean(axis=0)
std_tpr = tprs.std(axis=0)
tprs_upper = np.minimum(mean_tpr + 1.96 * std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - 1.96 * std_tpr, 0)
# Plot the test ROC curve with its bootstrap band, print the intervals, and
# show the bootstrap distributions of AUC and accuracy.

# Point-estimate AUC of the test ROC curve, computed from fpr1/tpr1 right
# here so the legend does not depend on a name defined elsewhere.
test_auc = metrics.auc(fpr1, tpr1)

plt.figure()
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label=f'test AUC = {test_auc:.3f} (95% CI: {auc_lower:.3f} - {auc_upper:.3f})')
# Shaded band between the lower and upper TPR bounds on the common FPR grid.
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

print(f"AUC 95% CI: ({auc_lower:.3f}, {auc_upper:.3f})")
print(f"Accuracy 95% CI: ({accuracy_lower:.3f}, {accuracy_upper:.3f})")

# Histogram of the bootstrap AUC values with the interval bounds marked.
plt.hist(aucs, bins=30, alpha=0.7, color='blue', label='AUC distribution')
plt.axvline(x=auc_lower, color='r', linestyle='--', label=f'95% CI Lower: {auc_lower:.3f}')
plt.axvline(x=auc_upper, color='r', linestyle='--', label=f'95% CI Upper: {auc_upper:.3f}')
plt.title('AUC Distribution with 95% Confidence Interval')
plt.xlabel('AUC Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Histogram of the bootstrap accuracy values with the interval bounds marked.
plt.hist(accs, bins=30, alpha=0.7, color='green', label='Accuracy distribution')
plt.axvline(x=accuracy_lower, color='r', linestyle='--', label=f'95% CI Lower: {accuracy_lower:.3f}')
plt.axvline(x=accuracy_upper, color='r', linestyle='--', label=f'95% CI Upper: {accuracy_upper:.3f}')
plt.title('Accuracy Distribution with 95% Confidence Interval')
plt.xlabel('Accuracy Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()