KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10,\n ...\n 907, 908, 910, 911, 912, 914, 916, 917, 920, 923],\n dtype='int64', length=739)] are in the [columns]"
目录
问题:
# Runs 5-fold cross-validation of the first step of a fitted pipeline: for each
# fold it fits the classifier, draws the ROC curve, records AUC/accuracy, prints
# a classification report and plots confusion matrices; afterwards it plots the
# mean ROC curve with a confidence interval and saves the figure.
# NOTE(review): indentation was lost when this snippet was pasted; the `for`
# body presumably extends down to the `validation_test_result_pd` update.
validation_test_result_pd = pd.DataFrame()
# NOTE(review): X and y are used as-is; the KeyError traceback goes through
# pandas' DataFrame.__getitem__, so X is presumably a DataFrame here, which is
# what makes `X[train]` fail in the loop below.
X = features_train
y = target_train
n_samples, n_features = X.shape
# Classification and ROC analysis
# Run classifier with cross-validation and plot ROC curves
# cv = StratifiedKFold(n_splits=5)
cv =KFold(n_splits=5,shuffle=True,random_state=42)
# classifier = LogisticRegression(class_weight = "balanced", penalty = "l2")
# 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 5
# Estimator object of the first step in the fitted pipeline.
classifier = pipeline_optimizer.fitted_pipeline_.steps[0][1]
# classifier = XGBClassifier()
# Per-fold accumulators: interpolated TPR curves, AUCs and accuracies.
tprs = []
aucs = []
accs = []
auc_mean_list = []
auc_std_list = []
acc_mean_list = []
acc_std_list = []
# Common FPR grid (100 points) onto which every fold's ROC curve is interpolated.
mean_fpr = np.linspace(0, 1, 100)
rep_folds = []
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
#print(test)
#print(type(test))
#print(train.shape)
# NOTE(review): `train` / `test` are positional index arrays from cv.split.
# On a DataFrame, `X[train]` is a column-label lookup, which raises
# KeyError "None of [...] are in the [columns]".
classifier.fit(X[train], y[train])
viz = plot_roc_curve(classifier, X[test], y[test],
name='ROC curve of fold {}'.format(i),
alpha=0.2, lw=2, ax=ax)
#print(viz)
# Resample this fold's ROC curve onto the common grid and force it to start at 0.
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0
tprs.append(interp_tpr)
aucs.append(viz.roc_auc)
y_pred = classifier.predict(X[test])
accs.append(accuracy_score(y[test], y_pred))
print(str(i))
print('---------------classification report of fold %d-------------------' % (i))
# Same prediction as the one computed above for the accuracy score (recomputed).
y_pred = classifier.predict(X[test])
print(classification_report(y[test], y_pred))
#fold confusion matrix
rep_folds.append(classification_report(y[test], y_pred, output_dict=True,digits=3))
# Compute confusion matrix
cnf_matrix = confusion_matrix(y[test], y_pred)
np.set_printoptions(precision=2)
#np.set_printoptions(precision=3)
# Plot non-normalized confusion matrix
plt.figure()
# radiomics
title_raidomics='Radiomics plot of fold_'+str(i)
plot_radiomics(X[test],y[test],title_raidomics,classifier)
# NOTE(review): `figsize(6,4)` is a call to a name `figsize`, not the keyword
# argument `figsize=(6, 4)` -- likely a typo; confirm.
plt.figure(figsize(6,4))
#confustion matrix
plot_confusion_matrix(cnf_matrix, classes=[0,1],
title='Confusion matrix, without normalization of fold %d ' % (i))
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True,
title='Normalized confusion matrix of fold %d ' % (i))
# Track the highest fold AUC seen so far.
# NOTE(review): `model = classifier` binds the same estimator object that is
# refit on every fold, so `model` presumably does not keep a per-fold snapshot.
if i == 0:
best = viz.roc_auc
model = classifier
elif viz.roc_auc>best:
best = viz.roc_auc
model = classifier
else:
pass
input_len = len(test)
# Per-sample table for this fold: true label, predicted label and the
# probability taken from column 1 of predict_proba, tagged with the fold name.
to_dca = {'y_true':y[test],'y_pred':y_pred,'probailities':classifier.predict_proba(X[test])[:,1]}
dca_pd = pd.DataFrame.from_dict(to_dca)
dca_pd['train_ornot'] = input_len*["fold_" + str(i)]
#fold_pd = pd.DataFrame(X[test],columns = features_train.columns.tolist())
#dca_pd = pd.concat([dca_pd,fold_pd],axis = 1)
# Attach the rows of df_in that belong to this fold's test samples.
# NOTE(review): target_train's index values are passed to .iloc, i.e. used as
# row positions in df_in -- assumes labels and positions coincide; confirm.
train_temp_pd = df_in.iloc[target_train.index.tolist()].reset_index(drop = True)
index_pd = train_temp_pd.iloc[test].reset_index(drop = True)
dca_pd = pd.concat([dca_pd,index_pd],axis = 1)
#fold_value = 'fold'+str(i)
#dca_pd['validation_fold_or_test'] = [fold_value]*index_pd.shape[0]
if i == 0:
validation_test_result_pd = dca_pd
else:
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the documented replacement.
validation_test_result_pd = validation_test_result_pd.append(dca_pd,ignore_index = True)
# Diagonal reference line, then the mean ROC curve across folds.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='red',
label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
from sklearn.metrics import auc
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
mean_acc = np.mean(accs)
std_acc = np.std(accs)
import math
# NOTE(review): compares the number of fold AUCs with the number of interpolated
# points (mean_fpr has 100) -- looks like a leftover debug print.
print(len(aucs) == len(mean_tpr))
# Standard error of the fold AUCs; the 5 matches n_splits above.
std_error = std_auc / math.sqrt(5)
# cv = 5
# so t-test table search and the result is 2.776, cause freedom is 5-1
# ci = 2.776 * std_error
# NOTE(review): the comment above derives 2.776 (4 degrees of freedom) but the
# code uses 2.262, the 95% two-sided t value for 9 degrees of freedom --
# confirm which one is intended.
ci = 2.262 * std_error
# print(ci)
lower_bound = mean_auc - ci
upper_bound = mean_auc + ci
# ax.plot(mean_fpr, mean_tpr, color='b',
# label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
# lw=2, alpha=.8)
#https://stats.stackexchange.com/questions/100159/confidence-intervals-for-auc-using-cross-validation
ax.plot(mean_fpr, mean_tpr, color='blue',
label=r'Mean ROC (AUC CI = [%0.3f,%0.3f])' % (lower_bound, upper_bound),
lw=2.5, alpha=.8)
# ax.plot(mean_fpr, mean_tpr, color='b',
# label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
# lw=2, alpha=.8)
# Band of +/- 1 standard deviation of the interpolated TPRs, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
# cross validation scores output
print(aucs)
auc_mean_list.append(mean_auc)
auc_std_list.append(std_auc)
print('{} cv auc is :'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]), str(mean_auc), str(std_auc))
# cross validation scores output
print(accs)
acc_mean_list.append(mean_acc)
acc_std_list.append(std_acc)
print('{} cv acc is :'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]), str(mean_acc), str(std_acc))
# ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
# label=r'$\pm$ 1 std. dev.')
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
title="ROC Curve with 5fold CV")
ax.legend(loc="lower right")
# plt.show()
# Save the figure under the name of the first pipeline step, then display it.
fig.savefig('{}_5fold.png'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]),bbox_inches='tight')
plt.show()
解决:
添加如下代码段:
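原因:cv.split 返回的 train、test 是按位置排列的整数下标数组,而对 DataFrame 直接写 X[train] 会被当作“列标签”去查找,因此抛出 KeyError。去除原始数据的索引并转化为 numpy 数组之后,X[train]、y[train] 就会按位置取行: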
# Drop the original index and convert to NumPy arrays so that the positional
# index arrays yielded by cv.split can be used directly as X[train] / y[train].
X = features_train.reset_index(drop=True).values
y = target_train.reset_index(drop=True).values
或者保留 DataFrame / Series 不变,在划分训练集和测试集时改用 .iloc 按位置索引:
# .iloc selects rows by integer position. train_index / test_index are the
# position arrays yielded by cv.split (named `train` / `test` in the loop above).
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
完整错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-57-3fdad9652b83> in <module>
36 #print(type(test))
37 #print(train.shape)
---> 38 classifier.fit(X[train], y[train])
39 viz = plot_roc_curve(classifier, X[test], y[test],
40 name='ROC curve of fold {}'.format(i),
D:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3028 if is_iterator(key):
3029 key = list(key)
-> 3030 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
3031
3032 # take() does not accept boolean indexers
D:\anaconda\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1264 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1265
-> 1266 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
1267 return keyarr, indexer
1268
D:\anaconda\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1306 if missing == len(indexer):
1307 axis_name = self.obj._get_axis_name(axis)
-> 1308 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1309
1310 ax = self.obj._get_axis(axis)
KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10,\n ...\n 907, 908, 910, 911, 912, 914, 916, 917, 920, 923],\n dtype='int64', length=739)] are in the [columns]"