KeyError: "None of [Int64Index([...],\n dtype='int64', length=739)] are in the [columns]"

最新推荐文章于 2023-10-29 19:24:00 发布

Data+Science+Insight

最新推荐文章于 2023-10-29 19:24:00 发布

阅读量1.3w

点赞数 19

文章标签：机器学习人工智能数据挖掘自然语言处理深度学习

本文链接：https://blog.csdn.net/zhongkeyuanchongqing/article/details/120796789

版权

KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10,\n ...\n 907, 908, 910, 911, 912, 914, 916, 917, 920, 923],\n dtype='int64', length=739)] are in the [columns]"

问题：

解决：

完整错误：

问题：


# Setup for a 5-fold cross-validated ROC analysis.
# Every name not assigned here (pd, np, plt, KFold, features_train,
# target_train, pipeline_optimizer) must already exist in the session.

# Accumulates the per-fold prediction tables built inside the loop.
validation_test_result_pd = pd.DataFrame()
# NOTE: X is still a pandas DataFrame here (the traceback goes through
# pandas' DataFrame.__getitem__), which is what later makes X[train] fail.
X = features_train
# presumably a pandas Series (its .index is used later) -- confirm
y = target_train

n_samples, n_features = X.shape

# Classification and ROC analysis
# Run classifier with cross-validation and plot ROC curves

# cv = StratifiedKFold(n_splits=5)
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
cv =KFold(n_splits=5,shuffle=True,random_state=42)

# classifier = LogisticRegression(class_weight = "balanced", penalty = "l2")
# 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 5
# Estimator object of the first step of the already-fitted pipeline
# (steps[0] is a (name, estimator) pair; index 1 is the estimator).
classifier = pipeline_optimizer.fitted_pipeline_.steps[0][1]
# classifier = XGBClassifier()

# Per-fold results: interpolated TPR curves and AUC values.
tprs = []
aucs = []

# Per-fold accuracies plus the summary lists filled after the loop.
accs = []
auc_mean_list = []
auc_std_list = []
acc_mean_list = []
acc_std_list = []
# Common FPR grid (100 points on [0, 1]) onto which each fold's ROC is interpolated.
mean_fpr = np.linspace(0, 1, 100)

# Per-fold classification reports in dict form.
rep_folds = []

# Single figure/axes shared by all fold ROC curves and the mean curve.
fig, ax = plt.subplots()
# Per-fold loop: fit on the training rows, draw the fold's ROC curve, collect
# AUC/accuracy/reports, plot confusion matrices, and build a per-fold table of
# predictions that is accumulated into validation_test_result_pd.
# `train` and `test` are arrays of integer row positions produced by cv.split.
for i, (train, test) in enumerate(cv.split(X, y)):
    #print(test)
    #print(type(test))
    #print(train.shape)
    # THIS LINE RAISES THE KeyError: X is a DataFrame, so X[train] looks the
    # integers up as *column labels* instead of selecting rows by position.
    # Use X.iloc[train] / y.iloc[train], or convert X and y to NumPy arrays first.
    classifier.fit(X[train], y[train])
    # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; newer
    # versions provide RocCurveDisplay.from_estimator -- confirm installed version.
    viz = plot_roc_curve(classifier, X[test], y[test],
                         name='ROC curve of fold {}'.format(i),
                         alpha=0.2, lw=2, ax=ax)
    

    #print(viz)
    # Resample this fold's ROC onto the shared FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    # Force the curve to start at (0, 0).
    interp_tpr[0] = 0.0
    
    
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
    y_pred = classifier.predict(X[test])
    accs.append(accuracy_score(y[test], y_pred))
    
    print(str(i))
    print('---------------classification report of fold %d-------------------' % (i))
    
    # NOTE(review): same predict call as a few lines above; y_pred is recomputed.
    y_pred = classifier.predict(X[test])
    print(classification_report(y[test], y_pred))
    # Keep the fold's classification report as a dict for later use.
    rep_folds.append(classification_report(y[test], y_pred, output_dict=True,digits=3))
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y[test], y_pred)
    np.set_printoptions(precision=2)
    #np.set_printoptions(precision=3)

    # Plot non-normalized confusion matrix
    plt.figure()
    # radiomics
    # plot_radiomics is a helper defined outside this snippet.
    title_raidomics='Radiomics plot of fold_'+str(i)
    plot_radiomics(X[test],y[test],title_raidomics,classifier)
    # NOTE(review): `figsize` is called here as a function and is not defined in
    # this snippet; possibly plt.figure(figsize=(6, 4)) was intended -- confirm.
    plt.figure(figsize(6,4))
    # confusion matrix (plot_confusion_matrix here takes a precomputed matrix,
    # so it is presumably a user-defined helper -- confirm)
    plot_confusion_matrix(cnf_matrix, classes=[0,1],
                          title='Confusion matrix, without normalization of fold %d ' % (i))

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True,
                          title='Normalized confusion matrix of fold %d ' % (i))
    
    # Track the highest fold AUC seen so far.
    # NOTE(review): `model = classifier` binds the same estimator object that is
    # re-fitted on every iteration, so `model` is not a snapshot of the best
    # fold's fit -- copy the estimator if a snapshot is needed.
    if i == 0:
        best = viz.roc_auc
        model = classifier
    elif viz.roc_auc>best:
        best = viz.roc_auc
        model = classifier
    else:
        pass
    
    # Per-fold table: true labels, predicted labels, positive-class probability
    # (column 1 of predict_proba), and a fold tag.
    input_len = len(test)
    to_dca = {'y_true':y[test],'y_pred':y_pred,'probailities':classifier.predict_proba(X[test])[:,1]}
    dca_pd = pd.DataFrame.from_dict(to_dca)
    dca_pd['train_ornot'] = input_len*["fold_" + str(i)]
    
    
    #fold_pd = pd.DataFrame(X[test],columns = features_train.columns.tolist())
    #dca_pd = pd.concat([dca_pd,fold_pd],axis = 1)
    
    # Pull the rows of df_in (defined outside this snippet) that correspond to
    # the training target's index, then pick this fold's test rows by position
    # and attach them column-wise to the prediction table.
    train_temp_pd = df_in.iloc[target_train.index.tolist()].reset_index(drop = True)
    index_pd = train_temp_pd.iloc[test].reset_index(drop = True)
    dca_pd = pd.concat([dca_pd,index_pd],axis = 1)
    
    #fold_value = 'fold'+str(i)
    #dca_pd['validation_fold_or_test'] = [fold_value]*index_pd.shape[0]
    
    # Stack the fold tables into one result frame.
    # NOTE(review): DataFrame.append was removed in pandas 2.0 (pd.concat is the
    # replacement) -- confirm installed version.
    if i == 0:
        validation_test_result_pd = dca_pd
    else:
        validation_test_result_pd = validation_test_result_pd.append(dca_pd,ignore_index = True)
    
# After the folds: draw the chance diagonal, average the fold ROC curves,
# derive an interval for the AUC, print the CV scores, and save/show the figure.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='red',
        label='Chance', alpha=.8)

# Point-wise mean of the interpolated fold curves; force the end point to (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
from sklearn.metrics import auc
# AUC of the averaged curve and spread of the per-fold AUCs.
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

mean_acc = np.mean(accs)
std_acc = np.std(accs)
import math
# NOTE(review): compares the number of folds with the number of FPR grid
# points (100), so this prints False for 5 folds -- looks like leftover debugging.
print(len(aucs) == len(mean_tpr))

# Standard error of the fold AUCs; the 5 is hard-coded to match n_splits=5.
std_error = std_auc / math.sqrt(5)
# cv = 5
# so t-test table search and the result is 2.776, cause freedom is 5-1
# ci =  2.776 * std_error
# NOTE(review): 2.262 is the two-sided 95% t critical value for 9 degrees of
# freedom (10 folds), while the comment above derives 2.776 for 5 folds --
# confirm which multiplier is intended.
ci = 2.262 * std_error
# print(ci)
lower_bound = mean_auc - ci
upper_bound = mean_auc + ci

# ax.plot(mean_fpr, mean_tpr, color='b',
#         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#         lw=2, alpha=.8)
#https://stats.stackexchange.com/questions/100159/confidence-intervals-for-auc-using-cross-validation
# Mean ROC curve, labelled with the AUC interval computed above.
ax.plot(mean_fpr, mean_tpr, color='blue',
        label=r'Mean ROC (AUC CI = [%0.3f,%0.3f])' % (lower_bound, upper_bound),
        lw=2.5, alpha=.8)
# ax.plot(mean_fpr, mean_tpr, color='b',
#         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#         lw=2, alpha=.8)

# +/- 1 standard deviation band around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

# cross validation scores output
print(aucs)
auc_mean_list.append(mean_auc)
auc_std_list.append(std_auc)
# steps[0][0] is the name of the pipeline's first step, used as the label.
print('{} cv auc is :'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]), str(mean_auc), str(std_auc))

# cross validation scores output
print(accs)
acc_mean_list.append(mean_acc)
acc_std_list.append(std_acc)
print('{} cv acc is :'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]), str(mean_acc), str(std_acc))

# ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
#                 label=r'$\pm$ 1 std. dev.')
# Shade the +/- 1 std band (no legend entry).
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="ROC Curve with 5fold CV")
ax.legend(loc="lower right")
# plt.show()
# Save the ROC figure as "<first step name>_5fold.png" in the working directory.
fig.savefig('{}_5fold.png'.format(pipeline_optimizer.fitted_pipeline_.steps[0][0]),bbox_inches='tight')
plt.show()

解决：

添加如下代码段：

去除原始数据的索引之后转化为numpy数组（cv.split 返回的是整数位置索引，而对 DataFrame 使用 X[train] 会把这些整数当作列标签去查找，因此触发 KeyError；numpy 数组则直接按位置取行）；

# Fix 1: drop the original index and convert to NumPy arrays, so that
# X[train] / y[train] select rows by integer position instead of by label.
X = features_train.reset_index(drop=True).values
y = target_train.reset_index(drop=True).values

或者保留 DataFrame/Series 不变，在训练时使用 .iloc 按位置取行：

 # Fix 2: keep X and y as pandas objects and select rows by position with .iloc.
 # train_index / test_index are the integer arrays yielded by cv.split
 # (named `train` / `test` in the loop above).
 X_train, X_test = X.iloc[train_index], X.iloc[test_index]

 y_train, y_test = y.iloc[train_index], y.iloc[test_index]

完整错误：

---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-57-3fdad9652b83> in <module>
36 #print(type(test))
37 #print(train.shape)
---> 38 classifier.fit(X[train], y[train])
39 viz = plot_roc_curve(classifier, X[test], y[test],
40 name='ROC curve of fold {}'.format(i),

D:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3028 if is_iterator(key):
3029 key = list(key)
-> 3030 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
3031
3032 # take() does not accept boolean indexers

D:\anaconda\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1264 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1265
-> 1266 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
1267 return keyarr, indexer
1268

D:\anaconda\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1306 if missing == len(indexer):
1307 axis_name = self.obj._get_axis_name(axis)
-> 1308 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1309
1310 ax = self.obj._get_axis(axis)

KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10,\n ...\n 907, 908, 910, 911, 912, 914, 916, 917, 920, 923],\n dtype='int64', length=739)] are in the [columns]"