代码报错处:
#---------------------------------------------------modify the parameter------------------------------------------------
# Search over max_depth for a random forest using 5-fold cross-validation.
# NOTE: this is the failing version. The fit() call raises KeyError when
# X_train / Y_train are pandas DataFrames.
range_m = np.logspace(2, 6, 5, base=2).astype(int)  # candidate depths: 4, 8, 16, 32, 64
best_m = 0
min_scores = 10000
scores_m = []
for m in range_m:
    kf = KFold(n_splits=5, shuffle=True)
    clf = RandomForestClassifier(n_estimators=1000, max_depth=m, random_state=4)
    scores = 0
    for train_index, test_index in kf.split(X_train):
        # print("Train:", train_index, "Validation:", test_index)
        # On a DataFrame, X_train[train_index] looks the integers up as
        # column labels rather than row positions, hence the KeyError.
        clf.fit(X_train[train_index], Y_train[train_index])
        # pred = clf.predict(X_train[test_index])
        # scores += log_loss(Y_train[test_index], pred) / 5
    # scores_m.append(scores)
    # if scores < min_scores:
    #     min_scores = scores
    #     best_m = m
#
# print(best_m, min_scores)  # print the best max_depth and its loss
# print(scores_m)  # print the loss obtained for each candidate max_depth
错误提示:
KeyError: "None of [Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n ...\n 826, 828, 829, 830, 831, 833, 834, 835, 836, 837],\n dtype='int64', length=670)] are in the [columns]"
解决方案:
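很明显是索引方式出了问题:X_train 是 DataFrame 时,X_train[train_index] 会把整数数组当作列名去查找,因此报 KeyError。DataFrame 有两种常用的索引方式:
.iloc[index, :],其中 index 是行的整数位置;
.loc[:, '列名'],其中引号中为列名。
KFold.split 返回的是整数位置索引,因此这里选用 .iloc: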
# Select rows by position; .iloc[train_index] works for both a DataFrame and
# a Series, whereas .iloc[train_index, :] raises IndexingError on a Series.
clf.fit(X_train.iloc[train_index], Y_train.iloc[train_index])