使用交叉验证训练 XGBoost 时，每一折只用到了部分样本，而 `get_score` 只会返回在该折模型中实际参与过分裂的特征，因此各折得到的特征集合可能不一致；直接把各折的特征重要性拼接并导出为 CSV 文件时，就会出现长度对不上等问题。解决办法是：先创建一个包含全部特征、得分初始化为 0 的字典，再用每一折返回的得分去更新它。
改进代码:
# K-fold XGBoost training that also exports per-fold feature importances.
#
# Expects these names to be defined earlier in the file: train_x / test_x
# (indexed with .iloc below, so presumably pandas DataFrames), train_y,
# n_fold and seed, plus np, pd, xgb, StratifiedKFold and mean_absolute_error.
kf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
test_predict = np.zeros(test_x.shape[0])  # running mean of the per-fold test predictions
cv_scores = []  # validation MAE of every fold
fea_scores = pd.DataFrame()  # one row per feature, one 'score_<k>' column per fold

# The test matrix does not depend on the fold, so build it once.
test_matrix = xgb.DMatrix(test_x)

for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
    print('************************************ {} ************************************'.format(str(i+1)))

    trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
    # NOTE(review): train_y is indexed with positional indices through plain
    # []; that is only correct for a NumPy array or a Series with a default
    # RangeIndex -- confirm against the caller.
    trn_y, val_y = train_y[train_index], train_y[valid_index]

    xgb_params = {
        # ...... (hyper-parameters were elided in the original snippet;
        # fill in the real XGBoost parameters here)
    }

    train_matrix = xgb.DMatrix(trn_x, label=trn_y)
    valid_matrix = xgb.DMatrix(val_x, label=val_y)
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    model = xgb.train(
        xgb_params,
        train_matrix,
        num_boost_round=8000,
        evals=watchlist,
        verbose_eval=400,
        early_stopping_rounds=80,
    )

    # NOTE(review): whether Booster.predict uses best_iteration after early
    # stopping depends on the installed XGBoost version -- confirm.
    val_pred = model.predict(valid_matrix)
    test_pred = model.predict(test_matrix)
    test_predict += test_pred / kf.n_splits

    score = mean_absolute_error(val_y, val_pred)
    cv_scores.append(score)
    print(cv_scores)

    # get_score() may omit features, so the returned dict can be shorter than
    # the full feature list and differ between folds.
    fea_ = model.get_score(importance_type='gain')
    fea_name = train_matrix.feature_names

    # Start every feature at 0, then overwrite the ones the model reported.
    # Using update() instead of a `for feature, score in ...` loop avoids
    # rebinding `score`, which holds this fold's MAE.
    all_feature_scores = dict.fromkeys(fea_name, 0)
    all_feature_scores.update(fea_)

    fea_score = pd.DataFrame({
        'fea_name': fea_name,
        'score_' + str(i + 1): list(all_feature_scores.values()),
    })

    # Collect the per-fold importance columns side by side so features can be
    # compared across folds.
    if i == 0:
        fea_scores = fea_score
    else:
        fea_scores = pd.concat([fea_scores, fea_score['score_' + str(i + 1)]], axis=1)

# Optional: add the mean importance across folds (numeric_only skips the
# string 'fea_name' column).
# fea_scores['average_score'] = fea_scores.mean(axis=1, numeric_only=True)
fea_scores.to_csv("XGB.csv", index=False)