1. Log-transform the labels
import numpy as np

train_y_ln = np.log(train_y + 1)  # equivalently np.log1p(train_y)
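Predictions made against the log-scale labels have to be mapped back to the price scale before scoring or submission; a minimal sketch, where pred_ln is an assumed name for the model's raw output:

pred = np.expm1(pred_ln)  # inverse of np.log(y + 1); pred_ln is an assumed variable name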
2. Grid search
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [12, 14, 16],
              'num_leaves': [66, 68, 70, 72, 74, 76],
              'subsample': [0.6, 0.8],
              'feature_fraction': [0.6, 0.8]}
model = lgb.LGBMRegressor(num_leaves=74,
                          max_depth=14,
                          learning_rate=0.1,
                          n_estimators=15000,
                          subsample=0.6,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=seed,
                          metric=None)
GS = GridSearchCV(model, param_grid, scoring='neg_median_absolute_error', cv=5)  # grid search
GS.fit(train_df_grid, train_df['price'])
GS.best_params_  # show the best parameter combination found
GS.best_score_   # best cross-validated score for those parameters (negative median absolute error, not accuracy)
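After the search, the refit best model can be used for prediction directly; a minimal sketch, where test_df_grid is an assumed name for the matching test-set frame:

best_model = GS.best_estimator_          # GridSearchCV refits on the full training data by default
pred = best_model.predict(test_df_grid)  # test_df_grid is an assumed test-set frame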
3. Wrapping cross-validated prediction into a function
def cv_predict(model, train, test, predict_name, id_name, cv=5):
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import KFold

    oof = np.zeros(len(train))
    x_train = train.drop([predict_name, id_name], axis=1)
    y_train = train[predict_name]
    x_test = test.drop([id_name], axis=1)
    prediction = test[[id_name]].copy()
    prediction[predict_name] = 0
    df_importance_list = []
    # shuffle must be True when random_state is set
    kfold = KFold(n_splits=cv, shuffle=True, random_state=2000)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(x_train)):
        X_train = x_train.iloc[trn_idx]
        Y_train = y_train.iloc[trn_idx]
        X_val = x_train.iloc[val_idx]
        Y_val = y_train.iloc[val_idx]
        print('\nFold_{} Training ================================\n'.format(fold_id + 1))
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['valid'],
                              eval_set=[(X_val, Y_val)],
                              verbose=500,
                              eval_metric='mae',
                              early_stopping_rounds=8000)
        # predict the validation fold at the best iteration (LightGBM uses num_iteration, not ntree_end)
        oof[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        # predict the test set at the best iteration, then average the fold predictions
        pred_test = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration_)
        prediction[predict_name] += pred_test / cv
        df_importance = pd.DataFrame({'column': list(x_train.columns),
                                      'importance': lgb_model.feature_importances_})
        df_importance_list.append(df_importance)
    print("CV score: {:<8.8f}".format(mean_absolute_error(train[predict_name].values, oof)))
    return df_importance_list, prediction, oof
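A minimal usage sketch of the wrapper; 'price' matches the target used above, while 'SaleID' and test_df are assumed names:

df_importance_list, prediction, oof = cv_predict(
    model, train_df, test_df, predict_name='price', id_name='SaleID', cv=5)
# 'SaleID' is an assumed id column; test_df is the assumed test frame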
4. CatBoost
from catboost import Pool, CatBoostRegressor
cb_params = {
    'n_estimators': 250000,
    'loss_function': 'RMSE',
    'eval_metric': 'MAE',
    'learning_rate': 0.1,
    'depth': 8,
    'use_best_model': True,
    'subsample': 0.6,
    'bootstrap_type': 'Bernoulli',
    # 'reg_lambda': 3
}
model_cb = CatBoostRegressor(**cb_params)
By comparison, CatBoost achieved a better score than LightGBM, but its training time was considerably longer.
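Note that cb_params sets use_best_model=True, which requires an eval_set at fit time. A minimal fitting sketch, assuming a simple hold-out split (x_train, y_train and x_test as in the wrapper above; the split size and early-stopping value are assumptions, not from the original):

from sklearn.model_selection import train_test_split

# hold out 20% as the eval set required by use_best_model=True (assumed split)
X_tr, X_val, y_tr, y_val = train_test_split(x_train, y_train,
                                            test_size=0.2, random_state=2000)
model_cb.fit(X_tr, y_tr,
             eval_set=(X_val, y_val),
             early_stopping_rounds=500,  # assumed value
             verbose=1000)
pred_cb = model_cb.predict(x_test)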
5. Bayesian hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
from bayes_opt import BayesianOptimization

def rf_cv(num_leaves, max_depth, subsample, min_child_samples, feature_fraction, reg_alpha, reg_lambda):
    val = cross_val_score(
        lgb.LGBMRegressor(objective='regression',
                          n_estimators=100,
                          num_leaves=int(num_leaves),
                          max_depth=int(max_depth),
                          subsample=subsample,
                          min_child_samples=int(min_child_samples),
                          feature_fraction=feature_fraction,  # was accepted but never passed through
                          reg_alpha=reg_alpha,
                          reg_lambda=reg_lambda),
        X=train_df_res, y=train_df['price'], verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error)
    ).mean()
    return -val  # BayesianOptimization maximizes, so negate the MAE
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (50, 85),
        'max_depth': (10, 30),
        'subsample': (0.4, 1),
        'min_child_samples': (10, 30),
        'feature_fraction': (0.5, 1),
        'reg_alpha': (0, 1),
        'reg_lambda': (0, 1),
    }
)
rf_bo.maximize()
rf_bo.max["params"]
The results were mediocre; it is unclear whether the search needs a much larger n_estimators (only 100 is used above, versus 15000 in the final model) before it shows a real benefit.
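A minimal follow-up sketch (assumed, not in the original): refit with the best parameters found, casting the integer-valued ones back to int and using the larger n_estimators from the final model:

best = rf_bo.max["params"]
final_model = lgb.LGBMRegressor(objective='regression',
                                n_estimators=15000,  # assumed final value
                                num_leaves=int(best['num_leaves']),
                                max_depth=int(best['max_depth']),
                                subsample=best['subsample'],
                                min_child_samples=int(best['min_child_samples']),
                                feature_fraction=best['feature_fraction'],
                                reg_alpha=best['reg_alpha'],
                                reg_lambda=best['reg_lambda'])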
6. Embedded feature selection
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt

threshold = [i for i in range(16)]
score = []
for i in threshold:
    # keep only the features whose importance is at least i, then score by 5-fold CV
    X_embedded = SelectFromModel(model, threshold=i).fit_transform(train_df_c, train_df['price'])
    once = cross_val_score(model, X_embedded, train_df['price'], cv=5).mean()
    score.append(once)
plt.plot(threshold, score)
plt.show()
A threshold of 9 gives the best cross-validated score.
selector = SelectFromModel(model, threshold=9).fit(train_df_c, train_df['price'])
X_embedded = selector.transform(train_df_c)  # .fit returns the selector itself, so transform explicitly
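To apply the same selection to the test set, the chosen column names can be recovered from the fitted selector; a minimal sketch, where test_df_c is an assumed name for the matching test frame:

selected_cols = train_df_c.columns[selector.get_support()]  # boolean mask of kept features
X_test_embedded = test_df_c[selected_cols]                  # test_df_c is an assumed test frame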