Task 4: Modeling and Parameter Tuning

1. Log-transform the labels

train_y_ln = np.log(train_y + 1)  # equivalent to np.log1p(train_y); compresses the long right tail of price
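Anything the model predicts on this log scale has to be mapped back before scoring or submission. A minimal sketch, where pred_ln is a hypothetical prediction made against train_y_ln:

pred_price = np.expm1(pred_ln)  # exp(x) - 1, the inverse of np.log(y + 1)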

2. Grid search

param_grid = {'max_depth': [12, 14, 16], 'num_leaves': [66, 68, 70, 72, 74, 76], 'subsample': [0.6, 0.8], 'feature_fraction': [0.6, 0.8]}


model = lgb.LGBMRegressor(num_leaves=74,
                          max_depth=14,
                          learning_rate=0.1,
                          n_estimators=15000,
                          subsample=0.6,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=seed,
                          metric=None
                          )
GS = GridSearchCV(model, param_grid, scoring='neg_median_absolute_error', cv=5)  # grid search
GS.fit(train_df_grid, train_df['price'])

GS.best_params_  # the best parameter combination found

GS.best_score_   # the cross-validated score (negative median absolute error) under those parameters
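Since GridSearchCV defaults to refit=True, the best parameter combination is automatically retrained on the full training set and can be used directly. A brief usage sketch; test_df_grid is a hypothetical test frame with the same columns as train_df_grid:

best_model = GS.best_estimator_           # already refit on all of train_df_grid
pred = best_model.predict(test_df_grid)   # hypothetical test features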

3. Wrapping the prediction routine

def cv_predict(model, train, test, predict_name, id_name, cv=5):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_absolute_error

    oof = np.zeros(len(train))  # out-of-fold predictions on the train set
    x_train = train.drop([predict_name, id_name], axis=1)
    y_train = train[predict_name]
    x_test = test.drop([id_name], axis=1)

    prediction = test[[id_name]].copy()
    prediction[predict_name] = 0
    df_importance_list = []
    # random_state only takes effect with shuffle=True
    kfold = KFold(n_splits=cv, shuffle=True, random_state=2000)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(x_train)):
        X_train = x_train.iloc[trn_idx]
        Y_train = y_train.iloc[trn_idx]
        X_val = x_train.iloc[val_idx]
        Y_val = y_train.iloc[val_idx]
        print('\nFold_{} Training ================================\n'.format(fold_id + 1))
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['valid'],
                              eval_set=[(X_val, Y_val)],
                              verbose=500,
                              eval_metric='mae',
                              early_stopping_rounds=8000)

        # predict the held-out fold with the best iteration found by early stopping
        oof[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)
        # each fold's best model predicts the test set; average over the cv folds
        pred_test = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration_)
        prediction[predict_name] += pred_test / cv
        df_importance = pd.DataFrame({'column': list(x_train.columns),
                                      'importance': lgb_model.feature_importances_})
        df_importance_list.append(df_importance)
    print("CV score: {:<8.8f}".format(mean_absolute_error(train[predict_name].values, oof)))
    return df_importance_list, prediction, oof
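A usage sketch for the wrapper, assuming train_df/test_df with a 'price' target and a 'SaleID' ID column (both column names are assumptions):

df_importance_list, prediction, oof = cv_predict(
    model, train_df, test_df, predict_name='price', id_name='SaleID', cv=5)

# rank features by mean importance across folds
df_imp = pd.concat(df_importance_list).groupby('column')['importance'].mean()
print(df_imp.sort_values(ascending=False).head(10))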

4. CatBoost

from catboost import Pool, CatBoostRegressor
cb_params = {
    'n_estimators': 250000,
    'loss_function': 'RMSE',
    'eval_metric': 'MAE',
    'learning_rate': 0.1,
    'depth': 8,
    'use_best_model': True,
    'subsample': 0.6,
    'bootstrap_type': 'Bernoulli',
#     'reg_lambda': 3
}
model_cb = CatBoostRegressor(**cb_params)

By comparison, CatBoost achieved better results, but its training time is considerably longer.
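Note that use_best_model=True only takes effect when a validation set is passed to fit. A minimal sketch reusing the X_train/Y_train/X_val/Y_val splits from the fold loop above (the early-stopping patience of 500 is an assumption):

model_cb.fit(X_train, Y_train,
             eval_set=(X_val, Y_val),    # required by use_best_model
             early_stopping_rounds=500,  # assumed patience; stops long before 250000 trees
             verbose=1000)
pred_val_cb = model_cb.predict(X_val)    # predict() uses the best iteration automatically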

5. Bayesian parameter tuning

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
from bayes_opt import BayesianOptimization


def rf_cv(num_leaves, max_depth, subsample, min_child_samples, feature_fraction, reg_alpha, reg_lambda):
    val = cross_val_score(
        lgb.LGBMRegressor(objective='regression',
            n_estimators=100,
            num_leaves=int(num_leaves),
            max_depth=int(max_depth),
            subsample=subsample,
            min_child_samples=int(min_child_samples),
            feature_fraction=feature_fraction,  # was accepted but never passed to the model
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
        ),
        X=train_df_res, y=train_df['price'], verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val  # BayesianOptimization maximizes, so return 1 - MAE to minimize the error

rf_bo = BayesianOptimization(
    rf_cv,
    {
    'num_leaves': (50, 85),
    'max_depth': (10, 30),
    'subsample': (0.4, 1),
    'min_child_samples' : (10, 30),
    'feature_fraction' :(0.5,1),
    'reg_alpha':(0,1),
    'reg_lambda':(0,1),
    }
)
rf_bo.maximize()
rf_bo.max["params"]  # the best parameter set found
The results were mediocre; it is unclear whether a much larger n_estimators is needed before this approach pays off.
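One way to check is to rebuild the model from rf_bo.max['params'] with a much larger n_estimators; a sketch (the value 10000 is an assumption, and the int casts mirror those in rf_cv):

best = rf_bo.max['params']
model_bo = lgb.LGBMRegressor(objective='regression',
                             n_estimators=10000,  # assumed; the search itself used only 100
                             num_leaves=int(best['num_leaves']),
                             max_depth=int(best['max_depth']),
                             subsample=best['subsample'],
                             min_child_samples=int(best['min_child_samples']),
                             feature_fraction=best['feature_fraction'],
                             reg_alpha=best['reg_alpha'],
                             reg_lambda=best['reg_lambda'])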

6. Embedded feature selection

from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt

threshold = [i for i in range(16)]  # candidate importance cutoffs (LightGBM importances are split counts)
score = []
for i in threshold:
    X_embedded = SelectFromModel(model, threshold=i).fit_transform(train_df_c, train_df['price'])
    once = cross_val_score(model, X_embedded, train_df['price'], cv=5).mean()  # default scoring: R^2
    score.append(once)
plt.plot(threshold, score)
plt.show()

[Figure: cross-validation score as a function of the importance threshold]
A threshold of 9 gives the best cross-validation score.
X_embedded = SelectFromModel(model, threshold=9).fit_transform(train_df_c, train_df['price'])
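To see which columns survive the cut, keep the fitted selector and read its mask with get_support(); a short sketch assuming train_df_c is a DataFrame:

selector = SelectFromModel(model, threshold=9).fit(train_df_c, train_df['price'])
selected_cols = train_df_c.columns[selector.get_support()]  # boolean mask over the columns
print(len(selected_cols), 'features kept:', list(selected_cols))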
