Datawhale Zero-Basics Introduction to Data Mining - Task 4: Modeling and Parameter Tuning


1 Learning Content

Logistic regression model
Tree models
Ensemble models
  Bagging-based ensemble models
    Random forest model
    Randomized trees model
  Boosting-based ensemble models
  Stacking-based ensemble models
Model comparison and performance evaluation
Model parameter tuning
  Greedy tuning
  Grid search tuning
  Bayesian tuning

2 Code Implementation

2.1 Importing the Data

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

2.2 Five-Fold Cross-Validation

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]

train_X = train[continuous_feature_names]
train_y = train['price']

# custom evaluation metric: MAE on the original price scale
# (labels and predictions are log1p-transformed, so expm1 recovers the raw price)
def myFeval(preds, xgbtrain):
    label = xgbtrain.get_label()
    score = mean_absolute_error(np.expm1(label), np.expm1(preds))
    return 'myFeval', score, False


param = {'boosting_type': 'gbdt',
         'num_leaves': 64,
         'max_depth': 10,
         'lambda_l2': 1,            # L2 regularization, guards against overfitting
         'lambda_l1': 1,            # L1 regularization, guards against overfitting
         'min_data_in_leaf': 20,    # guards against overfitting; rarely needs much tuning
         'objective': 'regression_l1',
         'learning_rate': 0.01,
         'min_child_samples': 20,
         'verbosity': -1,
         'feature_fraction': 0.8,
         'bagging_freq': 1,
         'bagging_fraction': 0.8,
         'bagging_seed': 11,
         'metric': 'mae',
         }
# X_data, Y_data and X_test are the feature / label arrays prepared in the earlier feature-engineering step
folds = KFold(n_splits=5, shuffle=True, random_state=2020)
oof_lgb = np.zeros(len(X_data))                # out-of-fold predictions on the training set
predictions_lgb = np.zeros(len(X_test))        # averaged test-set predictions
predictions_train_lgb = np.zeros(len(X_data))  # averaged predictions on the full training set
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data, Y_data)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_data[trn_idx], Y_data[trn_idx])
    val_data = lgb.Dataset(X_data[val_idx], Y_data[val_idx])

    num_round = 50000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=300,
                    early_stopping_rounds=300, feval=myFeval)
    # save per-fold feature importance for later inspection
    feature_list_name = clf.feature_name()
    df = pd.DataFrame(feature_list_name, columns=['feature'])
    df['importance'] = list(clf.feature_importance())
    df = df.sort_values(by='importance', ascending=False)
    df.to_csv("feature_importance" + str(fold_) + ".csv", index=False)
    oof_lgb[val_idx] = clf.predict(X_data[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    predictions_train_lgb += clf.predict(X_data, num_iteration=clf.best_iteration) / folds.n_splits

print("lightgbm score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_lgb), np.expm1(Y_data))))

In practice, a model usually fits the training set quite well, but its fit on data outside the training set is often far less satisfactory. For this reason we normally do not train on the entire dataset; instead we hold out part of it for testing, which gives a relatively objective measure of how well the learned parameters generalize to data outside the training set. This idea is called cross-validation.
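
As a minimal sketch of the same idea using scikit-learn's built-in helper (assuming a plain LinearRegression on the log-transformed price, and that the features contain no missing values at this point):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# five-fold cross-validation of a simple linear model on log1p(price);
# each fold trains on 4/5 of the data and is scored on the held-out 1/5
model = LinearRegression()
scores = cross_val_score(model, X=train_X, y=np.log1p(train_y),
                         cv=5, scoring=make_scorer(mean_absolute_error))
print('AVG MAE (log scale):', np.mean(scores))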

2.3 Simulating the Real Business Scenario

from sklearn.linear_model import LinearRegression

# hold out the last 1/5 of the data for validation and train on the first 4/5,
# mimicking the real situation of predicting later data from earlier data
sample_feature = sample_feature.reset_index(drop=True)
split_point = len(sample_feature) // 5 * 4
train = sample_feature.loc[:split_point].dropna()
val = sample_feature.loc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

model = LinearRegression()  # baseline model; assumed here, since it is not defined in this snippet
model = model.fit(train_X, train_y_ln)
mean_absolute_error(val_y_ln, model.predict(val_X))
# 0.19443858353490887

2.4 Model Parameter Tuning

2.4.1 Greedy Tuning

from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# candidate values for each parameter (illustrative ranges)
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]

# greedy tuning: optimize one parameter at a time, fixing the best values found so far
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_obj[obj] = score

best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0], num_leaves=leaves)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_leaves[leaves] = score

best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=min(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)))
    best_depth[depth] = score
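
The learning content also lists grid search as a tuning method; a minimal sketch using scikit-learn's GridSearchCV over a similar parameter grid might look like the following (the candidate values are illustrative, not tuned):

from sklearn.model_selection import GridSearchCV

# exhaustive search over all parameter combinations with 5-fold CV;
# 'neg_mean_absolute_error' is used because GridSearchCV maximizes the score
parameters = {'objective': ['regression', 'regression_l1', 'huber'],
              'num_leaves': [15, 31, 55],
              'max_depth': [10, 15, 20]}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_y_ln)
print(clf.best_params_)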

2.4.2 Bayesian Tuning

from bayes_opt import BayesianOptimization

def rf_cv(num_leaves, max_depth, min_child_samples, bagging_fraction, feature_fraction, bagging_freq):
    # 5-fold cross-validated MAE for one set of hyperparameters;
    # BayesianOptimization maximizes the objective, so return 1 - MAE
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      min_child_samples=int(min_child_samples),  # alias of min_data_in_leaf, guards against overfitting
                      boosting_type='gbdt',
                      lambda_l2=1,    # guards against overfitting
                      lambda_l1=1,    # guards against overfitting
                      learning_rate=0.1,
                      verbosity=-1,
                      bagging_fraction=round(bagging_fraction, 2),
                      feature_fraction=round(feature_fraction, 2),
                      bagging_freq=int(bagging_freq),
                      bagging_seed=11,
                      metric='mae',
                      ),
        X=train_X, y=train_y, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

# search space (lower and upper bound) for each hyperparameter
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'min_child_samples': (2, 100),
        'bagging_fraction': (0.5, 1.0),
        'feature_fraction': (0.5, 1.0),
        'bagging_freq': (0, 100),
    }
)
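
With the search space defined, the optimization is run by calling maximize; the iteration counts below are just example values, and the best result can then be read back from rf_bo.max:

# a few random initial points followed by guided Bayesian iterations
rf_bo.maximize(init_points=5, n_iter=25)

# best objective value (1 - MAE) and the corresponding hyperparameters
print(rf_bo.max['target'])
print(rf_bo.max['params'])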

3 Summary

In this task we built the models and tuned their parameters to push performance further, and validated the results with cross-validation and a time-based hold-out split.
