Automatic hyperparameter tuning with Optuna

Background


After a model has been trained and deployed for a while, the data distribution drifts and the parameters that were so painstakingly tuned no longer fit the new data, so they have to be re-optimized. There are plenty of automatic tuning tools; this post records how to do it with Optuna.
The algorithms to be tuned are CatBoost, XGBoost and LightGBM.

Step 1: import the required packages

import numpy as np
import optuna
from optuna.pruners import MedianPruner
import xgboost as xgb
from catboost import CatBoostRegressor, Pool, cv
import lightgbm as lgb

Define the objective function for each model


def objective_xgboost(trial, data, label):
    """Automatic tuning for XGBoost."""
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'verbosity': 0  # silence logging
    }
    # xgb.cv controls the number of trees via num_boost_round, so pass the
    # suggested n_estimators there instead of leaving it as an unused parameter
    num_boost_round = param.pop('n_estimators')
    dtrain = xgb.DMatrix(data, label=label)
    cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=3,
        metrics='rmse',
        early_stopping_rounds=20,
        verbose_eval=False  # silence per-round evaluation output
    )
    return cv_results['test-rmse-mean'].min()

def objective_lightgbm(trial, data, label):
    """Automatic tuning for LightGBM."""
    param = {
        'objective': 'regression',  # regression task
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # 'num_leaves': trial.suggest_int('num_leaves', 31, 2 ** 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 300, 6000),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'force_col_wise': True,
        # 'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        # 'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 10),
        # 'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-3, 10),
        'verbosity': -1  # silence logging
    }

    # make sure the label is numeric (this is a regression objective)
    label = np.array(label)
    if not np.issubdtype(label.dtype, np.number):
        raise ValueError("The label must be a numeric dtype")

    lgb_train = lgb.Dataset(data, label=label)
    cv_results = lgb.cv(
        params=param,
        train_set=lgb_train,
        num_boost_round=10000,
        nfold=3,
        stratified=False,  # do not use StratifiedKFold for regression
        metrics='rmse',
        early_stopping_rounds=10,
        verbose_eval=False,
        return_cvbooster=True  # also return the cross-validation boosters
    )

    # take the best (smallest) mean rmse across boosting rounds
    best_rmse = min(cv_results['rmse-mean'])
    # report the trial's single cross-validated score; per-round reporting
    # would be needed for the pruner to cut a trial short mid-training
    trial.report(best_rmse, step=0)

    if trial.should_prune():
        raise optuna.TrialPruned()

    return best_rmse


def objective_catboost(trial, data, label):
    """Automatic tuning for CatBoost."""
    param = {
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'iterations': trial.suggest_int('iterations', 300, 6000),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-4, 1e1),
        'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0, 1),
        'loss_function': 'RMSE',  # regression task
        'logging_level': 'Silent'
    }

    cv_data = cv(
        Pool(data, label),
        params=param,
        fold_count=3,
        plot=False,
        verbose=False
    )

    return np.min(cv_data['test-RMSE-mean'])
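
Before launching the real search it helps to sanity-check an objective on a small synthetic dataset. The sketch below is only a smoke test: the make_regression data, the throwaway study and the 5-trial budget are placeholders; the real run further down uses the project's own training frame.

from sklearn.datasets import make_regression

# smoke test: make sure objective_xgboost runs end to end on toy data
X_demo, y_demo = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
demo_study = optuna.create_study(direction='minimize')
demo_study.optimize(lambda trial: objective_xgboost(trial, X_demo, y_demo), n_trials=5)
print(demo_study.best_params, demo_study.best_value)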

Custom early stopping for the search

Optuna ships with built-in pruners, but to save time I use the following rule instead: if N consecutive trials fail to improve on the current best parameter set, the search stops.
The implementation is below; a sketch of the built-in per-step pruning pattern follows the class for comparison.

class EarlyStoppingPruner:
    """Stop the study after `patience` consecutive trials without improvement."""

    def __init__(self, patience):
        self.patience = patience
        self.best_value = None
        self.no_improvement_count = 0

    def __call__(self, study, trial):
        # Optuna callback: invoked once after each finished trial
        if self.best_value is None or study.best_value < self.best_value:
            self.best_value = study.best_value
            self.no_improvement_count = 0
        else:
            self.no_improvement_count += 1

        if self.no_improvement_count >= self.patience:
            print(f"Early stopping triggered after {self.patience} trials with no improvement.")
            study.stop()
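
For comparison, Optuna's built-in pruners work within a single trial: the objective reports an intermediate value at every step, and the pruner stops the trial early when it looks unpromising compared with other trials at the same step. The sketch below only illustrates the reporting pattern with a simulated training curve (objective_with_reports and its rmse are made up); with the boosters above you would report the per-round validation RMSE instead.

import math

def objective_with_reports(trial):
    """Sketch: per-step reporting so MedianPruner can prune inside a trial."""
    lr = trial.suggest_loguniform('lr', 1e-4, 1e-1)
    best = float('inf')
    for step in range(50):
        # simulated validation rmse that improves as "training" progresses
        rmse = 1.0 + math.exp(-lr * step)
        best = min(best, rmse)
        trial.report(rmse, step)   # intermediate value at this round
        if trial.should_prune():   # compared against other trials at the same step
            raise optuna.TrialPruned()
    return best

demo_pruned_study = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=15, interval_steps=3))
demo_pruned_study.optimize(objective_with_reports, n_trials=30)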

Define the tuning wrapper

def tune_model(objective_function, data, label, n_trials, early_stopping_pruner):
    """Run an Optuna study for the given objective and return the best parameters."""
    # MedianPruner handles within-trial pruning; the EarlyStoppingPruner callback
    # stops the whole study once improvement stalls
    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=15, interval_steps=3)
    study = optuna.create_study(direction='minimize', pruner=pruner)
    study.optimize(lambda trial: objective_function(trial, data, label), n_trials=n_trials,
                   callbacks=[early_stopping_pruner])
    return study.best_params

Start tuning

Run up to 100 trials per study; stop early if 20 consecutive trials fail to beat the current best.

patience = 20
n_trials = 100

# Tune CatBoost on both feature sets
early_stopping_pruner = EarlyStoppingPruner(patience)
cat_1_best_params = tune_model(objective_catboost, one_data_train[one_data_col_pre_1],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{cat_1_best_params}")

early_stopping_pruner = EarlyStoppingPruner(patience)
cat_2_best_params = tune_model(objective_catboost, one_data_train[one_data_col_pre_2],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{cat_2_best_params}")



logger.info("xgboost model tuning")
early_stopping_pruner = EarlyStoppingPruner(patience)
xgb_1_best_params = tune_model(objective_xgboost, one_data_train[one_data_col_pre_1],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{xgb_1_best_params}")

early_stopping_pruner = EarlyStoppingPruner(patience)
xgb_2_best_params = tune_model(objective_xgboost, one_data_train[one_data_col_pre_2],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{xgb_2_best_params}")

# Tune with LightGBM
logger.info('lightgbm model tuning')

early_stopping_pruner = EarlyStoppingPruner(patience)
lgb_1_best_params = tune_model(objective_lightgbm, one_data_train[one_data_col_pre_1],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{lgb_1_best_params}")

early_stopping_pruner = EarlyStoppingPruner(patience)
lgb_2_best_params = tune_model(objective_lightgbm, one_data_train[one_data_col_pre_2],
                               one_data_train['log_price'], n_trials, early_stopping_pruner)
print(f"Best Parameters :{lgb_2_best_params}")
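
Once the searches finish, the returned dictionaries can be plugged straight into a final fit on the full training data. A minimal sketch, assuming the same training frame and feature lists as above (the XGBoost case is analogous):

# refit final models with the tuned parameters (sketch, names are illustrative)
final_lgb = lgb.LGBMRegressor(**lgb_1_best_params)
final_lgb.fit(one_data_train[one_data_col_pre_1], one_data_train['log_price'])

final_cat = CatBoostRegressor(**cat_1_best_params, loss_function='RMSE', logging_level='Silent')
final_cat.fit(one_data_train[one_data_col_pre_1], one_data_train['log_price'])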
