HyperGBM ships with a built-in default search space. It can handle the vast majority of problems, but in practice you may need a custom search space tailored to the characteristics of your project. The three approaches below show how to define one.
1. Fix a hyperparameter to a prior value
When you already have prior knowledge about a project, you can pin a hyperparameter to a fixed value instead of searching over it, e.g. force CatBoost to use bootstrap_type='Poisson':
from hypergbm import make_experiment
from hypergbm.search_space import GeneralSearchSpaceGenerator

my_search_space = \
    GeneralSearchSpaceGenerator(n_estimators=666, catboost_init_kwargs={'bootstrap_type': 'Poisson'})

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)
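The same pattern should work for the other estimators. A minimal sketch, assuming GeneralSearchSpaceGenerator accepts xgb_init_kwargs and lightgbm_init_kwargs in the same way as the catboost_init_kwargs shown above:

from hypergbm.search_space import GeneralSearchSpaceGenerator

# Assumption: xgb_init_kwargs / lightgbm_init_kwargs mirror catboost_init_kwargs.
my_search_space = GeneralSearchSpaceGenerator(
    xgb_init_kwargs={'max_depth': 5},                # pin the XGBoost tree depth
    lightgbm_init_kwargs={'boosting_type': 'goss'},  # pin the LightGBM boosting type
)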
2. Customize the search range of a hyperparameter
To let XGBoost search max_depth over a set of candidate values, e.g. [10, 20, 30], the recommended way is to define a subclass of GeneralSearchSpaceGenerator, as shown below:
from hypergbm import make_experiment
from hypergbm.search_space import GeneralSearchSpaceGenerator
from hypernets.core.search_space import Choice

class MySearchSpace(GeneralSearchSpaceGenerator):
    @property
    def default_xgb_init_kwargs(self):
        # keep the default kwargs and override only max_depth
        return {**super().default_xgb_init_kwargs,
                'max_depth': Choice([10, 20, 30]),
                }

my_search_space = MySearchSpace()

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)
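Choice is one of three search-space primitives from hypernets.core.search_space; the Kaggle-oriented space in the next section also uses Int and Real. A quick reference to their semantics as used in this document (not the full API):

from hypernets.core.search_space import Choice, Int, Real

# Choice([10, 20, 30])    -- sample one value from a discrete candidate list
# Int(5, 500, 5)          -- sample an integer from [5, 500] with the given step
# Real(0.3, 1, step=0.1)  -- sample a float from [0.3, 1] with the given step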
3. Customize the search space (Kaggle edition)
Below is a search space well suited to Kaggle-style competitions. It searches a much wider range than the default space and is accordingly more time-consuming; consider it when you need high performance from the individual models.
from hypergbm.search_space import GeneralSearchSpaceGenerator
from hypernets.core.search_space import Real, Choice, Int

class MyGeneralSearchSpaceGenerator(GeneralSearchSpaceGenerator):
    @property
    def default_xgb_init_kwargs(self):
        xgb_init_kwargs = {
            'tree_method': 'gpu_hist',  # train on GPU; use 'hist' on CPU-only machines
            'gpu_id': 0,
            # 'n_jobs': -1,
            # 'booster': Choice(['gbtree', 'dart']),
            'booster': 'gbtree',
            'colsample_bytree': Real(0.3, 1, step=0.1),
            'colsample_bylevel': Real(0.3, 1, step=0.1),
            # a wider alternative extends this list up to 20000
            'n_estimators': Choice([200, 400, 800, 1200, 1400, 1600, 1800, 2000, 2300, 2500,
                                    3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000,
                                    7500, 8000, 8500, 9000, 9500, 10000]),
            'gamma': Choice([0.5, 1, 1.5, 2, 3, 4, 5]),
            'reg_alpha': Choice([0, 0.1, 0.2, 0.3, 0.5, 0.7, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80, 100]),
            'reg_lambda': Choice([0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.8, 1]),
            'min_child_weight': Choice([1, 2, 3, 5, 7, 10]),
            'subsample': Real(0.3, 1, step=0.1),
            'max_depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'eval_metric': 'auc',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_xgb_init_kwargs, **xgb_init_kwargs}
    @property
    def default_lightgbm_init_kwargs(self):
        lightgbm_init_kwargs = {
            'colsample_bytree': Real(0.3, 1, step=0.1),
            # a narrower alternative stops this list at 10000
            'n_estimators': Choice([200, 400, 800, 1200, 1600, 2000, 2500, 3000, 3500, 4000,
                                    4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000,
                                    9500, 10000, 12000, 14000, 16000, 18000, 20000]),
            'boosting_type': Choice(['gbdt', 'goss']),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'max_depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'num_leaves': Int(5, 500, 5),
            'reg_alpha': Choice([0, 0.1, 0.2, 0.3, 0.5, 0.7, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80]),
            'reg_lambda': Choice([0, 0.001, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.8, 1]),
            'subsample': Real(0.3, 1, step=0.1),
            'min_child_samples': Int(2, 100, step=1),
            'min_child_weight': Choice([0.001, 0.002]),
            # 'bagging_fraction': Real(0.5, 1, step=0.1),
            'metric': 'auc',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_lightgbm_init_kwargs, **lightgbm_init_kwargs}
    @property
    def default_catboost_init_kwargs(self):
        catboost_init_kwargs = {
            'task_type': 'GPU',  # train on GPU; drop these two entries on CPU-only machines
            'devices': '0',
            # a narrower alternative stops this list at 10000
            'n_estimators': Choice([200, 400, 800, 1200, 1600, 2000, 2500, 3000, 3500, 4000,
                                    4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000,
                                    9500, 10000, 12000, 14000, 16000, 18000, 20000]),
            'depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'l2_leaf_reg': Choice([0, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80, 100]),
            'min_data_in_leaf': Int(10, 100, step=1),
            'leaf_estimation_method': Choice(['Newton', 'Gradient']),
            # 'subsample': Real(0.1, 1, step=0.1),  # disabled: incompatible with bootstrap_type='Bayesian'
            'bootstrap_type': Choice(['Poisson', 'Bayesian', 'Bernoulli']),  # 'Poisson' requires GPU training
            # 'loss_function': 'RMSE',
            'eval_metric': 'AUC',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_catboost_init_kwargs, **catboost_init_kwargs}
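As in the previous sections, the custom generator is plugged into the experiment through the search_space argument:

from hypergbm import make_experiment

my_search_space = MyGeneralSearchSpaceGenerator()

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)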