HyperGBM ships with a built-in default search space. It can handle the vast majority of problems, but in practice you may need a custom search space tailored to the characteristics of your project. The three approaches below show how to define one.
1. Fix a hyperparameter to a prior value
When you already have prior knowledge about a project, you can pin a hyperparameter to a fixed value instead of searching over it, e.g. force CatBoost to use bootstrap_type='Poisson':
from hypergbm import make_experiment
from hypergbm.search_space import GeneralSearchSpaceGenerator

my_search_space = \
    GeneralSearchSpaceGenerator(n_estimators=666, catboost_init_kwargs={'bootstrap_type': 'Poisson'})

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)
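The same pattern should work for the other estimators. A minimal sketch, assuming GeneralSearchSpaceGenerator accepts xgb_init_kwargs and lightgbm_init_kwargs in the same way as the catboost_init_kwargs shown above:

from hypergbm.search_space import GeneralSearchSpaceGenerator

# Assumption: xgb_init_kwargs / lightgbm_init_kwargs mirror catboost_init_kwargs.
my_search_space = GeneralSearchSpaceGenerator(
    xgb_init_kwargs={'max_depth': 5},                # pin the XGBoost tree depth
    lightgbm_init_kwargs={'boosting_type': 'goss'},  # pin the LightGBM boosting type
)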
2. Customize the search range of a hyperparameter
To let XGBoost search max_depth over a set of candidate values, e.g. [10, 20, 30], the recommended way is to define a subclass of GeneralSearchSpaceGenerator, as shown below:
from hypergbm import make_experiment
from hypergbm.search_space import GeneralSearchSpaceGenerator
from hypernets.core.search_space import Choice

class MySearchSpace(GeneralSearchSpaceGenerator):
    @property
    def default_xgb_init_kwargs(self):
        # keep the default kwargs and override only max_depth
        return {**super().default_xgb_init_kwargs,
                'max_depth': Choice([10, 20, 30]),
                }

my_search_space = MySearchSpace()

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)
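Choice is one of three search-space primitives from hypernets.core.search_space; the Kaggle-oriented space in the next section also uses Int and Real. A quick reference to their semantics as used in this document (not the full API):

from hypernets.core.search_space import Choice, Int, Real

# Choice([10, 20, 30])    -- sample one value from a discrete candidate list
# Int(5, 500, 5)          -- sample an integer from [5, 500] with the given step
# Real(0.3, 1, step=0.1)  -- sample a float from [0.3, 1] with the given step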
3. Customize the search space (Kaggle edition)
Below is a search space well suited to Kaggle-style competitions. It searches a much wider range than the default space and is accordingly more time-consuming; consider it when you need high performance from the individual models.
from hypergbm.search_space import GeneralSearchSpaceGenerator
from hypernets.core.search_space import Real, Choice, Int

class MyGeneralSearchSpaceGenerator(GeneralSearchSpaceGenerator):
    @property
    def default_xgb_init_kwargs(self):
        xgb_init_kwargs = {
            'tree_method': 'gpu_hist',  # train on GPU; use 'hist' on CPU-only machines
            'gpu_id': 0,
            # 'n_jobs': -1,
            # 'booster': Choice(['gbtree', 'dart']),
            'booster': 'gbtree',
            'colsample_bytree': Real(0.3, 1, step=0.1),
            'colsample_bylevel': Real(0.3, 1, step=0.1),
            # a wider alternative extends this list up to 20000
            'n_estimators': Choice([200, 400, 800, 1200, 1400, 1600, 1800, 2000, 2300, 2500,
                                    3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000,
                                    7500, 8000, 8500, 9000, 9500, 10000]),
            'gamma': Choice([0.5, 1, 1.5, 2, 3, 4, 5]),
            'reg_alpha': Choice([0, 0.1, 0.2, 0.3, 0.5, 0.7, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80, 100]),
            'reg_lambda': Choice([0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.8, 1]),
            'min_child_weight': Choice([1, 2, 3, 5, 7, 10]),
            'subsample': Real(0.3, 1, step=0.1),
            'max_depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'eval_metric': 'auc',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_xgb_init_kwargs, **xgb_init_kwargs}
    @property
    def default_lightgbm_init_kwargs(self):
        lightgbm_init_kwargs = {
            'colsample_bytree': Real(0.3, 1, step=0.1),
            # a narrower alternative stops this list at 10000
            'n_estimators': Choice([200, 400, 800, 1200, 1600, 2000, 2500, 3000, 3500, 4000,
                                    4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000,
                                    9500, 10000, 12000, 14000, 16000, 18000, 20000]),
            'boosting_type': Choice(['gbdt', 'goss']),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'max_depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'num_leaves': Int(5, 500, 5),
            'reg_alpha': Choice([0, 0.1, 0.2, 0.3, 0.5, 0.7, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80]),
            'reg_lambda': Choice([0, 0.001, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.8, 1]),
            'subsample': Real(0.3, 1, step=0.1),
            'min_child_samples': Int(2, 100, step=1),
            'min_child_weight': Choice([0.001, 0.002]),
            # 'bagging_fraction': Real(0.5, 1, step=0.1),
            'metric': 'auc',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_lightgbm_init_kwargs, **lightgbm_init_kwargs}
    @property
    def default_catboost_init_kwargs(self):
        catboost_init_kwargs = {
            'task_type': 'GPU',  # train on GPU; drop these two entries on CPU-only machines
            'devices': '0',
            # a narrower alternative stops this list at 10000
            'n_estimators': Choice([200, 400, 800, 1200, 1600, 2000, 2500, 3000, 3500, 4000,
                                    4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000,
                                    9500, 10000, 12000, 14000, 16000, 18000, 20000]),
            'depth': Choice([2, 3, 4, 5, 6, 7, 8, 9]),
            'learning_rate': Choice([0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]),
            'l2_leaf_reg': Choice([0, 1, 2, 5, 7, 10, 13, 15, 20, 40, 60, 80, 100]),
            'min_data_in_leaf': Int(10, 100, step=1),
            'leaf_estimation_method': Choice(['Newton', 'Gradient']),
            # 'subsample': Real(0.1, 1, step=0.1),  # disabled: incompatible with bootstrap_type='Bayesian'
            'bootstrap_type': Choice(['Poisson', 'Bayesian', 'Bernoulli']),  # 'Poisson' requires GPU training
            # 'loss_function': 'RMSE',
            'eval_metric': 'AUC',  # assumes a binary classification task
        }
        # merge into the default kwargs, overriding where keys collide
        return {**super().default_catboost_init_kwargs, **catboost_init_kwargs}
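As in the previous sections, the custom generator is plugged into the experiment through the search_space argument:

from hypergbm import make_experiment

my_search_space = MyGeneralSearchSpaceGenerator()

train_data = ...

experiment = make_experiment(train_data,
                             search_space=my_search_space,
                             ...)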