A minimal implementation of Bayesian optimization for XGBoost hyperparameters

The script below tunes an XGBoost binary classifier on the scikit-learn breast-cancer dataset with hyperopt's TPE algorithm, using the validation ROC AUC as the optimization objective.

```python
from functools import partial
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.early_stop import no_progress_loss
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import precision_recall_curve, auc, f1_score, roc_curve

cancer=datasets.load_breast_cancer()
x=cancer.data
y=cancer.target

def Rollover(x):
    x = x.astype(bool)
    x = ~x
    x = x.astype(int)
    return x
# TODO: flip the labels so that the minority class (malignant) becomes the positive class
y = Rollover(y)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state = 42)


def ROC_AUC(test_y, proba):
    fpr,tpr,threshold = roc_curve(test_y, proba)
    roc_auc_ = auc(fpr,tpr)
    return roc_auc_


historical_metrics = []
historical_params = {}

boosters = ['gbtree','gblinear','dart']
sampling_methods = ['uniform','gradient_based']
tree_methods = ["auto","exact","approx","hist"]
refresh_leafs = [0,1]
# process_types = ["default","update"]#,"refresh","prune"]
grow_policys = ["depthwise","lossguide"]
# sample_types = ["uniform","weighted"]
normalize_types = ["tree","forest"]



param_grid_hp = {
    'booster':hp.choice('booster',boosters)
    ,"n_estimators":hp.quniform("n_estimators",50,1000,1)
    ,"learning_rate":hp.uniform("learning_rate",1e-5,1)
    ,"gamma":hp.quniform("gamma",0,100,1)
    ,"max_depth":hp.quniform("max_depth",6,200,1)
    ,"min_child_weight":hp.quniform("min_child_weight",0,100,1)
    ,"max_delta_step":hp.quniform("max_delta_step",0,100,1)
    ,"subsample":hp.uniform("subsample",0,1)
    # ,"sampling_method":hp.choice("sampling_method",sampling_methods)
    ,"colsample_bytree":hp.uniform("colsample_bytree",0,1)
    ,"colsample_bylevel":hp.uniform("colsample_bylevel",0,1)
    ,"colsample_bynode":hp.uniform("colsample_bynode",0,1)
    ,"lambda":hp.quniform("lambda",0,200,1)
    ,"alpha":hp.quniform("alpha",0,200,1)
    ,"tree_method":hp.choice("tree_method",tree_methods)
    # ,"scale_pos_weight":hp.uniform("scale_pos_weight",0,1000)
    ,"refresh_leaf":hp.choice("refresh_leaf",refresh_leafs)
    # ,"process_type":hp.choice("process_type",process_types)
    ,"grow_policy":hp.choice("grow_policy",grow_policys)
    ,"max_leaves":hp.quniform("max_leaves",0,10000,1)
    ,"max_bin":hp.quniform("max_bin",256,1000,1)
    ,"num_parallel_tree":hp.quniform("num_parallel_tree",1,100,1)   
}
# booster_dart_params = {
#     "sample_type":hp.choice("sample_type",sample_types)
#     ,"normalize_type":hp.choice("normalize_type",normalize_types)
#     ,"rate_drop":hp.uniform("rate_drop",0,1)
#     ,"one_drop":hp.quniform("one_drop",0,1000,1)
#     ,"skip_drop":hp.uniform("skip_drop",0,1)
# }

booster_gblinear_params = {
    
}

def PR_AUC(test_y,proba,pred):
    precision,recall,_ = precision_recall_curve(test_y,proba)
    f1 ,pr_auc = f1_score(test_y,pred),auc(recall,precision)
    return pr_auc

def hyperopt_objective(hyperopt_params):
    params = {
        "objective":"binary:logistic"
        ,"booster":hyperopt_params["booster"]
        ,"learning_rate":hyperopt_params["learning_rate"]
        ,"gamma":hyperopt_params["gamma"]
        ,"max_depth":int(hyperopt_params["max_depth"])
        ,"min_child_weight":int(hyperopt_params["min_child_weight"])
        ,"max_delta_step":int(hyperopt_params["max_delta_step"])
        ,"subsample":hyperopt_params["subsample"]
        ,"verbosity":0
        # ,"sampling_method":hyperopt_params["sampling_method"]
        ,"colsample_bytree":hyperopt_params["colsample_bytree"]
        ,"colsample_bylevel":hyperopt_params["colsample_bylevel"]
        ,"colsample_bynode":hyperopt_params["colsample_bynode"]
        ,"lambda":int(hyperopt_params["lambda"])
        ,"alpha":int(hyperopt_params["alpha"])
        ,"tree_method":hyperopt_params["tree_method"]
        ,"scale_pos_weight":(y_train==0).sum()/(y_train==1).sum()
        ,"refresh_leaf":hyperopt_params["refresh_leaf"]
        # ,"process_type":hyperopt_params["process_type"]
        ,"grow_policy":hyperopt_params["grow_policy"]
        ,"max_leaves":int(hyperopt_params["max_leaves"])
        ,"max_bin":int(hyperopt_params["max_bin"])
        ,"num_parallel_tree":int(hyperopt_params["num_parallel_tree"])
    }
    # the native xgb.train API ignores the sklearn-style "n_estimators",
    # so the sampled value is used as the number of boosting rounds instead
    num_boost_round = int(hyperopt_params["n_estimators"])

    dtrain = xgb.DMatrix(x_train, label=y_train)
    clf = xgb.train(params=params
                   ,dtrain=dtrain
                   ,num_boost_round=num_boost_round
                   ,evals=[(dtrain,"train")]
                   ,verbose_eval=False  # set to True to print the training log
                   )
    dtest = xgb.DMatrix(x_val, label=y_val)
    xgboost_proba = clf.predict(dtest)

    global NOW_FUC_RUN_ITER
    NOW_FUC_RUN_ITER += 1
    metric = ROC_AUC(y_val, xgboost_proba)
    historical_metrics.append(metric)
    historical_params[NOW_FUC_RUN_ITER - 1] = {**params, "num_boost_round": num_boost_round}
    # hyperopt minimizes the objective, so return the negative ROC AUC
    return -metric

def param_hyperopt(max_evals=100):

    # record the full optimization history
    trials = Trials()

    # early stopping: stop when the loss has not improved for 100 consecutive trials
    early_stop_fn = no_progress_loss(100)

    # surrogate model; uncomment to tune TPE's own settings
    # algo = partial(tpe.suggest, n_startup_jobs=20, n_EI_candidates=50)
    params_best = fmin(hyperopt_objective       # objective function
                       , space = param_grid_hp  # search space
                       , algo = tpe.suggest     # surrogate / suggestion algorithm
                       # , algo = algo
                       , max_evals = max_evals  # maximum number of evaluations
                       , verbose=True
                       , trials = trials
                       , early_stop_fn = early_stop_fn
                      )

    # print the best parameters; fmin prints the best loss automatically
    print("\n", "best params: ", params_best, "\n")
    return params_best, trials

NOW_FUC_RUN_ITER = 0
PARAMS_BEST, TRIALS = param_hyperopt(600)
```
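
Note that `fmin` reports `hp.choice` entries as indices into the option lists and `hp.quniform` values as floats, which is one reason the script below rebuilds the model from the recorded `historical_params` rather than from `params_best` directly. As a minimal sketch, the readable parameter values can also be recovered with `hyperopt.space_eval` (assuming the `param_grid_hp` space and `PARAMS_BEST` variable defined above):

```python
from hyperopt import space_eval

# map choice indices / quantized floats back to the actual parameter values
readable_best = space_eval(param_grid_hp, PARAMS_BEST)
print("best params (decoded):", readable_best)
```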

```python
# rebuild the model with the best parameter set recorded during the search
historical_metrics = np.array(historical_metrics)
idx = np.argmax(historical_metrics)
best_params = dict(historical_params[idx])
num_boost_round = best_params.pop("num_boost_round")

dtrain = xgb.DMatrix(x_train, label=y_train)
clf = xgb.train(params=best_params
               ,dtrain=dtrain
               ,num_boost_round=num_boost_round
               ,evals=[(dtrain,"train")]
               ,verbose_eval=False  # set to True to print the training log
               )
dtest = xgb.DMatrix(x_val, label=y_val)
xgboost_proba = clf.predict(dtest)
print("validation ROC AUC with the optimized parameters:", ROC_AUC(y_val, xgboost_proba))
```
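
The `Trials` object returned by `param_hyperopt` also keeps the full optimization history, so the manually maintained `historical_metrics` list can be cross-checked against it. A small sketch using the `TRIALS` variable from the run above:

```python
# each recorded loss is the value returned by hyperopt_objective, i.e. the negative ROC AUC
losses = [loss for loss in TRIALS.losses() if loss is not None]
print("completed trials:", len(losses))
print("best ROC AUC seen by hyperopt:", -min(losses))
```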
