XGBoost Automated Hyperparameter Tuning Module

import pprint

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold


# Shared 5-fold CV splitter; a fixed seed keeps grid-search results reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=2017)

def xgb_cv(clf, train_data, label, cv_folds=5, early_stopping_rounds=50, metric='rmse'):
    """Find the optimal number of boosting rounds via xgb.cv with early stopping,
    set it on clf, refit clf, and return the round count."""
    # Reserve a 10% hold-out; xgb.cv runs on the remaining 90%.
    train_X, test_X, train_y, test_y = train_test_split(train_data, label, test_size=0.1,
                                                        random_state=2017)
    param = clf.get_xgb_params()
    dtrain = xgb.DMatrix(train_X, train_y)
    # Cross-validate up to the configured n_estimators; early stopping trims the excess.
    cv_res = xgb.cv(param, dtrain, num_boost_round=clf.get_params()['n_estimators'],
                    nfold=cv_folds, metrics=metric,
                    early_stopping_rounds=early_stopping_rounds)
    # cv_res has one row per surviving boosting round.
    clf.set_params(n_estimators=cv_res.shape[0])
    # Note: passing eval_metric to fit() is a no-op without an eval_set and was
    # removed in xgboost 2.0, so it is omitted here.
    clf.fit(train_X, train_y)

    return cv_res.shape[0]
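
For instance, used on its own, `xgb_cv` trims an over-provisioned `n_estimators` down to the early-stopping point. A minimal sketch; the random data below is purely illustrative:

```python
import numpy as np

rng = np.random.RandomState(2017)
X_demo = rng.rand(200, 5)                       # 200 samples, 5 features
y_demo = X_demo @ rng.rand(5) + 0.1 * rng.rand(200)

reg = XGBRegressor(n_estimators=1000, learning_rate=0.1)
print('rounds kept:', xgb_cv(reg, X_demo, y_demo))
```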


def grid_search_para(train_data, label, best_para=None, grid_param=None, is_search_estimator=False,
                     search_lr=0.1, scoring='neg_mean_squared_error', search_estimators=10000,
                     cv=kfold):
    """Either grid-search grid_param around the running best_para, or (when
    is_search_estimator=True) search the optimal n_estimators via xgb_cv."""
    if not is_search_estimator:
        for key, value in grid_param.items():
            print('start GridSearchCV {} in range {}'.format(key, value))

        xgb_ = XGBRegressor(**best_para)

        grid_search = GridSearchCV(estimator=xgb_, param_grid=grid_param, scoring=scoring, cv=cv)

        grid_search.fit(train_data, label)

        # Fold the winning values into the running parameter set.
        best_para.update(grid_search.best_params_)

        print('the best parameter is ', grid_search.best_params_)
        print('the best score is %f' % grid_search.best_score_)

    else:
        xgb_ = XGBRegressor(booster="dart")
        if best_para is None:
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBRegressor(**best_para)

        # Let early stopping pick the effective number of boosting rounds.
        best_estimator = xgb_cv(xgb_, train_data, label)

        best_para['n_estimators'] = best_estimator

    return best_para


def select_parameter(train_data, label):
    """Staged tuning: n_estimators -> tree structure -> gamma -> sampling ->
    regularisation, re-searching n_estimators between stages."""
    # Stage 0: fix the learning rate and find a good n_estimators.
    best_para = grid_search_para(train_data, label, is_search_estimator=True)

    # Stage 1: tree structure.
    grid_param = {'max_depth': list(range(3, 10, 1)), 'min_child_weight': list(range(1, 12, 2))}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)

    # Stage 2: minimum split loss.
    grid_param = {'gamma': [i / 10.0 for i in range(0, 5)]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)

    # Re-search n_estimators under the updated tree parameters.
    best_para = grid_search_para(train_data, label, best_para, is_search_estimator=True)

    # Stage 3: row and column subsampling.
    grid_param = {'subsample': [i / 10.0 for i in range(6, 10)],
                  'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)

    # Stage 4: L1 regularisation.
    grid_param = {'reg_alpha': [0, 1e-5, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 100]}
    best_para = grid_search_para(train_data, label, best_para, grid_param=grid_param)

    # Final pass on n_estimators with the chosen learning rate.
    best_para = grid_search_para(train_data, label, best_para, is_search_estimator=True,
                                 search_lr=0.1)

    print('The best parameter is:')
    pprint.pprint(best_para)

    return best_para



if __name__ == "__main__":
    pass
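
A minimal end-to-end sketch of the module; the synthetic dataset from `make_regression` is an assumption for illustration, and any numeric feature matrix with a regression target will do:

```python
from sklearn.datasets import make_regression
from xgboost import XGBRegressor

# Hypothetical demo data: 500 samples, 10 features.
X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=2017)

# Run the full staged search; returns the tuned parameter dict.
tuned = select_parameter(X, y)

# Train a final model with the tuned parameters.
model = XGBRegressor(**tuned)
model.fit(X, y)
```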

### Functionality of the XGBoost Model Building and Optimization Module

In a network user behavior analysis system, the XGBoost model building and optimization module is responsible for creating and tuning the gradient-boosted decision tree model used to predict user behavior. This covers not only assembling the model itself, but also improving its performance through dedicated techniques.

#### Model building

To ensure the model can capture complex patterns in the data, the model structure and its hyperparameter settings are designed carefully at this stage. Specifically:

- **Initial configuration**: define starting values for the learning rate, maximum depth, and other basic parameters[^1].
- **Custom evaluation function**: a metric tailored to the XGBoost training loop is introduced; it reports half of the mean squared error (MSE), giving a finer-grained view of model quality. A sketch follows the list.

```python
import numpy as np
import xgboost as xgb

def custom_eval_function(preds, dtrain):
    # Half of the mean squared error, reported as ('half-mse', value).
    labels = dtrain.get_label()
    half_mse = 0.5 * np.mean((preds - labels) ** 2)
    return 'half-mse', half_mse

params = {
    'objective': 'reg:squarederror',
}
# With the native training API, a callable metric is passed separately rather
# than inside params, e.g. xgb.train(params, dtrain, feval=custom_eval_function).
```

#### Model optimization

Once the base model is established, further optimization is essential. Here a sparrow search algorithm (SSA) is adopted: a heuristic global optimization strategy that searches for the hyperparameter combination that best improves the model's generalization ability[^2].

- **Parameter space exploration**: SSA traverses the high-dimensional parameter space, automatically adjusting factors such as the number of trees (`n_estimators`) and the column sampling ratio (`colsample_bytree`).
- **Cross-validation mechanism**: k-fold cross-validation is woven into the optimization loop, so that the selected parameters perform stably across different data subsets and the risk of overfitting is reduced. See the sketch after this list.

```python
from sklearn.model_selection import KFold
# Illustrative module/class names, not a published library; substitute your own
# SSA implementation.
from ssa_xgboost_optimizer import SSAXGBOptimizer

optimizer = SSAXGBOptimizer(
    param_space={
        "max_depth": (3, 7),
        "learning_rate": (0.01, 0.2),
        "subsample": (0.5, 1.0),
        "colsample_bytree": (0.5, 1.0),
        "n_estimators": (50, 200),
    },
)

# X, y: the feature matrix and target of the behavior dataset.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    optimizer.fit(X[train_index], y[train_index])

optimized_params = optimizer.best_params_
```
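
Once the search finishes, the tuned parameters can be plugged straight into the scikit-learn wrapper. A minimal sketch, reusing the hypothetical `optimized_params`, `X`, and `y` from the loop above:

```python
from xgboost import XGBRegressor

# Train the final model with the SSA-selected hyperparameters.
final_model = XGBRegressor(objective='reg:squarederror', **optimized_params)
final_model.fit(X, y)
```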