# A simple example of using a genetic algorithm to search for optimal XGBoost
# hyper-parameters, with a custom (focal-style) loss function.
import pandas as pd
import numpy as np
import xgboost as xgb
from sko. GA import GA
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LogisticRegression
from sklearn. metrics import classification_report
from sklearn import metrics
from log_color import log, LogLevel
from tqdm import tqdm
# --- Data loading and train/validation split (module-level state used below) ---
train_df = pd.read_csv('./train_v2.csv')
test_df = pd.read_csv('./test_v2.csv')  # NOTE(review): loaded but never used in this file — confirm it is needed downstream
# Drop identifier columns; 'label' is the binary target.
x = train_df.drop(['user_id', 'merchant_id', 'label'], axis=1)
y = train_df['label']
# 80/20 hold-out split with a fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
# Focal-loss focusing parameter consumed by logistic_obj.
# gamma = 0 reduces the objective to a class-weighted logistic loss.
gamma = 0
train_Y = y_train
# alpha: fraction of negative (label == 0) samples in the training fold,
# used as the class-balance weight inside logistic_obj.
alpha = (train_Y == 0).sum() / train_Y.size
def logistic_obj(p, dtrain):
    """Custom XGBoost objective: gradient and Hessian of a focal-style
    class-weighted binary logistic loss.

    Reads the module-level constants ``alpha`` (negative-class weight) and
    ``gamma`` (focal focusing parameter). With ``gamma == 0`` (the value set
    above) the focal terms degenerate and this reduces to a class-weighted
    logistic loss.

    Parameters:
        p: raw margin scores predicted by the booster (one per row).
        dtrain: xgb.DMatrix holding the true 0/1 labels.

    Returns:
        (grad, hess): per-row first and second derivatives of the loss
        with respect to the raw score, as required by ``xgb.train(obj=...)``.

    NOTE(review): the expressions divide by ``p`` and ``1 - p`` and take
    ``np.log(p)`` / ``np.log(1 - p)``; if the sigmoid saturates to exactly
    0 or 1 this produces inf/NaN — presumably tolerated because the caller
    runs nan_to_num on the predictions, but confirm.
    """
    y = dtrain.get_label()
    # Map raw margin scores to probabilities via the sigmoid.
    p = 1.0 / (1.0 + np.exp(-p))
    # Closed-form first derivative d(loss)/d(score); the leading p*(1-p)
    # factor is the sigmoid's derivative (chain rule from score to p).
    grad = p * (1 - p) * (
        alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p)
        - alpha * y * (1 - p) ** gamma / p
        - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p
        + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)
    )
    # Closed-form second derivative; same chain-rule structure applied twice.
    hess = p * (1 - p) * (
        p * (1 - p) * (
            -alpha * gamma ** 2 * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2
            + alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p) ** 2
            + 2 * alpha * gamma * y * (1 - p) ** gamma / (p * (1 - p))
            + alpha * y * (1 - p) ** gamma / p ** 2
            - gamma ** 2 * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2
            + 2 * gamma * p ** gamma * (1 - alpha) * (1 - y) / (p * (1 - p))
            + gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p ** 2
            + p ** gamma * (1 - alpha) * (1 - y) / (1 - p) ** 2
        )
        - p * (
            alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p)
            - alpha * y * (1 - p) ** gamma / p
            - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p
            + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)
        )
        + (1 - p) * (
            alpha * gamma * y * (1 - p) ** gamma * np.log(p) / (1 - p)
            - alpha * y * (1 - p) ** gamma / p
            - gamma * p ** gamma * (1 - alpha) * (1 - y) * np.log(1 - p) / p
            + p ** gamma * (1 - alpha) * (1 - y) / (1 - p)
        )
    )
    return grad, hess
def XGBoostAUC(p):
    """GA fitness function: train an XGBoost booster with the hyper-parameters
    encoded in chromosome ``p`` and return the negative validation AUC.

    Parameters:
        p: sequence of 17 floats (genes w1..w17); see the unpacking below
           for the meaning of each gene.

    Returns:
        float: ``-AUC`` on the hold-out set (negated because sko's GA minimizes).
    """
    etas = [0.0001, 0.001, 0.01, 0.1]
    sampling_methods = ["uniform", "gradient_based"]
    (w1, w2, w3, w4, w5, w6, w7, w8, w9,
     w10, w11, w12, w13, w14, w15, w16, w17) = p
    params = {
        "learning_rate": w1,
        # NOTE(review): xgb.train ignores "n_estimators" (it is a sklearn-API
        # parameter); the round count is num_boost_round below. Kept for record.
        "n_estimators": int(w2),
        "max_depth": int(w3),
        "min_child_weight": w4,
        "gamma": w5,
        "nthread": 5,
        # Rebalance positives by the negative/positive ratio of the training fold.
        "scale_pos_weight": (train_Y == 0).sum() / (train_Y == 1).sum(),
        "lambda": w7,
        # NOTE(review): "eta" is an alias of "learning_rate"; supplying both
        # means one silently overrides the other — confirm which gene (w1 or
        # w8) is intended to drive the step size.
        "eta": etas[int(w8)],
        "verbosity": 1,
        "eval_metric": "auc",
        "seed": int(w9),
        "max_delta_step": w10,
        # BUG FIX: the original dict listed "subsample" twice (w6, then w11);
        # the later entry silently won, so w6 never took effect. The duplicate
        # key is removed and the effective value (w11) kept; w6 stays unused
        # to preserve the 17-gene chromosome layout.
        "subsample": w11,
        "sampling_method": sampling_methods[int(w12)],
        'colsample_bytree': w13,
        'colsample_bylevel': w14,
        'colsample_bynode': w15,
        "gpu_id": 0,
        "tree_method": "gpu_hist",
        "max_leaves": int(w16),
        "num_parallel_tree": int(w17),
    }
    dtrain = xgb.DMatrix(x_train, label=y_train)
    clf = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=100,
        evals=[(dtrain, "train")],
        verbose_eval=False,
        obj=logistic_obj,  # custom focal-style objective defined above
    )
    dtest = xgb.DMatrix(x_val, label=y_val)
    lr_proba = clf.predict(dtest)
    # BUG FIX: the original np.nan_to_num(lr_proba, 0) passed 0 positionally,
    # which set the *copy* argument (copy=False), not the NaN fill value.
    # Name the argument to state the intent: replace NaN scores with 0.0.
    lr_proba = np.nan_to_num(lr_proba, nan=0.0)
    fpr, tpr, _ = metrics.roc_curve(y_val, lr_proba)
    roc_auc = metrics.auc(fpr, tpr)
    # (Dropped the original's manual "x = None" cleanup: locals are released
    # automatically when the function returns.)
    log(f"本次迭代AUC分数为:[ { roc_auc} ],本次X值为:[ { p} ]", LogLevel.PASS)
    return -roc_auc
# --- Genetic-algorithm search over the 17 hyper-parameter genes ---
# Gene order matches the unpacking inside XGBoostAUC:
#   w1 learning_rate, w2 n_estimators, w3 max_depth, w4 min_child_weight,
#   w5 gamma, w6 subsample, w7 lambda, w8 eta index (0..3),
#   w9 seed, w10 max_delta_step, w11 subsample, w12 sampling-method index (0/1),
#   w13 colsample_bytree, w14 colsample_bylevel, w15 colsample_bynode,
#   w16 max_leaves, w17 num_parallel_tree.
# size_pop=10, max_iter=5 keeps the search small (each evaluation trains a model).
ga = GA(func=XGBoostAUC,
        n_dim=17,
        size_pop=10,
        max_iter=5,
        prob_mut=0.01,
        lb=[0.1, 5, 1, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 1],
        ub=[1, 20, 20, 100, 1, 1, 100, 3, 100, 10, 1, 1, 1, 1, 1, 10, 10],
        precision=[0.1, 1, 1, 0.1, 0.1, 0.1, 0.1, 1, 1, 0.1, 0.1, 1, 0.1, 0.1, 0.1, 1, 1],
        )
# best_y is the minimized objective value, i.e. the negative AUC.
best_x, best_y = ga.run()
print('best_x:', best_x, '\n', 'best_y:', best_y)
# Persist the winning chromosome and its score for later inspection.
opt_x_log = pd.DataFrame({
    "best_x": [best_x],
    "best_y": [best_y],
})
print(f"优化结果表: { opt_x_log} ")
opt_x_log.to_csv("best_x2.csv")
# --- Decode the best chromosome into a final parameter table and persist it ---
(w1, w2, w3, w4, w5, w6, w7, w8, w9,
 w10, w11, w12, w13, w14, w15, w16, w17) = best_x
etas = [0.0001, 0.001, 0.01, 0.1]
sampling_methods = ["uniform", "gradient_based"]
params = {
    "learning_rate": w1,
    "n_estimators": int(w2),
    "max_depth": int(w3),
    "min_child_weight": w4,
    "gamma": w5,
    "nthread": 5,
    "scale_pos_weight": (train_Y == 0).sum() / (train_Y == 1).sum(),
    "lambda": w7,
    # NOTE(review): "eta" aliases "learning_rate"; both are recorded here
    # because both were searched — confirm which one the final model uses.
    "eta": etas[int(w8)],
    "verbosity": 1,
    "eval_metric": "auc",
    "seed": int(w9),
    "max_delta_step": w10,
    # BUG FIX: the original dict listed "subsample" twice (w6, then w11);
    # the later entry silently won. The duplicate key is removed and the
    # effective value (w11) kept, so the exported table now matches what
    # was actually trained during the search.
    "subsample": w11,
    "sampling_method": sampling_methods[int(w12)],
    'colsample_bytree': w13,
    'colsample_bylevel': w14,
    'colsample_bynode': w15,
    "gpu_id": 0,
    "tree_method": "gpu_hist",
    "max_leaves": int(w16),
    "num_parallel_tree": int(w17),
}
# Attach the achieved score (negative AUC, as returned by the GA objective).
# Plain assignment replaces the original single-key params.update({...}).
params["best_auc"] = best_y
# One-row table: each parameter becomes a column.
best_params_table = pd.DataFrame({k: [v] for k, v in params.items()})
best_params_table.to_csv("best_params_table.csv")