Simulated annealing has several pitfalls when iterating toward optimal parameters. For example, if the initial solution already sits near the optimum, the stay counter keeps being reset to zero, so the early-exit condition (max_stay_counter) is never met and the search cannot stop quickly. Another problem is that the temperature T can take a very long time to cool below the minimum temperature, so convergence drags on; setting the initial temperature T_max lower mitigates this.
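A quick sanity check makes the effect of a lower T_max concrete: the number of cooling cycles needed to fall below T_min can be estimated up front. A minimal sketch, assuming scikit-opt's fast-annealing schedule T_k = T_max * exp(-c * k**quench) with c = m * exp(-n * quench), where each cycle runs L objective evaluations (the helper name cycles_to_cool is mine):
import numpy as np

def cycles_to_cool(T_max, T_min, m=1, n=1, quench=1):
    # Assumed SAFast schedule: T_k = T_max * exp(-c * k**quench), c = m * exp(-n * quench).
    # Solving T_max * exp(-c * k**quench) <= T_min for k gives the cycle count below.
    c = m * np.exp(-n * quench)
    return (np.log(T_max / T_min) / c) ** (1.0 / quench)

print(cycles_to_cool(1, 0.9))     # ~0.29: cools below T_min after a single cycle
print(cycles_to_cool(100, 1e-7))  # ~56 cycles (each costing L evaluations) with the defaults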
import pandas as pd
import numpy as np
import lightgbm as lgb
from sko.GA import GA
from sko.tools import set_run_mode
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from log_color import log, LogLevel
from tqdm import tqdm
from matplotlib import pyplot as plt
from sko.SA import SAFast
import time
import datetime
import os
def plot_roc(y_test, y_score):
    fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
    roc_auc = metrics.auc(fpr, tpr)
    plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
    plt.plot(fpr, tpr, color='black', lw=1)
    plt.plot([0, 1], [0, 1], color='red', linestyle='--')
    plt.text(0.5, 0.3, 'ROC curve (area = %0.4f)' % roc_auc)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC Curve')
    plt.show()
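# Note: plot_roc is defined but never called below; once the final model is
# trained it could be invoked as, e.g. (hypothetical call):
#   plot_roc(y_val, best_model.predict_proba(x_val)[:, 1])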
train_df = pd.read_csv('./train_v2.csv', index_col=0)
test_df = pd.read_csv('./test_v2.csv', index_col=0)
print(train_df)
x = train_df.drop(['user_id','merchant_id','label'],axis=1)
y = train_df['label']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state = 42)
gamma = 0  # defined but never used below
train_positive = (y_train==1).sum()
train_negative = (y_train==0).sum()
train_y_counter = y_train.size
alpha = train_negative/train_y_counter
log(f"""训练数据中,正例有【{train_positive}】个占比【{train_positive/train_y_counter}】
,负例有【{train_negative}】个占比【{train_negative/train_y_counter}】
,alpha值为【{alpha}】,""",LogLevel.INFO)
test_positive = (y_val==1).sum()
test_negative = (y_val==0).sum()
test_y_counter = y_val.size
log(f"""测试数据中,正例有【{test_positive}】个占比【{test_positive/test_y_counter }】
,负例有【{test_negative}】个占比【{test_negative/test_y_counter }】
,alpha值为【{test_negative/test_y_counter}】,""",LogLevel.INFO)
def params_logic(x):
    """
    Guard against a LightGBM parameter conflict: when 'boosting' is 'rf',
    'bagging_fraction' and 'feature_fraction' must be < 1.0. In the encoding
    below, x[9] is bagging_fraction, x[10] is feature_fraction, and
    int(x[2]) == 1 selects 'rf'.
    """
    if x[9] == 1.0 and x[10] == 1.0 and int(x[2]) == 1:
        index = np.random.randint(0, 3)
        if index == 0:
            x[9] = 0.999999999
        elif index == 1:
            x[10] = 0.999999999
        else:
            x[2] = 0  # fall back to 'gbdt'
    return x
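# Example (hypothetical vector): with x[9] = x[10] = 1.0 and int(x[2]) == 1 ('rf'),
# params_logic nudges one fraction down to 0.999999999 or falls back to 'gbdt',
# so LightGBM never receives the conflicting rf + fraction == 1.0 combination.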
def LightGBM_Func(x):
    boostings = ["gbdt", "rf", "dart"]
    tree_learners = ["serial", "feature", "data", "voting"]
    x = params_logic(x)
    func_start = time.time()
    params = {
        'verbose': -1,
        'min_data_in_leaf': int(x[0]),
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': int(x[1]),
        'boosting': boostings[int(x[2])],
        'n_estimators': int(x[3]),
        'tree_learner': tree_learners[int(x[4])],
        'max_bin': int(x[5]),
        'min_data_in_bin': int(x[6]),
        'max_depth': int(x[7]),
        'learning_rate': x[8],
        'bagging_fraction': x[9],
        'feature_fraction': x[10],
        'lambda_l1': x[11],
        'lambda_l2': x[12],
        'n_jobs': -1,
        'seed': int(x[13]),
        'bagging_freq': int(x[14]),
        'is_unbalance': bool(int(x[15])),
        'early_stopping_rounds': max(1, int(x[16])),  # must be at least 1
        'device_type': 'cpu',
    }
    log(f"Parameters for this iteration: [{params}]", LogLevel.INFO)
    try:
        gbm = lgb.LGBMClassifier(**params)
        gbm.fit(x_train, y_train,
                eval_metric='auc',
                eval_set=[(x_train, y_train), (x_val, y_val)])
        gbm_proba = gbm.predict_proba(x_val)[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(y_val, gbm_proba)
        roc_auc = metrics.auc(fpr, tpr)
    except Exception as e:
        # conflicting parameter combinations raise; score them with the worst AUC
        log(f"Iteration failed: [{e}]", LogLevel.INFO)
        roc_auc = 0
    func_end = time.time()
    params_log_path = "./lightGBM_opt_params_log_binary.csv"
    params_log = {k: [v] for k, v in params.items()}
    params_log.update({"roc_auc": [roc_auc]})
    params_log = pd.DataFrame(params_log)
    if os.path.exists(params_log_path):
        params_log.to_csv(params_log_path, mode='a', header=False, index=False)
    else:
        params_log.to_csv(params_log_path, index=False)
    global NOW_FUC_RUN_ITER
    NOW_FUC_RUN_ITER += 1
    log(f"""AUC this iteration: [{roc_auc}],
took [{func_end - func_start}] s,
optimization call number [{NOW_FUC_RUN_ITER}],
total elapsed: [{datetime.timedelta(seconds=(func_end - SA_start_time))}]
""", LogLevel.PASS)
    return -roc_auc  # SAFast minimizes, so return the negated AUC
SA_start_time = time.time()
NOW_FUC_RUN_ITER = 0
SA_params_table = pd.read_csv("SA_params_for_LightGMB.csv")  # read for reference only; the bounds below are hard-coded
lbs=[5.0, 20.0, 0.0, 100.0, 0.0, 20.0, 1.0, 0.0, 0.0001, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
ubs=[100, 100, 1, 2000, 3, 1000, 20, 50, 1, 1, 1, 1000, 1000, 100, 99, 1, 1000]
x0s=[20.0, 31.0, 0.0, 100.0, 0.0, 50.0, 3.0, 15.0, 0.01, 0.8, 0.8, 0.0, 0.0, 42.0, 3.0, 1.0, 30.0]
print(f"""lbs:{lbs},ubs:{ubs},x0s:{X0s}""")
sa = SAFast(func=LightGBM_Func,
            x0=x0s,
            T_max=1,
            T_min=0.9,
            L=300,
            max_stay_counter=10,
            lb=lbs,
            ub=ubs,
            m=1,
            n=1,
            quench=1)
best_x, best_y = sa.run()
print('best_x:', best_x, 'best_y:', best_y, 'y_history:', len(sa.best_y_history), 'iter_cycle:', sa.iter_cycle)
boostings = ["gbdt","rf","dart"]
tree_learners = ["serial","feature","data","voting"]
x = best_x
params = {
    'verbose': -1,
    'min_data_in_leaf': int(x[0]),
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': int(x[1]),
    'boosting': boostings[int(x[2])],
    'n_estimators': int(x[3]),
    'tree_learner': tree_learners[int(x[4])],
    'max_bin': int(x[5]),
    'min_data_in_bin': int(x[6]),
    'max_depth': int(x[7]),
    'learning_rate': x[8],
    'bagging_fraction': x[9],
    'feature_fraction': x[10],
    'lambda_l1': x[11],
    'lambda_l2': x[12],
    'n_jobs': -1,
    'seed': int(x[13]),
    'bagging_freq': int(x[14]),
    'is_unbalance': bool(int(x[15])),
    'early_stopping_rounds': max(1, int(x[16])),
    'device_type': 'cpu',
}
best_lightGBM_params_binary_path = "best_lightGBM_params_binary.csv"
params.update({"best_auc": -best_y})  # the objective returned -AUC, so negate back
best_params_table = pd.DataFrame({k: [v] for k, v in params.items()})
if os.path.exists(best_lightGBM_params_binary_path):
    best_params_table.to_csv(best_lightGBM_params_binary_path, mode='a', header=False, index=False)
else:
    best_params_table.to_csv(best_lightGBM_params_binary_path, index=False)