【模型调参】lgb的参数调节

Table of Contents

import numpy as np              # 导入numpy库
import pandas as pd             # 导入pandas库
import matplotlib as mpl        # 导入matplotlib库
import matplotlib.pyplot as plt 
import seaborn as sns           # 导入seaborn库
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import StandardScaler

数据导入

op = pd.read_csv('op_done.csv',index_col='user')
base = pd.read_csv('base_done.csv',index_col='user')
tr = pd.read_csv('tr_done.csv',index_col='user')
label = pd.read_csv('train_label.csv',index_col='user')
sumbit = pd.read_csv('submit_example.csv',index_col='user')
train = label.join(base).join(op).join(tr)
train.fillna(0,inplace=True)
train.head()
labelsexageproviderlevelverifiedusing_timeregist_typecard_a_cntcard_b_cntcard_c_cntagreement1op1_cntop2_cntcard_d_cntagreement_totalservice1_cntservice1_amtservice2_cntagreement2agreement3agreement4acc_countlogin_cnt_period1login_cnt_period2ip_cntlogin_cnt_avglogin_days_cntprovincecitybalancebalance_avgbalance1balance1_avgbalance2balance2_avgservice3service3_levelproduct1_amountproduct2_amountproduct3_amountproduct4_amountproduct5_amountproduct6_amountproduct7_cntproduct7_fail_cntop_timeis_all116a2503b987ea81b131ac74aa38a121b2e7fa260df4998dtype4op_type0op_type1op_type2op_type3op_type4mode0mode1mode2mode3mode4channel0channel1channel2channel3channel4ip_numplatform_0platform_1platform_2platform_3platform_4platform_5tunnel_in_0tunnel_in_1tunnel_in_2tunnel_in_3tunnel_in_4tunnel_out_0tunnel_out_1tunnel_out_2tunnel_out_3type1_0type1_1type1_2type1_3type1_4type1_5type1_6type1_7type1_8type1_9type1_10type1_11type1_12type1_13type1_14type1_15type1_16type1_17type1_18type1_19type2_0type2_1type2_2type2_3type2_4type2_5type2_6type2_7type2_8type2_9type2_10type2_11type2_12type2_13tr_timemean_amountip_ture
user
Train_00000002487101024712124712247122470602473124719247062474324706247062470610024737250412493824719247372474910141411165041110012471224706102.00.7058821.01.028.072.012.00.044.00.00.00.00.00.012.00.00.00.00.02.00.04.06.07.00.00.00.00.00.00.00.00.07.00.013.00.00.00.00.01.00.00.00.00.02.04.00.00.00.00.00.00.00.00.06.00.00.00.07.00.00.00.00.00.00.00.00.00.00.00.00.013.053330.3076920.384615
Train_0000110248890202471612471924719247060247122471224706247552470624706247061002473725443249312473124749247372138661504231006247122470618.00.33333311.00.01.06.06.00.00.00.00.00.00.00.06.00.00.00.01.00.00.02.01.01.00.00.00.00.00.00.00.00.01.00.02.00.00.00.00.00.00.00.00.00.00.01.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.00.00.00.00.00.00.00.00.00.00.02.036098.0000000.500000
Train_000020024963020247367247122471224706024712247122470624743247062470624706100247312658426524247742477424859208811950411100124719247198.00.1250000.05.02.01.01.00.00.00.00.00.00.00.01.00.00.00.00.02.00.01.05.07.00.00.00.00.00.00.00.00.07.00.012.00.00.00.00.01.00.00.00.00.00.06.00.00.00.00.00.00.00.00.05.00.00.00.07.00.00.00.00.00.00.00.00.00.00.00.00.012.057329.5833330.500000
Train_00003002484022024719324712247122470602471924706247062473724706247062470601024712255712552924908247372484621111111041110012471224706108.00.4722220.04.046.058.024.00.014.00.00.00.00.00.024.00.00.00.00.06.00.019.02.09.00.00.00.00.00.00.00.00.09.00.011.00.00.01.00.00.00.01.00.03.00.04.00.00.00.00.00.00.00.00.02.00.00.00.09.00.00.00.00.00.00.00.00.00.00.00.00.011.061652.4545450.727273
Train_0000400248712102470732471224712247060247062470624706247252470624706247060002471225838258382475524816247671096111030411100124706247065.00.2000000.04.00.01.01.00.00.00.00.00.00.00.01.00.00.00.00.00.00.01.01.00.00.00.00.00.00.00.00.00.00.00.01.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.01.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.01.036689.0000000.000000
train_X = train.iloc[:,1:].values
train_y = train.iloc[:,0].values
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0,inplace =True)
test_X = test.iloc[:,1:].values
test_X = stand.fit_transform(test_X)

模型挑选

lr = LogisticRegression(random_state=2018)  # 逻辑回归模型
svm = SVC(probability=True,random_state=2018)  # SVM模型
forest=RandomForestClassifier(n_estimators=100,random_state=2018) # 随机森林
Gbdt=GradientBoostingClassifier(random_state=2018) #CBDT
Xgbc=XGBClassifier(random_state=2018)  #Xgbc
gbm=lgb.LGBMClassifier(random_state=2018)  #lgb
model_name=["lr","svm","forest","Gbdt","Xgbc","gbm"]
def muti_score(model):
    auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=3)
    return auc.mean()
scores = []
for name in model_name:
    model = eval(name)
    socre = muti_score(model)
    scores.append((name,socre))
scores
[('lr', 0.6374291913334925),
 ('svm', 0.42584336157620334),
 ('forest', 0.6732019222635085),
 ('Gbdt', 0.6995580705824883),
 ('Xgbc', 0.6890128512134231),
 ('gbm', 0.7027585172289985)]

模型调参

经过对比 gbdt和gbm效果较好 
调参顺序 n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree --reg_alpha/reg_lambda -- 学习率
设立初始参数
params = {'boosting_type':'gbdt','objective': 'binary','subsample': 0.8,'colsample_bytree': 0.8}
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}
调解n_estimators
param_1 = {'n_estimators':range(50,150,5)}
cv = GridSearchCV(gbm,param_grid=param_1,scoring='roc_auc',cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'],result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}





[<matplotlib.lines.Line2D at 0x1b2b4aab0f0>]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hqXl8ymd-1597326586451)(output_17_2.png)]

params.update(grid_result.best_params_)
gbm=lgb.LGBMClassifier(**params)
max_depth/num_leaves
param_2 = {'max_depth':range(5,9),'num_leaves ':range(20,50,2)}
cv = GridSearchCV(gbm,param_grid=param_2,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
params.update({'max_depth': 8, 'num_leaves ': 20})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 80,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'num_leaves ': 20}
min_child_samples/min_child_weight
param_3 = {'min_child_samples':range(10,30,2),'min_child_weight':[i/1000 for i in range(0,20,2)]}
cv = GridSearchCV(gbm,param_grid=param_3,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.8,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': 8,
 'min_child_samples': 20,
 'min_child_weight': 0.0,
 'min_split_gain': 0.0,
 'n_estimators': 80,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 0.8,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'num_leaves ': 20}
subsample/colsample_bytree(0.6,1)
param_4 = {'subsample':[i/10 for i in range(6,10,1)],'colsample_bytree':[i/10 for i in range(6,10,1)]}
cv = GridSearchCV(gbm,param_grid=param_4,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}
reg_alpha/reg_lamb
param_5 = {'subsample':[i/10 for i in range(10)],'colsample_bytree':[i/10 for i in range(10)]}
cv = GridSearchCV(gbm,param_grid=param_5,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}
学习率
param_6 = {'learning_rate':[i/100 for i in range(20)]}
cv = GridSearchCV(gbm,param_grid=param_6,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}

测试集生成结果

gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves =20, objective='binary',
               subsample=0.8)
gbm.feature_importances_
array([ 17,  82,  16,  13,   8,  86,  16,  12,  13,  15,   0,  25,  33,
        12,  25,   0,   6,   1,  11,  12,   6,  49,  87,  61,  56,  51,
        58,  36,  21,  20,  38,  20,  24,   7,  17,  14,   7,   2,  34,
         5,   0,   0,  39,  60, 121,  41,  67,  35,  30,  68,  66,  53,
        12,  49,   3,  12,   1,  18,   2,   9,   1,  24,  26,   8,  13,
         1,  28,  18,  24,   9,   0,   5,   1,   0,   0,   0,   1,  23,
        11,  19,   5,   0,   8,   5,  31,  11,   4,   6,   7,  92,  26,
         0,   0,  12,   0,   0,   0,   1,   0,  16,   0,   0,   0,  35,
         0,   5,   0,  10,   9,  16,   0,   0,   0,   3,   5,   0,  23,
       107,  49])
train.iloc[:,1:].columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
       'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
       ...
       'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
       'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
      dtype='object', length=119)
feature_importance = pd.DataFrame({'feature':train.iloc[:,1:].columns,'importance':gbm.feature_importances_})
feature_importance.sort_values(by='importance',ascending=False).head(20)
featureimportance
44product7_fail_cnt121
117mean_amount107
89type1_792
22login_cnt_period187
5using_time86
1age82
49b2e7fa260df4998d68
46is_all67
50type466
23login_cnt_period261
43product7_cnt60
26login_days_cnt58
24ip_cnt56
51op_type053
25login_cnt_avg51
118ip_ture49
21acc_count49
53op_type249
45op_time41
42product6_amount39
y_pre = gbm.predict(train_X)
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y,y_pre[:,1])
0.7876456295250149
y = gbm.predict_proba(test_X)
y[:,1]
array([0.02967902, 0.44846496, 0.02377314, ..., 0.21914047, 0.28423991,
       0.16758796])
test['prob'] = y[:,1]
pd.DataFrame(test.iloc[:,0]).to_csv('result.csv')
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)

array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
       0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
auc.mean()
0.7211841722940205

特征选择

list_feature = feature_importance.sort_values(by='importance',ascending=False)['feature'].to_list()
list_socre = []
for i in range(50,120,10):
    fearture = list_feature[:i]
    train_X = stand.fit_transform(train.loc[:,fearture].values)
    auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
    list_socre.append((i,auc.mean()))
list_socre
[(50, 0.7164324796787287),
 (60, 0.7178106094882282),
 (70, 0.7200468611823796),
 (80, 0.7193456575143582),
 (90, 0.7190751868013574),
 (100, 0.7190497035344566),
 (110, 0.7182153617821309)]
test_X = stand.fit_transform(test.loc[:,list_feature[:70]].values)
train_X = stand.fit_transform(train.loc[:,list_feature[:70]].values)
gbm.fit(train_X,train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
               n_estimators=80, num_leaves =20, objective='binary',
               subsample=0.8)

  • 2
    点赞
  • 25
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值