Table of Contents
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
plt. rcParams[ 'font.sans-serif' ] = [ 'KaiTi' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
import warnings
warnings. filterwarnings( "ignore" )
pd. set_option( 'display.max_columns' , None )
from sklearn. metrics import roc_auc_score
from sklearn. model_selection import cross_val_score
from sklearn. linear_model import LogisticRegression
from sklearn. tree import DecisionTreeClassifier
from sklearn. svm import SVC
from sklearn. ensemble import RandomForestClassifier
from sklearn. ensemble import GradientBoostingClassifier
from xgboost. sklearn import XGBClassifier
import lightgbm as lgb
from sklearn. model_selection import GridSearchCV
from sklearn. preprocessing import StandardScaler
Data import
# Load the pre-processed feature tables and assemble the training frame,
# joining everything on the shared `user` index.
op = pd.read_csv('op_done.csv', index_col='user')
base = pd.read_csv('base_done.csv', index_col='user')
tr = pd.read_csv('tr_done.csv', index_col='user')
label = pd.read_csv('train_label.csv', index_col='user')
sumbit = pd.read_csv('submit_example.csv', index_col='user')  # (sic) submission template
train = label.join(base).join(op).join(tr)
train.fillna(0, inplace=True)  # users missing from a table get all-zero features
train.head()
label sex age provider level verified using_time regist_type card_a_cnt card_b_cnt card_c_cnt agreement1 op1_cnt op2_cnt card_d_cnt agreement_total service1_cnt service1_amt service2_cnt agreement2 agreement3 agreement4 acc_count login_cnt_period1 login_cnt_period2 ip_cnt login_cnt_avg login_days_cnt province city balance balance_avg balance1 balance1_avg balance2 balance2_avg service3 service3_level product1_amount product2_amount product3_amount product4_amount product5_amount product6_amount product7_cnt product7_fail_cnt op_time is_all 116a2503b987ea81 b131ac74aa38a121 b2e7fa260df4998d type4 op_type0 op_type1 op_type2 op_type3 op_type4 mode0 mode1 mode2 mode3 mode4 channel0 channel1 channel2 channel3 channel4 ip_num platform_0 platform_1 platform_2 platform_3 platform_4 platform_5 tunnel_in_0 tunnel_in_1 tunnel_in_2 tunnel_in_3 tunnel_in_4 tunnel_out_0 tunnel_out_1 tunnel_out_2 tunnel_out_3 type1_0 type1_1 type1_2 type1_3 type1_4 type1_5 type1_6 type1_7 type1_8 type1_9 type1_10 type1_11 type1_12 type1_13 type1_14 type1_15 type1_16 type1_17 type1_18 type1_19 type2_0 type2_1 type2_2 type2_3 type2_4 type2_5 type2_6 type2_7 type2_8 type2_9 type2_10 type2_11 type2_12 type2_13 tr_time mean_amount ip_ture user Train_00000 0 0 24871 0 1 0 24712 1 24712 24712 24706 0 24731 24719 24706 24743 24706 24706 24706 1 0 0 24737 25041 24938 24719 24737 24749 1 0 14 14 1 1 16 5 0 4 1 1 1 0 0 1 24712 24706 102.0 0.705882 1.0 1.0 28.0 72.0 12.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 12.0 0.0 0.0 0.0 0.0 2.0 0.0 4.0 6.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 13.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13.0 53330.307692 0.384615 Train_00001 1 0 24889 0 2 0 24716 1 24719 24719 24706 0 24712 24712 24706 24755 24706 24706 24706 1 0 0 24737 25443 24931 24731 24749 24737 2 1 3 8 6 6 1 5 0 4 2 3 1 0 0 6 24712 24706 18.0 0.333333 11.0 0.0 1.0 6.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 
0.0 1.0 0.0 0.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 36098.000000 0.500000 Train_00002 0 0 24963 0 2 0 24736 7 24712 24712 24706 0 24712 24712 24706 24743 24706 24706 24706 1 0 0 24731 26584 26524 24774 24774 24859 2 0 8 8 1 1 9 5 0 4 1 1 1 0 0 1 24719 24719 8.0 0.125000 0.0 5.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 0.0 1.0 5.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 12.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 57329.583333 0.500000 Train_00003 0 0 24840 2 2 0 24719 3 24712 24712 24706 0 24719 24706 24706 24737 24706 24706 24706 0 1 0 24712 25571 25529 24908 24737 24846 2 1 1 1 1 1 1 1 0 4 1 1 1 0 0 1 24712 24706 108.0 0.472222 0.0 4.0 46.0 58.0 24.0 0.0 14.0 0.0 0.0 0.0 0.0 0.0 24.0 0.0 0.0 0.0 0.0 6.0 0.0 19.0 2.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 0.0 11.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 3.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 11.0 61652.454545 0.727273 Train_00004 0 0 24871 2 1 0 24707 3 24712 24712 24706 0 24706 24706 24706 24725 24706 24706 24706 0 0 0 24712 25838 25838 24755 24816 24767 1 0 9 6 1 1 10 3 0 4 1 1 1 0 0 1 24706 24706 5.0 0.200000 0.0 4.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 36689.000000 0.000000
# Split the joined frame into design matrix / target and standardize.
train_X = train.iloc[:, 1:].values  # every column after `label`
train_y = train.iloc[:, 0].values   # first column is the binary target
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0, inplace=True)
test_X = test.iloc[:, 1:].values
# BUG FIX: use transform(), not fit_transform(), so the test set is scaled
# with the statistics learned from the training data (no leakage, and the
# two matrices live in the same feature space).
test_X = stand.transform(test_X)
Model selection
# Candidate classifiers, each seeded for reproducibility.
lr = LogisticRegression(random_state=2018)
svm = SVC(probability=True, random_state=2018)  # probability=True enables predict_proba for AUC
forest = RandomForestClassifier(n_estimators=100, random_state=2018)
Gbdt = GradientBoostingClassifier(random_state=2018)
Xgbc = XGBClassifier(random_state=2018)
gbm = lgb.LGBMClassifier(random_state=2018)
model_name = ["lr", "svm", "forest", "Gbdt", "Xgbc", "gbm"]
def muti_score(model, cv=3):
    """Return the mean cross-validated ROC-AUC of `model`.

    Evaluates on the module-level `train_X` / `train_y`. The `cv` parameter
    (new, default 3 to preserve the original behaviour) controls the number
    of folds.
    """
    auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=cv)
    return auc.mean()
# Score every candidate model.
# BUG FIX: the original resolved each model via eval(name), an anti-pattern
# (fragile and unsafe); pair the names with the objects directly instead.
# Also fixes the 'socre' typo.
models = [lr, svm, forest, Gbdt, Xgbc, gbm]
scores = [(name, muti_score(model)) for name, model in zip(model_name, models)]
scores
[('lr', 0.6374291913334925),
('svm', 0.42584336157620334),
('forest', 0.6732019222635085),
('Gbdt', 0.6995580705824883),
('Xgbc', 0.6890128512134231),
('gbm', 0.7027585172289985)]
Model tuning
Comparing the results, GBDT and LightGBM perform best.
Tuning order: n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree -- reg_alpha/reg_lambda -- learning rate
Set initial parameters
# Baseline LightGBM configuration that the grid searches will refine.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}
Tuning n_estimators
# Step 1: search the number of boosting rounds and plot score vs. size.
param_1 = {'n_estimators': range(50, 150, 5)}
cv = GridSearchCV(gbm, param_grid=param_1, scoring='roc_auc', cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'], result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}
[<matplotlib.lines.Line2D at 0x1b2b4aab0f0>]
![n_estimators vs. mean CV AUC](output_17_2.png)
# Fold the winning n_estimators back into the running parameter dict.
params.update(grid_result.best_params_)
gbm = lgb.LGBMClassifier(**params)
max_depth/num_leaves
# Step 2: tune tree complexity.
# BUG FIX: the key was 'num_leaves ' (trailing space), so LightGBM silently
# treated it as an unknown kwarg and kept the default num_leaves=31 — the
# later get_params() output showing BOTH 'num_leaves': 31 and
# 'num_leaves ': 20 confirms it.
param_2 = {'max_depth': range(5, 9), 'num_leaves': range(20, 50, 2)}
cv = GridSearchCV(gbm, param_grid=param_2, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
# Keep the best tree-complexity settings.
# BUG FIX: 'num_leaves' had a trailing space, so the value never took effect.
params.update({'max_depth': 8, 'num_leaves': 20})
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
min_child_samples/min_child_weight
# Step 3: minimum-leaf constraints (samples per leaf, hessian per leaf).
param_3 = {
    'min_child_samples': range(10, 30, 2),
    'min_child_weight': [w / 1000 for w in range(0, 20, 2)],
}
cv = GridSearchCV(gbm, param_grid=param_3, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
# Keep the best leaf constraints (essentially the LightGBM defaults).
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.0,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
subsample/colsample_bytree(0.6,1)
# Step 4: row/column subsampling rates over [0.6, 0.9].
param_4 = {
    'subsample': [r / 10 for r in range(6, 10)],
    'colsample_bytree': [r / 10 for r in range(6, 10)],
}
cv = GridSearchCV(gbm, param_grid=param_4, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}
reg_alpha/reg_lambda
# Step 5: L1/L2 regularization strengths.
# BUG FIX: the original cell was a copy-paste of the subsample/colsample grid
# (contradicting the 'reg_alpha/reg_lambda' heading) and even included the
# invalid rate 0.0; search the regularization penalties instead.
param_5 = {
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1],
    'reg_lambda': [0, 0.001, 0.01, 0.1, 1],
}
cv = GridSearchCV(gbm, param_grid=param_5, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}
Learning rate
# Step 6: learning rate.
# BUG FIX: range(20) starts at 0, and a learning rate of 0.0 performs no
# boosting updates at all; start the grid at 0.01.
param_6 = {'learning_rate': [i / 100 for i in range(1, 20)]}
cv = GridSearchCV(gbm, param_grid=param_6, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}
Generate predictions for the test set
# Fit the tuned LightGBM model on the full standardized training set.
gbm. fit( train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)
# Display the per-feature split importances of the fitted model.
gbm. feature_importances_
array([ 17, 82, 16, 13, 8, 86, 16, 12, 13, 15, 0, 25, 33,
12, 25, 0, 6, 1, 11, 12, 6, 49, 87, 61, 56, 51,
58, 36, 21, 20, 38, 20, 24, 7, 17, 14, 7, 2, 34,
5, 0, 0, 39, 60, 121, 41, 67, 35, 30, 68, 66, 53,
12, 49, 3, 12, 1, 18, 2, 9, 1, 24, 26, 8, 13,
1, 28, 18, 24, 9, 0, 5, 1, 0, 0, 0, 1, 23,
11, 19, 5, 0, 8, 5, 31, 11, 4, 6, 7, 92, 26,
0, 0, 12, 0, 0, 0, 1, 0, 16, 0, 0, 0, 35,
0, 5, 0, 10, 9, 16, 0, 0, 0, 3, 5, 0, 23,
107, 49])
# Display the feature column names (everything after the `label` column).
train. iloc[ : , 1 : ] . columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
...
'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
dtype='object', length=119)
# Pair each feature name with its LightGBM importance and show the top 20.
feature_importance = pd.DataFrame({
    'feature': train.iloc[:, 1:].columns,
    'importance': gbm.feature_importances_,
})
feature_importance.sort_values(by='importance', ascending=False).head(20)
feature importance 44 product7_fail_cnt 121 117 mean_amount 107 89 type1_7 92 22 login_cnt_period1 87 5 using_time 86 1 age 82 49 b2e7fa260df4998d 68 46 is_all 67 50 type4 66 23 login_cnt_period2 61 43 product7_cnt 60 26 login_days_cnt 58 24 ip_cnt 56 51 op_type0 53 25 login_cnt_avg 51 118 ip_ture 49 21 acc_count 49 53 op_type2 49 45 op_time 41 42 product6_amount 39
# In-sample AUC on the training data (optimistic sanity check).
# BUG FIX: the original first called predict() and immediately overwrote the
# result with predict_proba() — the hard-label call was dead code.
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y, y_pre[:, 1])
0.7876456295250149
# Score the test set and write the submission file.
y = gbm.predict_proba(test_X)
y[:, 1]  # positive-class probabilities, displayed for inspection
test['prob'] = y[:, 1]
# NOTE(review): assumes `prob` is the template's first column so iloc[:, 0]
# picks up the scores just assigned — verify against submit_example.csv.
pd.DataFrame(test.iloc[:, 0]).to_csv('result.csv')
# Final check: 10-fold cross-validated AUC of the tuned model.
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)
array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
# Display the mean 10-fold AUC.
auc. mean( )
0.7211841722940205
Feature selection
# Re-evaluate the tuned model when keeping only the top-i features,
# for i = 50, 60, ..., 110.  Note: rebinds the module-level `train_X`.
list_feature = (feature_importance
                .sort_values(by='importance', ascending=False)['feature']
                .to_list())
list_socre = []  # (sic) name kept — it is echoed by the next cell
for i in range(50, 120, 10):
    top_cols = list_feature[:i]
    train_X = stand.fit_transform(train.loc[:, top_cols].values)
    cv_auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
    list_socre.append((i, cv_auc.mean()))
# Display (feature_count, mean CV AUC) pairs from the selection loop.
list_socre
[(50, 0.7164324796787287),
(60, 0.7178106094882282),
(70, 0.7200468611823796),
(80, 0.7193456575143582),
(90, 0.7190751868013574),
(100, 0.7190497035344566),
(110, 0.7182153617821309)]
# Refit on the 70 best features (the best score in the selection sweep).
# BUG FIX: the original called fit_transform on the TEST data first, scaling
# it with its own statistics (leakage) and then re-fitting on train.  Fit the
# scaler on the training data, then apply the same transform to the test set.
train_X = stand.fit_transform(train.loc[:, list_feature[:70]].values)
test_X = stand.transform(test.loc[:, list_feature[:70]].values)
gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)