Table of Contents
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
plt. rcParams[ 'font.sans-serif' ] = [ 'KaiTi' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
import warnings
warnings. filterwarnings( "ignore" )
pd. set_option( 'display.max_columns' , None )
from sklearn. metrics import roc_auc_score
from sklearn. model_selection import cross_val_score
from sklearn. linear_model import LogisticRegression
from sklearn. tree import DecisionTreeClassifier
from sklearn. svm import SVC
from sklearn. ensemble import RandomForestClassifier
from sklearn. ensemble import GradientBoostingClassifier
from xgboost. sklearn import XGBClassifier
import lightgbm as lgb
from sklearn. model_selection import GridSearchCV
from sklearn. preprocessing import StandardScaler
Data import
# Load the pre-processed feature tables and assemble the training frame,
# joining everything on the shared `user` index.
op = pd.read_csv('op_done.csv', index_col='user')
base = pd.read_csv('base_done.csv', index_col='user')
tr = pd.read_csv('tr_done.csv', index_col='user')
label = pd.read_csv('train_label.csv', index_col='user')
sumbit = pd.read_csv('submit_example.csv', index_col='user')  # (sic) submission template
train = label.join(base).join(op).join(tr)
train.fillna(0, inplace=True)  # users missing from a table get all-zero features
train.head()
label sex age provider level verified using_time regist_type card_a_cnt card_b_cnt card_c_cnt agreement1 op1_cnt op2_cnt card_d_cnt agreement_total service1_cnt service1_amt service2_cnt agreement2 agreement3 agreement4 acc_count login_cnt_period1 login_cnt_period2 ip_cnt login_cnt_avg login_days_cnt province city balance balance_avg balance1 balance1_avg balance2 balance2_avg service3 service3_level product1_amount product2_amount product3_amount product4_amount product5_amount product6_amount product7_cnt product7_fail_cnt op_time is_all 116a2503b987ea81 b131ac74aa38a121 b2e7fa260df4998d type4 op_type0 op_type1 op_type2 op_type3 op_type4 mode0 mode1 mode2 mode3 mode4 channel0 channel1 channel2 channel3 channel4 ip_num platform_0 platform_1 platform_2 platform_3 platform_4 platform_5 tunnel_in_0 tunnel_in_1 tunnel_in_2 tunnel_in_3 tunnel_in_4 tunnel_out_0 tunnel_out_1 tunnel_out_2 tunnel_out_3 type1_0 type1_1 type1_2 type1_3 type1_4 type1_5 type1_6 type1_7 type1_8 type1_9 type1_10 type1_11 type1_12 type1_13 type1_14 type1_15 type1_16 type1_17 type1_18 type1_19 type2_0 type2_1 type2_2 type2_3 type2_4 type2_5 type2_6 type2_7 type2_8 type2_9 type2_10 type2_11 type2_12 type2_13 tr_time mean_amount ip_ture user Train_00000 0 0 24871 0 1 0 24712 1 24712 24712 24706 0 24731 24719 24706 24743 24706 24706 24706 1 0 0 24737 25041 24938 24719 24737 24749 1 0 14 14 1 1 16 5 0 4 1 1 1 0 0 1 24712 24706 102.0 0.705882 1.0 1.0 28.0 72.0 12.0 0.0 44.0 0.0 0.0 0.0 0.0 0.0 12.0 0.0 0.0 0.0 0.0 2.0 0.0 4.0 6.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 13.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 13.0 53330.307692 0.384615 Train_00001 1 0 24889 0 2 0 24716 1 24719 24719 24706 0 24712 24712 24706 24755 24706 24706 24706 1 0 0 24737 25443 24931 24731 24749 24737 2 1 3 8 6 6 1 5 0 4 2 3 1 0 0 6 24712 24706 18.0 0.333333 11.0 0.0 1.0 6.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 
0.0 1.0 0.0 0.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 36098.000000 0.500000 Train_00002 0 0 24963 0 2 0 24736 7 24712 24712 24706 0 24712 24712 24706 24743 24706 24706 24706 1 0 0 24731 26584 26524 24774 24774 24859 2 0 8 8 1 1 9 5 0 4 1 1 1 0 0 1 24719 24719 8.0 0.125000 0.0 5.0 2.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2.0 0.0 1.0 5.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 12.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 57329.583333 0.500000 Train_00003 0 0 24840 2 2 0 24719 3 24712 24712 24706 0 24719 24706 24706 24737 24706 24706 24706 0 1 0 24712 25571 25529 24908 24737 24846 2 1 1 1 1 1 1 1 0 4 1 1 1 0 0 1 24712 24706 108.0 0.472222 0.0 4.0 46.0 58.0 24.0 0.0 14.0 0.0 0.0 0.0 0.0 0.0 24.0 0.0 0.0 0.0 0.0 6.0 0.0 19.0 2.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 0.0 11.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 3.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 11.0 61652.454545 0.727273 Train_00004 0 0 24871 2 1 0 24707 3 24712 24712 24706 0 24706 24706 24706 24725 24706 24706 24706 0 0 0 24712 25838 25838 24755 24816 24767 1 0 9 6 1 1 10 3 0 4 1 1 1 0 0 1 24706 24706 5.0 0.200000 0.0 4.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 36689.000000 0.000000
# Split the joined frame into design matrix / target and standardize.
train_X = train.iloc[:, 1:].values  # every column after `label`
train_y = train.iloc[:, 0].values   # first column is the binary target
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0, inplace=True)
test_X = test.iloc[:, 1:].values
# BUG FIX: use transform(), not fit_transform(), so the test set is scaled
# with the statistics learned from the training data (no leakage, and the
# two matrices live in the same feature space).
test_X = stand.transform(test_X)
Model selection
# Candidate classifiers, each seeded for reproducibility.
lr = LogisticRegression(random_state=2018)
svm = SVC(probability=True, random_state=2018)  # probability=True enables predict_proba for AUC
forest = RandomForestClassifier(n_estimators=100, random_state=2018)
Gbdt = GradientBoostingClassifier(random_state=2018)
Xgbc = XGBClassifier(random_state=2018)
gbm = lgb.LGBMClassifier(random_state=2018)
model_name = ["lr", "svm", "forest", "Gbdt", "Xgbc", "gbm"]
def muti_score(model, cv=3):
    """Return the mean cross-validated ROC-AUC of `model`.

    Evaluates on the module-level `train_X` / `train_y`. The `cv` parameter
    (new, default 3 to preserve the original behaviour) controls the number
    of folds.
    """
    auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=cv)
    return auc.mean()
# Score every candidate model.
# BUG FIX: the original resolved each model via eval(name), an anti-pattern
# (fragile and unsafe); pair the names with the objects directly instead.
# Also fixes the 'socre' typo.
models = [lr, svm, forest, Gbdt, Xgbc, gbm]
scores = [(name, muti_score(model)) for name, model in zip(model_name, models)]
scores
[('lr', 0.6374291913334925),
('svm', 0.42584336157620334),
('forest', 0.6732019222635085),
('Gbdt', 0.6995580705824883),
('Xgbc', 0.6890128512134231),
('gbm', 0.7027585172289985)]
Model tuning
Comparing the results, GBDT and LightGBM perform best.
Tuning order: n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree -- reg_alpha/reg_lambda -- learning rate
Set initial parameters
# Baseline LightGBM configuration that the grid searches will refine.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}
Tuning n_estimators
# Step 1: search the number of boosting rounds and plot score vs. size.
param_1 = {'n_estimators': range(50, 150, 5)}
cv = GridSearchCV(gbm, param_grid=param_1, scoring='roc_auc', cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'], result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}
[<matplotlib.lines.Line2D at 0x1b2b4aab0f0>]
![n_estimators vs. mean CV AUC](output_17_2.png)
# Fold the winning n_estimators back into the running parameter dict.
params.update(grid_result.best_params_)
gbm = lgb.LGBMClassifier(**params)
max_depth/num_leaves
# Step 2: tune tree complexity.
# BUG FIX: the key was 'num_leaves ' (trailing space), so LightGBM silently
# treated it as an unknown kwarg and kept the default num_leaves=31 — the
# later get_params() output showing BOTH 'num_leaves': 31 and
# 'num_leaves ': 20 confirms it.
param_2 = {'max_depth': range(5, 9), 'num_leaves': range(20, 50, 2)}
cv = GridSearchCV(gbm, param_grid=param_2, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
# Keep the best tree-complexity settings.
# BUG FIX: 'num_leaves' had a trailing space, so the value never took effect.
params.update({'max_depth': 8, 'num_leaves': 20})
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
min_child_samples/min_child_weight
# Step 3: minimum-leaf constraints (samples per leaf, hessian per leaf).
param_3 = {
    'min_child_samples': range(10, 30, 2),
    'min_child_weight': [w / 1000 for w in range(0, 20, 2)],
}
cv = GridSearchCV(gbm, param_grid=param_3, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
# Keep the best leaf constraints (essentially the LightGBM defaults).
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm = lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.0,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
subsample/colsample_bytree(0.6,1)
# Step 4: row/column subsampling rates over [0.6, 0.9].
param_4 = {
    'subsample': [r / 10 for r in range(6, 10)],
    'colsample_bytree': [r / 10 for r in range(6, 10)],
}
cv = GridSearchCV(gbm, param_grid=param_4, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}
reg_alpha/reg_lambda
# Step 5: L1/L2 regularization strengths.
# BUG FIX: the original cell was a copy-paste of the subsample/colsample grid
# (contradicting the 'reg_alpha/reg_lambda' heading) and even included the
# invalid rate 0.0; search the regularization penalties instead.
param_5 = {
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1],
    'reg_lambda': [0, 0.001, 0.01, 0.1, 1],
}
cv = GridSearchCV(gbm, param_grid=param_5, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}
Learning rate
# Step 6: learning rate.
# BUG FIX: range(20) starts at 0, and a learning rate of 0.0 performs no
# boosting updates at all; start the grid at 0.01.
param_6 = {'learning_rate': [i / 100 for i in range(1, 20)]}
cv = GridSearchCV(gbm, param_grid=param_6, scoring='roc_auc', cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_, grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}
Generate predictions for the test set
# Fit the tuned LightGBM model on the full standardized training set.
gbm. fit( train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)
# Display the per-feature split importances of the fitted model.
gbm. feature_importances_
array([ 17, 82, 16, 13, 8, 86, 16, 12, 13, 15, 0, 25, 33,
12, 25, 0, 6, 1, 11, 12, 6, 49, 87, 61, 56, 51,
58, 36, 21, 20, 38, 20, 24, 7, 17, 14, 7, 2, 34,
5, 0, 0, 39, 60, 121, 41, 67, 35, 30, 68, 66, 53,
12, 49, 3, 12, 1, 18, 2, 9, 1, 24, 26, 8, 13,
1, 28, 18, 24, 9, 0, 5, 1, 0, 0, 0, 1, 23,
11, 19, 5, 0, 8, 5, 31, 11, 4, 6, 7, 92, 26,
0, 0, 12, 0, 0, 0, 1, 0, 16, 0, 0, 0, 35,
0, 5, 0, 10, 9, 16, 0, 0, 0, 3, 5, 0, 23,
107, 49])
# Display the feature column names (everything after the `label` column).
train. iloc[ : , 1 : ] . columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
...
'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
dtype='object', length=119)
# Pair each feature name with its LightGBM importance and show the top 20.
feature_importance = pd.DataFrame({
    'feature': train.iloc[:, 1:].columns,
    'importance': gbm.feature_importances_,
})
feature_importance.sort_values(by='importance', ascending=False).head(20)
feature importance 44 product7_fail_cnt 121 117 mean_amount 107 89 type1_7 92 22 login_cnt_period1 87 5 using_time 86 1 age 82 49 b2e7fa260df4998d 68 46 is_all 67 50 type4 66 23 login_cnt_period2 61 43 product7_cnt 60 26 login_days_cnt 58 24 ip_cnt 56 51 op_type0 53 25 login_cnt_avg 51 118 ip_ture 49 21 acc_count 49 53 op_type2 49 45 op_time 41 42 product6_amount 39
# In-sample AUC on the training data (optimistic sanity check).
# BUG FIX: the original first called predict() and immediately overwrote the
# result with predict_proba() — the hard-label call was dead code.
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y, y_pre[:, 1])
0.7876456295250149
# Score the test set and write the submission file.
y = gbm.predict_proba(test_X)
y[:, 1]  # positive-class probabilities, displayed for inspection
test['prob'] = y[:, 1]
# NOTE(review): assumes `prob` is the template's first column so iloc[:, 0]
# picks up the scores just assigned — verify against submit_example.csv.
pd.DataFrame(test.iloc[:, 0]).to_csv('result.csv')
# Final check: 10-fold cross-validated AUC of the tuned model.
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)
array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
# Display the mean 10-fold AUC.
auc. mean( )
0.7211841722940205
Feature selection
# Re-evaluate the tuned model when keeping only the top-i features,
# for i = 50, 60, ..., 110.  Note: rebinds the module-level `train_X`.
list_feature = (feature_importance
                .sort_values(by='importance', ascending=False)['feature']
                .to_list())
list_socre = []  # (sic) name kept — it is echoed by the next cell
for i in range(50, 120, 10):
    top_cols = list_feature[:i]
    train_X = stand.fit_transform(train.loc[:, top_cols].values)
    cv_auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
    list_socre.append((i, cv_auc.mean()))
# Display (feature_count, mean CV AUC) pairs from the selection loop.
list_socre
[(50, 0.7164324796787287),
(60, 0.7178106094882282),
(70, 0.7200468611823796),
(80, 0.7193456575143582),
(90, 0.7190751868013574),
(100, 0.7190497035344566),
(110, 0.7182153617821309)]
# Refit on the 70 best features (the best score in the selection sweep).
# BUG FIX: the original called fit_transform on the TEST data first, scaling
# it with its own statistics (leakage) and then re-fitting on train.  Fit the
# scaler on the training data, then apply the same transform to the test set.
train_X = stand.fit_transform(train.loc[:, list_feature[:70]].values)
test_X = stand.transform(test.loc[:, list_feature[:70]].values)
gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)