Table of Contents
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
数据导入
op = pd.read_csv('op_done.csv',index_col='user')
base = pd.read_csv('base_done.csv',index_col='user')
tr = pd.read_csv('tr_done.csv',index_col='user')
label = pd.read_csv('train_label.csv',index_col='user')
sumbit = pd.read_csv('submit_example.csv',index_col='user')
train = label.join(base).join(op).join(tr)
train.fillna(0,inplace=True)
train.head()
| label | sex | age | provider | level | verified | using_time | regist_type | card_a_cnt | card_b_cnt | card_c_cnt | agreement1 | op1_cnt | op2_cnt | card_d_cnt | agreement_total | service1_cnt | service1_amt | service2_cnt | agreement2 | agreement3 | agreement4 | acc_count | login_cnt_period1 | login_cnt_period2 | ip_cnt | login_cnt_avg | login_days_cnt | province | city | balance | balance_avg | balance1 | balance1_avg | balance2 | balance2_avg | service3 | service3_level | product1_amount | product2_amount | product3_amount | product4_amount | product5_amount | product6_amount | product7_cnt | product7_fail_cnt | op_time | is_all | 116a2503b987ea81 | b131ac74aa38a121 | b2e7fa260df4998d | type4 | op_type0 | op_type1 | op_type2 | op_type3 | op_type4 | mode0 | mode1 | mode2 | mode3 | mode4 | channel0 | channel1 | channel2 | channel3 | channel4 | ip_num | platform_0 | platform_1 | platform_2 | platform_3 | platform_4 | platform_5 | tunnel_in_0 | tunnel_in_1 | tunnel_in_2 | tunnel_in_3 | tunnel_in_4 | tunnel_out_0 | tunnel_out_1 | tunnel_out_2 | tunnel_out_3 | type1_0 | type1_1 | type1_2 | type1_3 | type1_4 | type1_5 | type1_6 | type1_7 | type1_8 | type1_9 | type1_10 | type1_11 | type1_12 | type1_13 | type1_14 | type1_15 | type1_16 | type1_17 | type1_18 | type1_19 | type2_0 | type2_1 | type2_2 | type2_3 | type2_4 | type2_5 | type2_6 | type2_7 | type2_8 | type2_9 | type2_10 | type2_11 | type2_12 | type2_13 | tr_time | mean_amount | ip_ture |
---|
user | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
---|
Train_00000 | 0 | 0 | 24871 | 0 | 1 | 0 | 24712 | 1 | 24712 | 24712 | 24706 | 0 | 24731 | 24719 | 24706 | 24743 | 24706 | 24706 | 24706 | 1 | 0 | 0 | 24737 | 25041 | 24938 | 24719 | 24737 | 24749 | 1 | 0 | 14 | 14 | 1 | 1 | 16 | 5 | 0 | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 24712 | 24706 | 102.0 | 0.705882 | 1.0 | 1.0 | 28.0 | 72.0 | 12.0 | 0.0 | 44.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 4.0 | 6.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 | 53330.307692 | 0.384615 |
---|
Train_00001 | 1 | 0 | 24889 | 0 | 2 | 0 | 24716 | 1 | 24719 | 24719 | 24706 | 0 | 24712 | 24712 | 24706 | 24755 | 24706 | 24706 | 24706 | 1 | 0 | 0 | 24737 | 25443 | 24931 | 24731 | 24749 | 24737 | 2 | 1 | 3 | 8 | 6 | 6 | 1 | 5 | 0 | 4 | 2 | 3 | 1 | 0 | 0 | 6 | 24712 | 24706 | 18.0 | 0.333333 | 11.0 | 0.0 | 1.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 36098.000000 | 0.500000 |
---|
Train_00002 | 0 | 0 | 24963 | 0 | 2 | 0 | 24736 | 7 | 24712 | 24712 | 24706 | 0 | 24712 | 24712 | 24706 | 24743 | 24706 | 24706 | 24706 | 1 | 0 | 0 | 24731 | 26584 | 26524 | 24774 | 24774 | 24859 | 2 | 0 | 8 | 8 | 1 | 1 | 9 | 5 | 0 | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 24719 | 24719 | 8.0 | 0.125000 | 0.0 | 5.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 5.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 | 57329.583333 | 0.500000 |
---|
Train_00003 | 0 | 0 | 24840 | 2 | 2 | 0 | 24719 | 3 | 24712 | 24712 | 24706 | 0 | 24719 | 24706 | 24706 | 24737 | 24706 | 24706 | 24706 | 0 | 1 | 0 | 24712 | 25571 | 25529 | 24908 | 24737 | 24846 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 24712 | 24706 | 108.0 | 0.472222 | 0.0 | 4.0 | 46.0 | 58.0 | 24.0 | 0.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 24.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 19.0 | 2.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 0.0 | 11.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 61652.454545 | 0.727273 |
---|
Train_00004 | 0 | 0 | 24871 | 2 | 1 | 0 | 24707 | 3 | 24712 | 24712 | 24706 | 0 | 24706 | 24706 | 24706 | 24725 | 24706 | 24706 | 24706 | 0 | 0 | 0 | 24712 | 25838 | 25838 | 24755 | 24816 | 24767 | 1 | 0 | 9 | 6 | 1 | 1 | 10 | 3 | 0 | 4 | 1 | 1 | 1 | 0 | 0 | 1 | 24706 | 24706 | 5.0 | 0.200000 | 0.0 | 4.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 36689.000000 | 0.000000 |
---|
train_X = train.iloc[:,1:].values
train_y = train.iloc[:,0].values
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0,inplace =True)
test_X = test.iloc[:,1:].values
test_X = stand.fit_transform(test_X)
模型挑选
lr = LogisticRegression(random_state=2018)
svm = SVC(probability=True,random_state=2018)
forest=RandomForestClassifier(n_estimators=100,random_state=2018)
Gbdt=GradientBoostingClassifier(random_state=2018)
Xgbc=XGBClassifier(random_state=2018)
gbm=lgb.LGBMClassifier(random_state=2018)
model_name=["lr","svm","forest","Gbdt","Xgbc","gbm"]
def muti_score(model):
auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=3)
return auc.mean()
scores = []
for name in model_name:
model = eval(name)
socre = muti_score(model)
scores.append((name,socre))
scores
[('lr', 0.6374291913334925),
('svm', 0.42584336157620334),
('forest', 0.6732019222635085),
('Gbdt', 0.6995580705824883),
('Xgbc', 0.6890128512134231),
('gbm', 0.7027585172289985)]
模型调参
经过对比 gbdt和gbm效果较好
调参顺序 n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree --reg_alpha/reg_lambda -- 学习率
设立初始参数
params = {'boosting_type':'gbdt','objective': 'binary','subsample': 0.8,'colsample_bytree': 0.8}
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}
调解n_estimators
param_1 = {'n_estimators':range(50,150,5)}
cv = GridSearchCV(gbm,param_grid=param_1,scoring='roc_auc',cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'],result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}
[<matplotlib.lines.Line2D at 0x1b2b4aab0f0>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hqXl8ymd-1597326586451)(output_17_2.png)]
params.update(grid_result.best_params_)
gbm=lgb.LGBMClassifier(**params)
max_depth/num_leaves
param_2 = {'max_depth':range(5,9),'num_leaves ':range(20,50,2)}
cv = GridSearchCV(gbm,param_grid=param_2,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
params.update({'max_depth': 8, 'num_leaves ': 20})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
min_child_samples/min_child_weight
param_3 = {'min_child_samples':range(10,30,2),'min_child_weight':[i/1000 for i in range(0,20,2)]}
cv = GridSearchCV(gbm,param_grid=param_3,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.0,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
subsample/colsample_bytree(0.6,1)
param_4 = {'subsample':[i/10 for i in range(6,10,1)],'colsample_bytree':[i/10 for i in range(6,10,1)]}
cv = GridSearchCV(gbm,param_grid=param_4,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}
reg_alpha/reg_lamb
param_5 = {'subsample':[i/10 for i in range(10)],'colsample_bytree':[i/10 for i in range(10)]}
cv = GridSearchCV(gbm,param_grid=param_5,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}
学习率
param_6 = {'learning_rate':[i/100 for i in range(20)]}
cv = GridSearchCV(gbm,param_grid=param_6,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}
测试集生成结果
gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)
gbm.feature_importances_
array([ 17, 82, 16, 13, 8, 86, 16, 12, 13, 15, 0, 25, 33,
12, 25, 0, 6, 1, 11, 12, 6, 49, 87, 61, 56, 51,
58, 36, 21, 20, 38, 20, 24, 7, 17, 14, 7, 2, 34,
5, 0, 0, 39, 60, 121, 41, 67, 35, 30, 68, 66, 53,
12, 49, 3, 12, 1, 18, 2, 9, 1, 24, 26, 8, 13,
1, 28, 18, 24, 9, 0, 5, 1, 0, 0, 0, 1, 23,
11, 19, 5, 0, 8, 5, 31, 11, 4, 6, 7, 92, 26,
0, 0, 12, 0, 0, 0, 1, 0, 16, 0, 0, 0, 35,
0, 5, 0, 10, 9, 16, 0, 0, 0, 3, 5, 0, 23,
107, 49])
train.iloc[:,1:].columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
...
'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
dtype='object', length=119)
feature_importance = pd.DataFrame({'feature':train.iloc[:,1:].columns,'importance':gbm.feature_importances_})
feature_importance.sort_values(by='importance',ascending=False).head(20)
| feature | importance |
---|
44 | product7_fail_cnt | 121 |
---|
117 | mean_amount | 107 |
---|
89 | type1_7 | 92 |
---|
22 | login_cnt_period1 | 87 |
---|
5 | using_time | 86 |
---|
1 | age | 82 |
---|
49 | b2e7fa260df4998d | 68 |
---|
46 | is_all | 67 |
---|
50 | type4 | 66 |
---|
23 | login_cnt_period2 | 61 |
---|
43 | product7_cnt | 60 |
---|
26 | login_days_cnt | 58 |
---|
24 | ip_cnt | 56 |
---|
51 | op_type0 | 53 |
---|
25 | login_cnt_avg | 51 |
---|
118 | ip_ture | 49 |
---|
21 | acc_count | 49 |
---|
53 | op_type2 | 49 |
---|
45 | op_time | 41 |
---|
42 | product6_amount | 39 |
---|
y_pre = gbm.predict(train_X)
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y,y_pre[:,1])
0.7876456295250149
y = gbm.predict_proba(test_X)
y[:,1]
array([0.02967902, 0.44846496, 0.02377314, ..., 0.21914047, 0.28423991,
0.16758796])
test['prob'] = y[:,1]
pd.DataFrame(test.iloc[:,0]).to_csv('result.csv')
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)
array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
auc.mean()
0.7211841722940205
特征选择
list_feature = feature_importance.sort_values(by='importance',ascending=False)['feature'].to_list()
list_socre = []
for i in range(50,120,10):
fearture = list_feature[:i]
train_X = stand.fit_transform(train.loc[:,fearture].values)
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
list_socre.append((i,auc.mean()))
list_socre
[(50, 0.7164324796787287),
(60, 0.7178106094882282),
(70, 0.7200468611823796),
(80, 0.7193456575143582),
(90, 0.7190751868013574),
(100, 0.7190497035344566),
(110, 0.7182153617821309)]
test_X = stand.fit_transform(test.loc[:,list_feature[:70]].values)
train_X = stand.fit_transform(train.loc[:,list_feature[:70]].values)
gbm.fit(train_X,train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)