数据挖掘(五):参数调优
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
from sklearn. model_selection import train_test_split
from sklearn. model_selection import GridSearchCV, ParameterGrid
from sklearn. svm import SVC
from sklearn. linear_model import LogisticRegression
from sklearn. tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn. ensemble import GradientBoostingClassifier
from sklearn import metrics
import warnings
# Use the SimHei font so matplotlib can render the Chinese labels used below.
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
# Keep the minus sign rendering correctly when a non-ASCII font is active.
plt. rcParams[ 'axes.unicode_minus' ] = False
# Silence sklearn/xgboost convergence and deprecation warnings during grid search.
warnings. filterwarnings( 'ignore' )
# Load the preprocessed modelling table (GBK-encoded CSV produced by earlier steps).
org_data = pd.read_csv("org_data.csv", encoding='gbk')
# Echo the table dimensions (rows, columns).
print(org_data.shape)
(4754, 58)
# Separate the target column from the predictors and build the train/test split.
var_total = org_data.columns
var_y = ['status']
# Fix: the original used `list(set(var_total) - set(var_y))`, whose ordering is
# nondeterministic across runs, so the feature-matrix column order (and any
# downstream per-feature output) was irreproducible. A list comprehension keeps
# the original column order while excluding the target.
var_x = [col for col in var_total if col not in var_y]
y = org_data[var_y]
x = org_data[var_x]
# Fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
print(x.shape)
print(y.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
# Baseline random forest (default hyper-parameters, gini impurity) fitted only
# to obtain feature importances for the screening step below.
tf = RandomForestClassifier(criterion='gini')
tf_model = tf.fit(X=x_train, y=y_train)
tf_model  # notebook-style echo of the fitted estimator
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
# Build a (feature -> importance) table from the fitted forest and sort it,
# highest importance first.
# Dead code removed: the original first constructed a positional DataFrame
# from the importances that was immediately overwritten by the empty one.
importance_dict = pd.DataFrame()
importance_dict["features"] = list(x_train.columns)
importance_dict["importance"] = list(tf_model.feature_importances_)
importance_dict = importance_dict.set_index("features", drop=True)
var_sort = importance_dict.sort_values(by="importance", ascending=False)
print(var_sort)
importance
features
trans_fail_top_count_enum_last_1_month_woe 0.059223
history_fail_fee 0.056541
loans_score 0.032077
latest_one_month_fail 0.031654
apply_score 0.031323
loans_overdue_count 0.026386
first_transaction_day 0.024519
trans_amount_3_month 0.023320
trans_day_last_12_month 0.022150
historical_trans_amount 0.021684
loans_latest_day 0.020472
consfin_avg_limit 0.019320
max_cumulative_consume_later_1_month 0.019216
avg_price_last_12_month 0.019063
number_of_trans_from_2011 0.018446
historical_trans_day 0.018180
pawns_auctions_trusts_consume_last_6_month 0.017757
consume_top_time_last_6_month 0.017473
consfin_credit_limit 0.017428
trans_activity_day 0.017384
trans_fail_top_count_enum_last_6_month 0.017367
query_sum_count 0.017095
loans_latest_time_days 0.016930
rank_trad_1_month_woe 0.016894
trans_top_time_last_6_month 0.016527
latest_query_day 0.016235
latest_query_time_days 0.016090
apply_credibility 0.016050
consfin_max_limit 0.015665
consfin_credibility 0.015618
consume_top_time_last_1_month 0.015464
pawns_auctions_trusts_consume_last_1_month 0.015397
loans_max_limit 0.015029
loans_credit_limit 0.014474
history_suc_fee 0.014274
latest_three_month_loan 0.013838
loans_settle_count 0.013797
avg_price_top_last_12_valid_month_woe 0.013660
query_org_count 0.012944
latest_six_month_loan 0.012813
middle_volume_percent 0.012598
consume_mini_time_last_1_month 0.012405
latest_one_month_suc_woe 0.012299
loans_org_count_behavior 0.012265
loans_cash_count 0.012263
loans_count 0.012134
latest_one_month_apply 0.011842
trans_fail_top_count_enum_last_12_month 0.011343
trans_top_time_last_1_month 0.010996
loans_org_count_current 0.010605
consfin_product_count_woe 0.010191
consfin_org_count_current_woe 0.009683
loans_product_count 0.009406
query_cash_count_woe 0.008782
top_trans_count_last_1_month_woe 0.008133
consfin_org_count_behavior_woe 0.007659
low_volume_percent 0.007617
# Keep only the features whose random-forest importance exceeds 0.02.
var_x = var_sort[var_sort['importance'] > 0.02].index.tolist()
var_x  # notebook-style echo of the retained feature names
['trans_fail_top_count_enum_last_1_month_woe',
'history_fail_fee',
'loans_score',
'latest_one_month_fail',
'apply_score',
'loans_overdue_count',
'first_transaction_day',
'trans_amount_3_month',
'trans_day_last_12_month',
'historical_trans_amount',
'loans_latest_day']
# Rebuild X/y with the screened feature list and re-split with the same seed
# so the split is comparable to the earlier one.
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)

# Candidate estimators.
Lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=200)
xgb = XGBClassifier(n_jobs=2)

# Shared regularisation grid: 20 log-spaced values of C in [1e-3, 1].
C = np.logspace(-3, 0, 20, base=10)

# Per-model hyper-parameter grids.
lr_param_grid = {'C': C, 'penalty': ['l1', 'l2']}
svc_param_grid = {'C': C}
dt_param_grid = {'min_samples_leaf': range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth': range(2, 5)}
rf_param_grid = {'min_samples_leaf': range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth': range(2, 5)}
xgb_param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5], 'n_estimators': [100, 200, 300, 500]}

# 5-fold grid searches, all selecting on f1.
Lr_cv = GridSearchCV(estimator=Lr, param_grid=lr_param_grid, cv=5, scoring='f1')
svc_cv = GridSearchCV(estimator=svc, param_grid=svc_param_grid, cv=5, scoring='f1')
dt_cv = GridSearchCV(estimator=dt, param_grid=dt_param_grid, cv=5, scoring='f1')
rf_cv = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5, scoring='f1')
xgb_cv = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, cv=5, scoring='f1')

# Display-name -> tuned model mapping consumed by model_est below.
model_dict = {"逻辑回归": Lr_cv, "SVM": svc_cv, "决策树": dt_cv, "随机森林": rf_cv, "XGBoost": xgb_cv}
# NOTE(review): `results` is never filled anywhere in this script — kept for
# compatibility in case later code relies on the name.
results = pd.DataFrame()
def model_est(model_dict, x_train, x_test, y_train, y_test):
    """Fit each model in ``model_dict`` and report train/test metrics.

    For every (name, estimator) pair this fits on the training split,
    predicts on both splits, prints accuracy / precision / recall / f1 /
    AUC, and plots train (blue) vs. test (red) ROC curves.

    Bug fixed: sklearn metric functions follow the signature
    ``metric(y_true, y_pred)``; the original passed the arguments swapped,
    so the reported "precision" was actually recall (and vice versa) and
    AUC was computed with the true/predicted roles reversed. The
    ``roc_curve`` calls already used the correct order and are unchanged.
    """
    for name, model in model_dict.items():
        fitted = model.fit(x_train, y_train)
        y_pred_train = fitted.predict(x_train)
        y_pred_test = fitted.predict(x_test)
        # sklearn convention: y_true first, y_pred second.
        acc_score_train = metrics.accuracy_score(y_train, y_pred_train)
        precision_score_train = metrics.precision_score(y_train, y_pred_train)
        recall_score_train = metrics.recall_score(y_train, y_pred_train)
        f1_score_train = metrics.f1_score(y_train, y_pred_train)
        # NOTE(review): AUC here is computed from hard 0/1 predictions, not
        # probabilities, so it reduces to balanced accuracy.
        roc_auc_score_train = metrics.roc_auc_score(y_train, y_pred_train)
        acc_score_test = metrics.accuracy_score(y_test, y_pred_test)
        precision_score_test = metrics.precision_score(y_test, y_pred_test)
        recall_score_test = metrics.recall_score(y_test, y_pred_test)
        f1_score_test = metrics.f1_score(y_test, y_pred_test)
        roc_auc_score_test = metrics.roc_auc_score(y_test, y_pred_test)
        print('{} 训练集准确率:{}'.format(name, acc_score_train))
        print('{} 测试集准确率:{}\n'.format(name, acc_score_test))
        print('{} 训练集精确率:{}'.format(name, precision_score_train))
        print('{} 测试集精确率:{}\n'.format(name, precision_score_test))
        print('{} 训练集召回率:{}'.format(name, recall_score_train))
        print('{} 测试集召回率:{}\n'.format(name, recall_score_test))
        print('{} 训练集f1评分:{}'.format(name, f1_score_train))
        print('{} 测试集f1评分:{}\n'.format(name, f1_score_test))
        print('{} 训练集AUC值:{}'.format(name, roc_auc_score_train))
        print('{} 测试集AUC值:{}\n'.format(name, roc_auc_score_test))
        # ROC curves: train (blue dashed) vs. test (red dashed).
        fpr, tpr, th = metrics.roc_curve(y_train, y_pred_train)
        fpr_t, tpr_t, th_t = metrics.roc_curve(y_test, y_pred_test)
        plt.figure(figsize=[10, 8])
        plt.plot(fpr, tpr, 'b--')
        plt.plot(fpr_t, tpr_t, 'r--')
        plt.title(label='{} ROC curve'.format(name))
        plt.xlabel("fpr", fontsize=13)
        plt.ylabel("tpr", fontsize=13)
        plt.show()
model_est( model_dict, x_train, x_test, y_train, y_test)
逻辑回归 训练集准确率:0.7938082356477307
逻辑回归 测试集准确率:0.7778556412053259
逻辑回归 训练集精确率:0.31894484412470026
逻辑回归 测试集精确率:0.27019498607242337
逻辑回归 训练集召回率:0.6927083333333334
逻辑回归 测试集召回率:0.6381578947368421
逻辑回归 训练集f1评分:0.4367816091954024
逻辑回归 测试集f1评分:0.37964774951076313
逻辑回归 训练集AUC值:0.7498539967720014
逻辑回归 测试集AUC值:0.7163338493292053
SVM 训练集准确率:0.7682596934174932
SVM 测试集准确率:0.7659425367904695
SVM 训练集精确率:0.08752997601918465
SVM 测试集精确率:0.08356545961002786
SVM 训练集召回率:0.8795180722891566
SVM 测试集召回率:0.8571428571428571
SVM 训练集f1评分:0.15921483097055614
SVM 测试集f1评分:0.15228426395939088
SVM 训练集AUC值:0.8224655712863784
SVM 测试集AUC值:0.8103961412151068
决策树 训练集准确率:0.7992185151788398
决策树 测试集准确率:0.7631394533987386
决策树 训练集精确率:0.2973621103117506
决策树 测试集精确率:0.23676880222841226
决策树 训练集召回率:0.7515151515151515
决策树 测试集召回率:0.5704697986577181
决策树 训练集f1评分:0.42611683848797255
决策树 测试集f1评分:0.3346456692913386
决策树 训练集AUC值:0.7779931446598113
决策树 测试集AUC值:0.6780361512850406
随机森林 训练集准确率:0.8028253681995792
随机森林 测试集准确率:0.7778556412053259
随机森林 训练集精确率:0.27817745803357313
随机森林 测试集精确率:0.22284122562674094
随机森林 训练集召回率:0.8111888111888111
随机森林 测试集召回率:0.6779661016949152
随机森林 训练集f1评分:0.4142857142857143
随机森林 测试集f1评分:0.33542976939203356
随机森林 训练集AUC值:0.8066138071070659
随机森林 测试集AUC值:0.7324131501599099
XGBoost 训练集准确率:0.8557258791704238
XGBoost 测试集准确率:0.7876664330763841
XGBoost 训练集精确率:0.511990407673861
XGBoost 测试集精确率:0.3593314763231198
XGBoost 训练集召回率:0.854
XGBoost 测试集召回率:0.6386138613861386
XGBoost 训练集f1评分:0.6401799100449775
XGBoost 测试集f1评分:0.45989304812834225
XGBoost 训练集AUC值:0.8550155642023348
XGBoost 测试集AUC值:0.7254293796726612