数据挖掘(四):模型评估
import pandas as pd
import matplotlib. pyplot as plt
from sklearn. model_selection import train_test_split
from sklearn. svm import SVC
from sklearn. linear_model import LogisticRegression
from sklearn. tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn. ensemble import GradientBoostingClassifier
from sklearn import metrics
import warnings
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
warnings. filterwarnings( 'ignore' )
org_data = pd. read_csv( "org_data.csv" , encoding = 'gbk' )
print ( org_data. shape)
(4754, 58)
var_total = org_data. columns
var_y = [ 'status' ]
var_x = list ( set ( var_total) - set ( var_y) )
y = org_data[ var_y]
x = org_data[ var_x]
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size= 0.3 , random_state= 2018 )
print ( x. shape)
print ( y. shape)
print ( x_train. shape)
print ( y_train. shape)
print ( x_test. shape)
print ( y_test. shape)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
tf = RandomForestClassifier( criterion= 'gini' )
tf_model = tf. fit( x_train, y_train)
tf_model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
importance_dict = pd. DataFrame( tf_model. feature_importances_, list ( x_train. columns) )
importance_dict = pd. DataFrame( )
importance_dict[ "features" ] = list ( x_train. columns)
importance_dict[ "importance" ] = list ( tf_model. feature_importances_)
importance_dict= importance_dict. set_index( "features" , drop= True )
var_sort = importance_dict. sort_values( by= "importance" , ascending= False )
print ( var_sort)
importance
features
trans_fail_top_count_enum_last_1_month_woe 0.062645
history_fail_fee 0.056417
loans_score 0.046255
apply_score 0.033745
latest_one_month_fail 0.029400
trans_day_last_12_month 0.027378
loans_overdue_count 0.023206
historical_trans_day 0.021960
consfin_avg_limit 0.020693
max_cumulative_consume_later_1_month 0.020565
avg_price_last_12_month 0.020489
trans_amount_3_month 0.019648
loans_latest_day 0.019615
number_of_trans_from_2011 0.019153
first_transaction_day 0.019045
loans_max_limit 0.018373
pawns_auctions_trusts_consume_last_6_month 0.017959
consume_top_time_last_6_month 0.017543
consfin_max_limit 0.017293
trans_fail_top_count_enum_last_12_month 0.017243
history_suc_fee 0.017230
consfin_credit_limit 0.017200
trans_top_time_last_6_month 0.016818
apply_credibility 0.016500
latest_query_time_days 0.016421
query_org_count 0.016184
historical_trans_amount 0.016157
query_sum_count 0.015885
rank_trad_1_month_woe 0.015321
avg_price_top_last_12_valid_month_woe 0.014694
trans_activity_day 0.014690
consume_top_time_last_1_month 0.014600
consfin_credibility 0.014533
latest_query_day 0.014241
trans_fail_top_count_enum_last_6_month 0.014174
loans_latest_time_days 0.013789
trans_top_time_last_1_month 0.013761
consume_mini_time_last_1_month 0.013660
pawns_auctions_trusts_consume_last_1_month 0.013265
latest_one_month_suc_woe 0.012844
loans_count 0.012570
latest_six_month_loan 0.012368
loans_credit_limit 0.011892
loans_settle_count 0.011867
loans_org_count_behavior 0.011187
loans_org_count_current 0.010854
consfin_org_count_behavior_woe 0.010742
middle_volume_percent 0.010723
consfin_product_count_woe 0.010007
query_cash_count_woe 0.009518
top_trans_count_last_1_month_woe 0.009387
loans_product_count 0.009130
loans_cash_count 0.009084
latest_three_month_loan 0.008288
latest_one_month_apply 0.008153
consfin_org_count_current_woe 0.007309
low_volume_percent 0.006332
var_x = list ( var_sort. importance[ var_sort. importance > 0.02 ] . index)
var_x
['trans_fail_top_count_enum_last_1_month_woe',
'history_fail_fee',
'loans_score',
'apply_score',
'latest_one_month_fail',
'trans_day_last_12_month',
'loans_overdue_count',
'historical_trans_day',
'consfin_avg_limit',
'max_cumulative_consume_later_1_month',
'avg_price_last_12_month']
y = org_data[ var_y]
x = org_data[ var_x]
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size= 0.3 , random_state= 2018 )
Lr = LogisticRegression( )
svc = SVC( kernel = 'rbf' )
dt = DecisionTreeClassifier( max_depth= 5 )
rf = RandomForestClassifier( n_estimators= 200 , min_samples_leaf= 5 )
xgb = XGBClassifier( )
model_dict = { "逻辑回归" : Lr, "SVM" : svc, "决策树" : dt, "随机森林" : rf, "XGBoost" : xgb}
results = pd. DataFrame( )
def model_est ( model_dict, x_train, x_test, y_train, y_test) :
for name, model in model_dict. items( ) :
model_trian = model. fit( x_train, y_train)
y_pred_train = model_trian. predict( x_train)
y_pred_test = model_trian. predict( x_test)
acc_score_train = metrics. accuracy_score( y_pred_train, y_train)
precision_score_train = metrics. precision_score( y_pred_train, y_train)
recall_score_train = metrics. recall_score( y_pred_train, y_train)
f1_score_train = metrics. f1_score( y_pred_train, y_train)
roc_auc_score_train = metrics. roc_auc_score( y_pred_train, y_train)
acc_score_test = metrics. accuracy_score( y_pred_test, y_test)
precision_score_test = metrics. precision_score( y_pred_test, y_test)
recall_score_test = metrics. recall_score( y_pred_test, y_test)
f1_score_test = metrics. f1_score( y_pred_test, y_test)
roc_auc_score_test = metrics. roc_auc_score( y_pred_test, y_test)
print ( '{} 训练集准确率:{}' . format ( name, acc_score_train) )
print ( '{} 测试集准确率:{}\n' . format ( name, acc_score_test) )
print ( '{} 训练集精确率:{}' . format ( name, precision_score_train) )
print ( '{} 测试集精确率:{}\n' . format ( name, precision_score_test) )
print ( '{} 训练集召回率:{}' . format ( name, recall_score_train) )
print ( '{} 测试集召回率:{}\n' . format ( name, recall_score_test) )
print ( '{} 训练集f1评分:{}' . format ( name, f1_score_train) )
print ( '{} 测试集f1评分:{}\n' . format ( name, f1_score_test) )
print ( '{} 训练集AUC值:{}' . format ( name, roc_auc_score_train) )
print ( '{} 测试集AUC值:{}\n' . format ( name, roc_auc_score_test) )
fpr, tpr, th = metrics. roc_curve( y_train, y_pred_train)
fpr_t, tpr_t, th_t = metrics. roc_curve( y_test, y_pred_test)
plt. figure( figsize= [ 10 , 8 ] )
plt. plot( fpr, tpr, 'b--' )
plt. plot( fpr_t, tpr_t, 'r--' )
plt. title( label= '{} ROC curve' . format ( name) )
plt. xlabel( "fpr" , fontsize= 13 )
plt. ylabel( "tpr" , fontsize= 13 )
plt. show( )
model_est( model_dict, x_train, x_test, y_train, y_test)
逻辑回归 训练集准确率:0.7923053802224226
逻辑回归 测试集准确率:0.7743517869656622
逻辑回归 训练集精确率:0.3057553956834532
逻辑回归 测试集精确率:0.2590529247910863
逻辑回归 训练集召回率:0.6948228882833788
逻辑回归 测试集召回率:0.6241610738255033
逻辑回归 训练集f1评分:0.4246461282264779
逻辑回归 测试集f1评分:0.3661417322834645
逻辑回归 训练集AUC值:0.7496073900876353
逻辑回归 测试集AUC值:0.7080116793227673
SVM 训练集准确率:0.7718665464382326
SVM 测试集准确率:0.7659425367904695
SVM 训练集精确率:0.1091127098321343
SVM 测试集精确率:0.09192200557103064
SVM 训练集召回率:0.8504672897196262
SVM 测试集召回率:0.8048780487804879
SVM 训练集f1评分:0.19341126461211477
SVM 测试集f1评分:0.165
SVM 训练集AUC值:0.8098609740523597
SVM 测试集AUC值:0.7848344067856263
决策树 训练集准确率:0.8154493537721671
决策树 测试集准确率:0.7610371408549405
决策树 训练集精确率:0.38848920863309355
决策树 测试集精确率:0.2590529247910863
决策树 训练集召回率:0.7570093457943925
决策树 测试集召回率:0.5535714285714286
决策树 训练集f1评分:0.5134706814580032
决策树 测试集f1评分:0.35294117647058826
决策树 训练集AUC值:0.7905433069089244
决策树 测试集AUC值:0.6711463179394077
随机森林 训练集准确率:0.8899909828674482
随机森林 测试集准确率:0.7834618079887876
随机森林 训练集精确率:0.5947242206235012
随机森林 测试集精确率:0.31197771587743733
随机森林 训练集召回率:0.9465648854961832
随机森林 测试集召回率:0.6436781609195402
随机森林 训练集f1评分:0.7304860088365241
随机森林 测试集f1评分:0.4202626641651032
随机森林 训练集AUC值:0.9129898990449165
随机森林 测试集AUC值:0.7232756327343113
XGBoost 训练集准确率:0.8295761947700632
XGBoost 测试集准确率:0.7876664330763841
XGBoost 训练集精确率:0.434052757793765
XGBoost 测试集精确率:0.3370473537604457
XGBoost 训练集召回率:0.7921225382932167
XGBoost 测试集召回率:0.6505376344086021
XGBoost 训练集f1评分:0.5608055770720373
XGBoost 测试集f1评分:0.44403669724770645
XGBoost 训练集AUC值:0.813831303989814
XGBoost 测试集AUC值:0.729378406245397