数据挖掘(四):模型评估

数据挖掘(四):模型评估

# 导入包
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import warnings

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
warnings.filterwarnings('ignore')
# 导入数据
org_data = pd.read_csv("org_data.csv", encoding = 'gbk')
print(org_data.shape)
(4754, 58)
var_total = org_data.columns
var_y = ['status']
var_x = list(set(var_total) - set(var_y))
# # 数据拆分
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)

print(x.shape)
print(y.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
# 通过随机森林进一步筛选变量
tf = RandomForestClassifier(criterion='gini')
# tf = RandomForestClassifier(criterion='entropy', n_estimators=3, max_features=0.5, min_samples_split=5)
tf_model = tf.fit(x_train, y_train)
tf_model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
importance_dict = pd.DataFrame(tf_model.feature_importances_, list(x_train.columns))
importance_dict = pd.DataFrame()
importance_dict["features"] = list(x_train.columns)
importance_dict["importance"] = list(tf_model.feature_importances_)
importance_dict=importance_dict.set_index("features",drop=True)
var_sort = importance_dict.sort_values(by="importance",ascending=False)
# var_sort.plot(kind="bar")
print(var_sort)
                                            importance
features                                              
trans_fail_top_count_enum_last_1_month_woe    0.062645
history_fail_fee                              0.056417
loans_score                                   0.046255
apply_score                                   0.033745
latest_one_month_fail                         0.029400
trans_day_last_12_month                       0.027378
loans_overdue_count                           0.023206
historical_trans_day                          0.021960
consfin_avg_limit                             0.020693
max_cumulative_consume_later_1_month          0.020565
avg_price_last_12_month                       0.020489
trans_amount_3_month                          0.019648
loans_latest_day                              0.019615
number_of_trans_from_2011                     0.019153
first_transaction_day                         0.019045
loans_max_limit                               0.018373
pawns_auctions_trusts_consume_last_6_month    0.017959
consume_top_time_last_6_month                 0.017543
consfin_max_limit                             0.017293
trans_fail_top_count_enum_last_12_month       0.017243
history_suc_fee                               0.017230
consfin_credit_limit                          0.017200
trans_top_time_last_6_month                   0.016818
apply_credibility                             0.016500
latest_query_time_days                        0.016421
query_org_count                               0.016184
historical_trans_amount                       0.016157
query_sum_count                               0.015885
rank_trad_1_month_woe                         0.015321
avg_price_top_last_12_valid_month_woe         0.014694
trans_activity_day                            0.014690
consume_top_time_last_1_month                 0.014600
consfin_credibility                           0.014533
latest_query_day                              0.014241
trans_fail_top_count_enum_last_6_month        0.014174
loans_latest_time_days                        0.013789
trans_top_time_last_1_month                   0.013761
consume_mini_time_last_1_month                0.013660
pawns_auctions_trusts_consume_last_1_month    0.013265
latest_one_month_suc_woe                      0.012844
loans_count                                   0.012570
latest_six_month_loan                         0.012368
loans_credit_limit                            0.011892
loans_settle_count                            0.011867
loans_org_count_behavior                      0.011187
loans_org_count_current                       0.010854
consfin_org_count_behavior_woe                0.010742
middle_volume_percent                         0.010723
consfin_product_count_woe                     0.010007
query_cash_count_woe                          0.009518
top_trans_count_last_1_month_woe              0.009387
loans_product_count                           0.009130
loans_cash_count                              0.009084
latest_three_month_loan                       0.008288
latest_one_month_apply                        0.008153
consfin_org_count_current_woe                 0.007309
low_volume_percent                            0.006332
# 以 2% 作为选取变量的阈值
var_x = list(var_sort.importance[var_sort.importance > 0.02].index)
var_x
['trans_fail_top_count_enum_last_1_month_woe',
 'history_fail_fee',
 'loans_score',
 'apply_score',
 'latest_one_month_fail',
 'trans_day_last_12_month',
 'loans_overdue_count',
 'historical_trans_day',
 'consfin_avg_limit',
 'max_cumulative_consume_later_1_month',
 'avg_price_last_12_month']
# 重新进行数据拆分
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
# 模型实例化
# 逻辑回归、svm和决策树;随机森林和XGBoost
# svm模型结果又问题,检查一下筛选的变量和模型参数设置
Lr = LogisticRegression()
svc = SVC(kernel = 'rbf')
dt = DecisionTreeClassifier( max_depth=5)
rf = RandomForestClassifier(n_estimators=200, min_samples_leaf=5)
xgb = XGBClassifier()

model_dict = {"逻辑回归":Lr, "SVM":svc, "决策树":dt, "随机森林":rf, "XGBoost":xgb}
# 进行模型训练,打印模型评估指标
results = pd.DataFrame()
# name_list = f1_score = accuracy = []
def model_est(model_dict, x_train, x_test, y_train, y_test):
    for name, model in model_dict.items():
        model_trian = model.fit(x_train, y_train)
        y_pred_train = model_trian.predict(x_train)
        y_pred_test = model_trian.predict(x_test)
        
        # 训练集评估指标
        acc_score_train = metrics.accuracy_score(y_pred_train, y_train)
        precision_score_train = metrics.precision_score(y_pred_train, y_train)
        recall_score_train = metrics.recall_score(y_pred_train, y_train)
        f1_score_train = metrics.f1_score(y_pred_train, y_train)
        roc_auc_score_train = metrics.roc_auc_score(y_pred_train, y_train)
             
        # 测试集评估指标
        acc_score_test = metrics.accuracy_score(y_pred_test, y_test)
        precision_score_test = metrics.precision_score(y_pred_test, y_test)
        recall_score_test = metrics.recall_score(y_pred_test, y_test)
        f1_score_test = metrics.f1_score(y_pred_test, y_test)
        roc_auc_score_test = metrics.roc_auc_score(y_pred_test, y_test)
        
        print('{} 训练集准确率:{}'.format(name,acc_score_train))
        print('{} 测试集准确率:{}\n'.format(name,acc_score_test))
        
        print('{} 训练集精确率:{}'.format(name,precision_score_train))
        print('{} 测试集精确率:{}\n'.format(name,precision_score_test))
        
        print('{} 训练集召回率:{}'.format(name,recall_score_train))
        print('{} 测试集召回率:{}\n'.format(name,recall_score_test))
        
        print('{} 训练集f1评分:{}'.format(name,f1_score_train))
        print('{} 测试集f1评分:{}\n'.format(name,f1_score_test))
        
        print('{} 训练集AUC值:{}'.format(name,roc_auc_score_train))
        print('{} 测试集AUC值:{}\n'.format(name,roc_auc_score_test))
        
        # roc曲线
        fpr, tpr, th = metrics.roc_curve(y_train, y_pred_train)
        fpr_t, tpr_t, th_t = metrics.roc_curve(y_test, y_pred_test)
        plt.figure(figsize=[10, 8])
        plt.plot(fpr, tpr, 'b--')
        plt.plot(fpr_t, tpr_t, 'r--')
        plt.title(label='{} ROC curve'.format(name))
        plt.xlabel("fpr", fontsize=13)
        plt.ylabel("tpr", fontsize=13)
        plt.show()

model_est(model_dict, x_train, x_test, y_train, y_test)
逻辑回归 训练集准确率:0.7923053802224226
逻辑回归 测试集准确率:0.7743517869656622

逻辑回归 训练集精确率:0.3057553956834532
逻辑回归 测试集精确率:0.2590529247910863

逻辑回归 训练集召回率:0.6948228882833788
逻辑回归 测试集召回率:0.6241610738255033

逻辑回归 训练集f1评分:0.4246461282264779
逻辑回归 测试集f1评分:0.3661417322834645

逻辑回归 训练集AUC值:0.7496073900876353
逻辑回归 测试集AUC值:0.7080116793227673

在这里插入图片描述

SVM 训练集准确率:0.7718665464382326
SVM 测试集准确率:0.7659425367904695

SVM 训练集精确率:0.1091127098321343
SVM 测试集精确率:0.09192200557103064

SVM 训练集召回率:0.8504672897196262
SVM 测试集召回率:0.8048780487804879

SVM 训练集f1评分:0.19341126461211477
SVM 测试集f1评分:0.165

SVM 训练集AUC值:0.8098609740523597
SVM 测试集AUC值:0.7848344067856263

在这里插入图片描述

决策树 训练集准确率:0.8154493537721671
决策树 测试集准确率:0.7610371408549405

决策树 训练集精确率:0.38848920863309355
决策树 测试集精确率:0.2590529247910863

决策树 训练集召回率:0.7570093457943925
决策树 测试集召回率:0.5535714285714286

决策树 训练集f1评分:0.5134706814580032
决策树 测试集f1评分:0.35294117647058826

决策树 训练集AUC值:0.7905433069089244
决策树 测试集AUC值:0.6711463179394077

在这里插入图片描述

随机森林 训练集准确率:0.8899909828674482
随机森林 测试集准确率:0.7834618079887876

随机森林 训练集精确率:0.5947242206235012
随机森林 测试集精确率:0.31197771587743733

随机森林 训练集召回率:0.9465648854961832
随机森林 测试集召回率:0.6436781609195402

随机森林 训练集f1评分:0.7304860088365241
随机森林 测试集f1评分:0.4202626641651032

随机森林 训练集AUC值:0.9129898990449165
随机森林 测试集AUC值:0.7232756327343113

在这里插入图片描述

XGBoost 训练集准确率:0.8295761947700632
XGBoost 测试集准确率:0.7876664330763841

XGBoost 训练集精确率:0.434052757793765
XGBoost 测试集精确率:0.3370473537604457

XGBoost 训练集召回率:0.7921225382932167
XGBoost 测试集召回率:0.6505376344086021

XGBoost 训练集f1评分:0.5608055770720373
XGBoost 测试集f1评分:0.44403669724770645

XGBoost 训练集AUC值:0.813831303989814
XGBoost 测试集AUC值:0.729378406245397

在这里插入图片描述


  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值