数据挖掘(五):参数调优

# Import packages: data handling, model selection, candidate classifiers,
# and evaluation metrics.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,ParameterGrid
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import warnings

# Use the SimHei font so Chinese labels render in matplotlib figures,
# and keep the minus sign displayable with a non-ASCII font.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
warnings.filterwarnings('ignore')
# Load the raw data (GBK-encoded CSV; expected shape (4754, 58) per the
# echoed output below).
org_data = pd.read_csv("org_data.csv", encoding = 'gbk')
print(org_data.shape)
(4754, 58)
# Partition columns into target and features.
var_total = org_data.columns
var_y = ['status']
# Preserve the original column order. The previous
# `list(set(var_total) - set(var_y))` relied on set arithmetic, whose
# ordering is nondeterministic across runs and hurts reproducibility.
var_x = [col for col in var_total if col not in var_y]
# Train/test split (fixed random_state for reproducibility).
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)

# Report the shapes of the full matrices and the train/test partitions.
for frame in (x, y, x_train, y_train, x_test, y_test):
    print(frame.shape)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
# Screen variables further with a random forest: fit on the training
# split and use its feature importances to rank the predictors.
tf = RandomForestClassifier(criterion='gini')
# Alternative configuration tried during exploration:
# tf = RandomForestClassifier(criterion='entropy', n_estimators=3, max_features=0.5, min_samples_split=5)
tf_model = tf.fit(x_train, y_train)
tf_model  # notebook-style echo of the fitted estimator; no effect in a script
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
# Build a per-feature importance table from the fitted forest.
# (The original first constructed a DataFrame that was immediately
# overwritten on the next line; that dead statement is removed.)
importance_dict = pd.DataFrame()
importance_dict["features"] = list(x_train.columns)
importance_dict["importance"] = list(tf_model.feature_importances_)
importance_dict = importance_dict.set_index("features", drop=True)
# Rank features from most to least important.
var_sort = importance_dict.sort_values(by="importance", ascending=False)
# var_sort.plot(kind="bar")
print(var_sort)
                                            importance
features                                              
trans_fail_top_count_enum_last_1_month_woe    0.059223
history_fail_fee                              0.056541
loans_score                                   0.032077
latest_one_month_fail                         0.031654
apply_score                                   0.031323
loans_overdue_count                           0.026386
first_transaction_day                         0.024519
trans_amount_3_month                          0.023320
trans_day_last_12_month                       0.022150
historical_trans_amount                       0.021684
loans_latest_day                              0.020472
consfin_avg_limit                             0.019320
max_cumulative_consume_later_1_month          0.019216
avg_price_last_12_month                       0.019063
number_of_trans_from_2011                     0.018446
historical_trans_day                          0.018180
pawns_auctions_trusts_consume_last_6_month    0.017757
consume_top_time_last_6_month                 0.017473
consfin_credit_limit                          0.017428
trans_activity_day                            0.017384
trans_fail_top_count_enum_last_6_month        0.017367
query_sum_count                               0.017095
loans_latest_time_days                        0.016930
rank_trad_1_month_woe                         0.016894
trans_top_time_last_6_month                   0.016527
latest_query_day                              0.016235
latest_query_time_days                        0.016090
apply_credibility                             0.016050
consfin_max_limit                             0.015665
consfin_credibility                           0.015618
consume_top_time_last_1_month                 0.015464
pawns_auctions_trusts_consume_last_1_month    0.015397
loans_max_limit                               0.015029
loans_credit_limit                            0.014474
history_suc_fee                               0.014274
latest_three_month_loan                       0.013838
loans_settle_count                            0.013797
avg_price_top_last_12_valid_month_woe         0.013660
query_org_count                               0.012944
latest_six_month_loan                         0.012813
middle_volume_percent                         0.012598
consume_mini_time_last_1_month                0.012405
latest_one_month_suc_woe                      0.012299
loans_org_count_behavior                      0.012265
loans_cash_count                              0.012263
loans_count                                   0.012134
latest_one_month_apply                        0.011842
trans_fail_top_count_enum_last_12_month       0.011343
trans_top_time_last_1_month                   0.010996
loans_org_count_current                       0.010605
consfin_product_count_woe                     0.010191
consfin_org_count_current_woe                 0.009683
loans_product_count                           0.009406
query_cash_count_woe                          0.008782
top_trans_count_last_1_month_woe              0.008133
consfin_org_count_behavior_woe                0.007659
low_volume_percent                            0.007617
# Keep only the features whose importance exceeds the 2% threshold.
var_x = var_sort.importance[var_sort.importance > 0.02].index.tolist()
var_x  # notebook-style echo of the selected feature names
['trans_fail_top_count_enum_last_1_month_woe',
 'history_fail_fee',
 'loans_score',
 'latest_one_month_fail',
 'apply_score',
 'loans_overdue_count',
 'first_transaction_day',
 'trans_amount_3_month',
 'trans_day_last_12_month',
 'historical_trans_amount',
 'loans_latest_day']
# Re-split the data using only the selected features (same seed as the
# first split so partitions stay comparable).
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
# Instantiate the candidate models:
# logistic regression, SVM and decision tree; random forest and XGBoost.
# NOTE(review): the SVM results look off — re-check the selected
# variables and the model parameter settings.
Lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=200)
xgb = XGBClassifier(n_jobs=2)
# Hyper-parameter tuning: every model gets a 5-fold grid search
# optimizing the F1 score.
def _make_search(estimator, grid):
    # All searches share the same CV protocol and scoring metric.
    return GridSearchCV(estimator=estimator, param_grid=grid, cv=5, scoring='f1')

# Regularization strengths: 20 points log-spaced over [1e-3, 1].
C = np.logspace(-3, 0, 20, base=10)

lr_param_grid = {'C': C, 'penalty': ['l1', 'l2']}
Lr_cv = _make_search(Lr, lr_param_grid)

svc_param_grid = {'C': C}
svc_cv = _make_search(svc, svc_param_grid)

dt_param_grid = {'min_samples_leaf': range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth': range(2, 5)}
dt_cv = _make_search(dt, dt_param_grid)

rf_param_grid = {'min_samples_leaf': range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth': range(2, 5)}
rf_cv = _make_search(rf, rf_param_grid)

xgb_param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5], 'n_estimators': [100, 200, 300, 500]}
xgb_cv = _make_search(xgb, xgb_param_grid)

model_dict = {"逻辑回归": Lr_cv, "SVM": svc_cv, "决策树": dt_cv, "随机森林": rf_cv, "XGBoost": xgb_cv}
# Train each tuned model and print its evaluation metrics and ROC curve.
results = pd.DataFrame()  # NOTE(review): never populated in this file; kept for compatibility

def model_est(model_dict, x_train, x_test, y_train, y_test):
    """Fit every model in ``model_dict`` and report train/test metrics.

    Parameters
    ----------
    model_dict : dict
        Mapping of display name -> estimator (here GridSearchCV objects).
    x_train, x_test : feature matrices for the two splits.
    y_train, y_test : target labels for the two splits.

    Prints accuracy, precision, recall, F1 and AUC for both splits,
    then shows an ROC plot per model.

    Bug fix: sklearn metric functions take ``(y_true, y_pred)``; the
    original passed the predictions first, which silently swapped the
    meaning of precision and recall and distorted the reported AUC.
    """
    for name, model in model_dict.items():
        model_train = model.fit(x_train, y_train)
        y_pred_train = model_train.predict(x_train)
        y_pred_test = model_train.predict(x_test)

        # Training-set metrics (ground truth first, predictions second).
        acc_score_train = metrics.accuracy_score(y_train, y_pred_train)
        precision_score_train = metrics.precision_score(y_train, y_pred_train)
        recall_score_train = metrics.recall_score(y_train, y_pred_train)
        f1_score_train = metrics.f1_score(y_train, y_pred_train)
        roc_auc_score_train = metrics.roc_auc_score(y_train, y_pred_train)

        # Test-set metrics.
        acc_score_test = metrics.accuracy_score(y_test, y_pred_test)
        precision_score_test = metrics.precision_score(y_test, y_pred_test)
        recall_score_test = metrics.recall_score(y_test, y_pred_test)
        f1_score_test = metrics.f1_score(y_test, y_pred_test)
        roc_auc_score_test = metrics.roc_auc_score(y_test, y_pred_test)

        print('{} 训练集准确率:{}'.format(name, acc_score_train))
        print('{} 测试集准确率:{}\n'.format(name, acc_score_test))

        print('{} 训练集精确率:{}'.format(name, precision_score_train))
        print('{} 测试集精确率:{}\n'.format(name, precision_score_test))

        print('{} 训练集召回率:{}'.format(name, recall_score_train))
        print('{} 测试集召回率:{}\n'.format(name, recall_score_test))

        print('{} 训练集f1评分:{}'.format(name, f1_score_train))
        print('{} 测试集f1评分:{}\n'.format(name, f1_score_test))

        print('{} 训练集AUC值:{}'.format(name, roc_auc_score_train))
        print('{} 测试集AUC值:{}\n'.format(name, roc_auc_score_test))

        # ROC curves. NOTE(review): built from hard 0/1 predictions, so
        # each "curve" has a single operating point; pass probability
        # scores (predict_proba / decision_function) for a full curve.
        fpr, tpr, th = metrics.roc_curve(y_train, y_pred_train)
        fpr_t, tpr_t, th_t = metrics.roc_curve(y_test, y_pred_test)
        plt.figure(figsize=[10, 8])
        plt.plot(fpr, tpr, 'b--')
        plt.plot(fpr_t, tpr_t, 'r--')
        plt.title(label='{} ROC curve'.format(name))
        plt.xlabel("fpr", fontsize=13)
        plt.ylabel("tpr", fontsize=13)
        plt.show()

model_est(model_dict, x_train, x_test, y_train, y_test)
逻辑回归 训练集准确率:0.7938082356477307
逻辑回归 测试集准确率:0.7778556412053259

逻辑回归 训练集精确率:0.31894484412470026
逻辑回归 测试集精确率:0.27019498607242337

逻辑回归 训练集召回率:0.6927083333333334
逻辑回归 测试集召回率:0.6381578947368421

逻辑回归 训练集f1评分:0.4367816091954024
逻辑回归 测试集f1评分:0.37964774951076313

逻辑回归 训练集AUC值:0.7498539967720014
逻辑回归 测试集AUC值:0.7163338493292053

(图:逻辑回归 ROC 曲线)

SVM 训练集准确率:0.7682596934174932
SVM 测试集准确率:0.7659425367904695

SVM 训练集精确率:0.08752997601918465
SVM 测试集精确率:0.08356545961002786

SVM 训练集召回率:0.8795180722891566
SVM 测试集召回率:0.8571428571428571

SVM 训练集f1评分:0.15921483097055614
SVM 测试集f1评分:0.15228426395939088

SVM 训练集AUC值:0.8224655712863784
SVM 测试集AUC值:0.8103961412151068

(图:SVM ROC 曲线)

决策树 训练集准确率:0.7992185151788398
决策树 测试集准确率:0.7631394533987386

决策树 训练集精确率:0.2973621103117506
决策树 测试集精确率:0.23676880222841226

决策树 训练集召回率:0.7515151515151515
决策树 测试集召回率:0.5704697986577181

决策树 训练集f1评分:0.42611683848797255
决策树 测试集f1评分:0.3346456692913386

决策树 训练集AUC值:0.7779931446598113
决策树 测试集AUC值:0.6780361512850406

(图:决策树 ROC 曲线)

随机森林 训练集准确率:0.8028253681995792
随机森林 测试集准确率:0.7778556412053259

随机森林 训练集精确率:0.27817745803357313
随机森林 测试集精确率:0.22284122562674094

随机森林 训练集召回率:0.8111888111888111
随机森林 测试集召回率:0.6779661016949152

随机森林 训练集f1评分:0.4142857142857143
随机森林 测试集f1评分:0.33542976939203356

随机森林 训练集AUC值:0.8066138071070659
随机森林 测试集AUC值:0.7324131501599099

(图:随机森林 ROC 曲线)

XGBoost 训练集准确率:0.8557258791704238
XGBoost 测试集准确率:0.7876664330763841

XGBoost 训练集精确率:0.511990407673861
XGBoost 测试集精确率:0.3593314763231198

XGBoost 训练集召回率:0.854
XGBoost 测试集召回率:0.6386138613861386

XGBoost 训练集f1评分:0.6401799100449775
XGBoost 测试集f1评分:0.45989304812834225

XGBoost 训练集AUC值:0.8550155642023348
XGBoost 测试集AUC值:0.7254293796726612

(图:XGBoost ROC 曲线)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值