数据挖掘(二):特征工程

# 导入包
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from datetime import datetime
from dateutil.parser import parse
from woe import WoE 

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
%matplotlib inline
# 导入数据
org_data = pd.read_csv("data.csv", encoding = 'gbk')
print(org_data.shape)
(4754, 90)

1、删除无关变量

# 查看前几行数据
org_data.head()
Unnamed: 0custidtrade_nobank_card_nolow_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_day...loans_max_limitloans_avg_limitconsfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_day
05279185820180507115231274000000023057383卡号10.010.9900.900.550.313...2900.01688.01200.075.01.02.01200.01200.012.018.0
11053404720180507121002192000000023073000卡号10.020.9420001.281.000.458...3500.01758.015100.080.05.06.022800.09360.04.02.0
212284978720180507125159718000000023114911卡号10.040.9601.001.000.114...1600.01250.04200.087.01.01.04200.04200.02.06.0
313180970820180507121358683000000388283484卡号10.000.9620000.130.570.777...3200.01541.016300.080.05.05.030000.012180.02.04.0
414249982920180507115448545000000388205844卡号10.010.9900.461.000.175...2300.01630.08300.079.02.02.08400.08250.022.0120.0

5 rows × 90 columns

# 去掉第一列和名字
org_data = org_data.drop(['Unnamed: 0', 'id_name'], axis=1)
# org_data.dtypes
org_data_count = pd.DataFrame(org_data.nunique())
# org_data_count[org_data_count == 0 | org_data_count == len(org_data)]
# 找出唯一值的变量名称
org_data_count[org_data_count.ix[:, 0] == 1].index

Index([‘bank_card_no’, ‘source’], dtype=‘object’)

# 找出类似ID的变量名称
org_data_count[org_data_count.ix[:, 0] == len(org_data)].index
Index(['custid', 'trade_no'], dtype='object')
org_data = org_data.drop(['bank_card_no', 'source', 'custid', 'trade_no'], axis = 1)
print(org_data.shape)
(4754, 84)

2、数据类型分析

# dtypes: float64(70), int64(11), object(4)
org_data.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 84 columns):
low_volume_percent                            4752 non-null float64
middle_volume_percent                         4752 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4751 non-null float64
trans_activity_month                          4752 non-null float64
trans_activity_day                            4752 non-null float64
transd_mcc                                    4752 non-null float64
trans_days_interval_filter                    4746 non-null float64
trans_days_interval                           4752 non-null float64
regional_mobility                             4752 non-null float64
student_feature                               1756 non-null float64
repayment_capability                          4754 non-null int64
is_high_user                                  4754 non-null int64
number_of_trans_from_2011                     4752 non-null float64
first_transaction_time                        4752 non-null float64
historical_trans_amount                       4754 non-null int64
historical_trans_day                          4752 non-null float64
rank_trad_1_month                             4752 non-null float64
trans_amount_3_month                          4754 non-null int64
avg_consume_less_12_valid_month               4752 non-null float64
abs                                           4754 non-null int64
top_trans_count_last_1_month                  4752 non-null float64
avg_price_last_12_month                       4754 non-null int64
avg_price_top_last_12_valid_month             4650 non-null float64
reg_preference_for_trad                       4752 non-null object
trans_top_time_last_1_month                   4746 non-null float64
trans_top_time_last_6_month                   4746 non-null float64
consume_top_time_last_1_month                 4746 non-null float64
consume_top_time_last_6_month                 4746 non-null float64
cross_consume_count_last_1_month              4328 non-null float64
trans_fail_top_count_enum_last_1_month        4738 non-null float64
trans_fail_top_count_enum_last_6_month        4738 non-null float64
trans_fail_top_count_enum_last_12_month       4738 non-null float64
consume_mini_time_last_1_month                4728 non-null float64
max_cumulative_consume_later_1_month          4754 non-null int64
max_consume_count_later_6_month               4746 non-null float64
railway_consume_count_last_12_month           4742 non-null float64
pawns_auctions_trusts_consume_last_1_month    4754 non-null int64
pawns_auctions_trusts_consume_last_6_month    4754 non-null int64
jewelry_consume_count_last_6_month            4742 non-null float64
status                                        4754 non-null int64
first_transaction_day                         4752 non-null float64
trans_day_last_12_month                       4752 non-null float64
apply_score                                   4450 non-null float64
apply_credibility                             4450 non-null float64
query_org_count                               4450 non-null float64
query_finance_count                           4450 non-null float64
query_cash_count                              4450 non-null float64
query_sum_count                               4450 non-null float64
latest_query_time                             4450 non-null object
latest_one_month_apply                        4450 non-null float64
latest_three_month_apply                      4450 non-null float64
latest_six_month_apply                        4450 non-null float64
loans_score                                   4457 non-null float64
loans_credibility_behavior                    4457 non-null float64
loans_count                                   4457 non-null float64
loans_settle_count                            4457 non-null float64
loans_overdue_count                           4457 non-null float64
loans_org_count_behavior                      4457 non-null float64
consfin_org_count_behavior                    4457 non-null float64
loans_cash_count                              4457 non-null float64
latest_one_month_loan                         4457 non-null float64
latest_three_month_loan                       4457 non-null float64
latest_six_month_loan                         4457 non-null float64
history_suc_fee                               4457 non-null float64
history_fail_fee                              4457 non-null float64
latest_one_month_suc                          4457 non-null float64
latest_one_month_fail                         4457 non-null float64
loans_long_time                               4457 non-null float64
loans_latest_time                             4457 non-null object
loans_credit_limit                            4457 non-null float64
loans_credibility_limit                       4457 non-null float64
loans_org_count_current                       4457 non-null float64
loans_product_count                           4457 non-null float64
loans_max_limit                               4457 non-null float64
loans_avg_limit                               4457 non-null float64
consfin_credit_limit                          4457 non-null float64
consfin_credibility                           4457 non-null float64
consfin_org_count_current                     4457 non-null float64
consfin_product_count                         4457 non-null float64
consfin_max_limit                             4457 non-null float64
consfin_avg_limit                             4457 non-null float64
latest_query_day                              4450 non-null float64
loans_latest_day                              4457 non-null float64
dtypes: float64(70), int64(11), object(3)
memory usage: 3.9 MB
var_char = org_data.dtypes[org_data.dtypes == 'object'].index
print(len(var_char))
print(var_char)
3
Index(['reg_preference_for_trad', 'latest_query_time', 'loans_latest_time'], dtype='object')
var_y = ['status']
var_num = org_data.dtypes[org_data.dtypes != 'object'].index
var_num = list(set(var_num) - set(var_y))
print(var_y)
print(len(var_num))
print(var_num)
['status']
80
['cross_consume_count_last_1_month', 'max_cumulative_consume_later_1_month', 'trans_day_last_12_month', 'query_org_count', 'number_of_trans_from_2011', 'trans_amount_3_month', 'trans_fail_top_count_enum_last_1_month', 'loans_score', 'loans_cash_count', 'pawns_auctions_trusts_consume_last_6_month', 'apply_credibility', 'loans_credit_limit', 'max_consume_count_later_6_month', 'rank_trad_1_month', 'latest_three_month_apply', 'jewelry_consume_count_last_6_month', 'loans_avg_limit', 'consume_mini_time_last_1_month', 'loans_credibility_behavior', 'first_transaction_day', 'is_high_user', 'loans_credibility_limit', 'transd_mcc', 'trans_fail_top_count_enum_last_6_month', 'latest_six_month_loan', 'loans_settle_count', 'loans_max_limit', 'consfin_org_count_behavior', 'regional_mobility', 'historical_trans_amount', 'top_trans_count_last_1_month', 'student_feature', 'avg_price_last_12_month', 'latest_six_month_apply', 'first_transaction_time', 'trans_days_interval_filter', 'consfin_credit_limit', 'history_suc_fee', 'consfin_product_count', 'trans_amount_increase_rate_lately', 'trans_activity_month', 'latest_one_month_loan', 'trans_top_time_last_1_month', 'loans_org_count_current', 'repayment_capability', 'apply_score', 'trans_days_interval', 'query_sum_count', 'consume_top_time_last_6_month', 'latest_one_month_apply', 'query_cash_count', 'latest_one_month_suc', 'loans_product_count', 'consfin_org_count_current', 'trans_top_time_last_6_month', 'loans_org_count_behavior', 'history_fail_fee', 'loans_latest_day', 'take_amount_in_later_12_month_highest', 'middle_volume_percent', 'loans_count', 'consfin_avg_limit', 'trans_fail_top_count_enum_last_12_month', 'latest_one_month_fail', 'low_volume_percent', 'trans_activity_day', 'historical_trans_day', 'loans_overdue_count', 'abs', 'consume_top_time_last_1_month', 'latest_query_day', 'railway_consume_count_last_12_month', 'avg_consume_less_12_valid_month', 'pawns_auctions_trusts_consume_last_1_month', 'query_finance_count', 'consfin_credibility', 'consfin_max_limit', 'avg_price_top_last_12_valid_month', 'loans_long_time', 'latest_three_month_loan']

3、缺失值处理

# 缺失占比统计:70个变量都有缺失!
missing_rate_t = 1 - org_data.describe().T['count'] / len(org_data)
missing_rate = missing_rate_t[missing_rate_t > 0].sort_values(ascending = False)
print("缺失变量个数:%s"%len(missing_rate))
缺失变量个数:70
missing_rate_50 = missing_rate_t[missing_rate_t > 0.5].sort_values(ascending = False)
print(missing_rate_50)
student_feature    0.630627
Name: count, dtype: float64
org_data1 = org_data.drop(missing_rate_50.index, axis=1)
var_num = list(set(var_num)- set(['student_feature']))
#打印缺失占比
missing_rate_t1 = 1 - org_data1.describe().T['count'] / len(org_data)
missing_rate1 = missing_rate_t1[missing_rate_t1 > 0].sort_values(ascending = False)
print(missing_rate1)
cross_consume_count_last_1_month           0.089609
query_cash_count                           0.063946
latest_query_day                           0.063946
latest_six_month_apply                     0.063946
latest_one_month_apply                     0.063946
query_sum_count                            0.063946
latest_three_month_apply                   0.063946
query_finance_count                        0.063946
query_org_count                            0.063946
apply_credibility                          0.063946
apply_score                                0.063946
latest_one_month_loan                      0.062474
loans_score                                0.062474
loans_credibility_behavior                 0.062474
loans_count                                0.062474
loans_settle_count                         0.062474
loans_overdue_count                        0.062474
loans_org_count_behavior                   0.062474
consfin_org_count_behavior                 0.062474
loans_cash_count                           0.062474
loans_latest_day                           0.062474
latest_three_month_loan                    0.062474
loans_product_count                        0.062474
consfin_avg_limit                          0.062474
consfin_max_limit                          0.062474
consfin_product_count                      0.062474
consfin_org_count_current                  0.062474
consfin_credibility                        0.062474
consfin_credit_limit                       0.062474
loans_avg_limit                            0.062474
                                             ...   
latest_six_month_loan                      0.062474
avg_price_top_last_12_valid_month          0.021876
consume_mini_time_last_1_month             0.005469
trans_fail_top_count_enum_last_1_month     0.003366
trans_fail_top_count_enum_last_6_month     0.003366
trans_fail_top_count_enum_last_12_month    0.003366
jewelry_consume_count_last_6_month         0.002524
railway_consume_count_last_12_month        0.002524
max_consume_count_later_6_month            0.001683
consume_top_time_last_6_month              0.001683
consume_top_time_last_1_month              0.001683
trans_top_time_last_6_month                0.001683
trans_top_time_last_1_month                0.001683
trans_days_interval_filter                 0.001683
trans_amount_increase_rate_lately          0.000631
regional_mobility                          0.000421
middle_volume_percent                      0.000421
trans_activity_month                       0.000421
trans_activity_day                         0.000421
transd_mcc                                 0.000421
trans_days_interval                        0.000421
historical_trans_day                       0.000421
number_of_trans_from_2011                  0.000421
first_transaction_time                     0.000421
rank_trad_1_month                          0.000421
avg_consume_less_12_valid_month            0.000421
top_trans_count_last_1_month               0.000421
trans_day_last_12_month                    0.000421
first_transaction_day                      0.000421
low_volume_percent                         0.000421
Name: count, Length: 69, dtype: float64
# 因没有变量说明,通过浏览原数据发现,first_transaction_time为年月日,且存在大量疑似分类变量(变量值为整数、水平数较少),
# 这里将水平数小于50个的变量调整为分类变量
org_data1['today_date'] = '2019-05-13'
org_data1.head()
org_data1.dtypes
low_volume_percent                        float64
middle_volume_percent                     float64
take_amount_in_later_12_month_highest       int64
trans_amount_increase_rate_lately         float64
trans_activity_month                      float64
trans_activity_day                        float64
transd_mcc                                float64
trans_days_interval_filter                float64
trans_days_interval                       float64
regional_mobility                         float64
repayment_capability                        int64
is_high_user                                int64
number_of_trans_from_2011                 float64
first_transaction_time                    float64
historical_trans_amount                     int64
historical_trans_day                      float64
rank_trad_1_month                         float64
trans_amount_3_month                        int64
avg_consume_less_12_valid_month           float64
abs                                         int64
top_trans_count_last_1_month              float64
avg_price_last_12_month                     int64
avg_price_top_last_12_valid_month         float64
reg_preference_for_trad                    object
trans_top_time_last_1_month               float64
trans_top_time_last_6_month               float64
consume_top_time_last_1_month             float64
consume_top_time_last_6_month             float64
cross_consume_count_last_1_month          float64
trans_fail_top_count_enum_last_1_month    float64
                                           ...   
loans_count                               float64
loans_settle_count                        float64
loans_overdue_count                       float64
loans_org_count_behavior                  float64
consfin_org_count_behavior                float64
loans_cash_count                          float64
latest_one_month_loan                     float64
latest_three_month_loan                   float64
latest_six_month_loan                     float64
history_suc_fee                           float64
history_fail_fee                          float64
latest_one_month_suc                      float64
latest_one_month_fail                     float64
loans_long_time                           float64
loans_latest_time                          object
loans_credit_limit                        float64
loans_credibility_limit                   float64
loans_org_count_current                   float64
loans_product_count                       float64
loans_max_limit                           float64
loans_avg_limit                           float64
consfin_credit_limit                      float64
consfin_credibility                       float64
consfin_org_count_current                 float64
consfin_product_count                     float64
consfin_max_limit                         float64
consfin_avg_limit                         float64
latest_query_day                          float64
loans_latest_day                          float64
today_date                                 object
Length: 84, dtype: object
org_data1['latest_query_time_days']=(pd.to_datetime(org_data1['today_date']) - 
                                     pd.to_datetime(org_data1['latest_query_time']))/np.timedelta64(1*60*60*24, 's')

org_data1['loans_latest_time_days']=(pd.to_datetime(org_data1['today_date']) - 
                                     pd.to_datetime(org_data1['loans_latest_time']))/np.timedelta64(1*60*60*24, 's')

org_data1.head()
# org_data1.dtypes
low_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filtertrans_days_intervalregional_mobility...consfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_daytoday_datelatest_query_time_daysloans_latest_time_days
00.010.9900.900.550.31317.027.026.03.0...75.01.02.01200.01200.012.018.02019-05-13383.0389.0
10.020.9420001.281.000.45819.030.014.04.0...80.05.06.022800.09360.04.02.02019-05-13375.0373.0
20.040.9601.001.000.11413.068.022.01.0...87.01.01.04200.04200.02.06.02019-05-13373.0377.0
30.000.9620000.130.570.77722.014.06.03.0...80.05.05.030000.012180.02.04.02019-05-13373.0375.0
40.010.9900.461.000.17513.066.042.01.0...79.02.02.08400.08250.022.0120.02019-05-13393.0491.0

5 rows × 86 columns

#删掉变量latest_query_time和loans_latest_time
org_data1 = org_data1.drop(['latest_query_time', 'loans_latest_time', 'today_date', 'first_transaction_time'], axis = 1)
# org_data1 = org_data1.drop(['first_transaction_time', 'today_date'], axis = 1)
org_data1.head()
low_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filtertrans_days_intervalregional_mobility...consfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_daylatest_query_time_daysloans_latest_time_days
00.010.9900.900.550.31317.027.026.03.0...1200.075.01.02.01200.01200.012.018.0383.0389.0
10.020.9420001.281.000.45819.030.014.04.0...15100.080.05.06.022800.09360.04.02.0375.0373.0
20.040.9601.001.000.11413.068.022.01.0...4200.087.01.01.04200.04200.02.06.0373.0377.0
30.000.9620000.130.570.77722.014.06.03.0...16300.080.05.05.030000.012180.02.04.0373.0375.0
40.010.9900.461.000.17513.066.042.01.0...8300.079.02.02.08400.08250.022.0120.0393.0491.0

5 rows × 82 columns

org_data1.nunique()
low_volume_percent                          40
middle_volume_percent                       90
take_amount_in_later_12_month_highest      166
trans_amount_increase_rate_lately          782
trans_activity_month                        84
trans_activity_day                         512
transd_mcc                                  41
trans_days_interval_filter                 147
trans_days_interval                        114
regional_mobility                            5
repayment_capability                      2390
is_high_user                                 2
number_of_trans_from_2011                   70
historical_trans_amount                   4524
historical_trans_day                       476
rank_trad_1_month                           20
trans_amount_3_month                      3524
avg_consume_less_12_valid_month             12
abs                                       1697
top_trans_count_last_1_month                 8
avg_price_last_12_month                    330
avg_price_top_last_12_valid_month           20
reg_preference_for_trad                      5
trans_top_time_last_1_month                 28
trans_top_time_last_6_month                 97
consume_top_time_last_1_month               28
consume_top_time_last_6_month               94
cross_consume_count_last_1_month            19
trans_fail_top_count_enum_last_1_month      15
trans_fail_top_count_enum_last_6_month      25
                                          ... 
loans_count                                134
loans_settle_count                         123
loans_overdue_count                         26
loans_org_count_behavior                    41
consfin_org_count_behavior                  19
loans_cash_count                            32
latest_one_month_loan                       14
latest_three_month_loan                     31
latest_six_month_loan                       67
history_suc_fee                            171
history_fail_fee                           151
latest_one_month_suc                        19
latest_one_month_fail                       41
loans_long_time                            202
loans_credit_limit                          54
loans_credibility_limit                     33
loans_org_count_current                     32
loans_product_count                         32
loans_max_limit                             91
loans_avg_limit                            961
consfin_credit_limit                       327
consfin_credibility                         24
consfin_org_count_current                   19
consfin_product_count                       20
consfin_max_limit                          175
consfin_avg_limit                         1677
latest_query_day                           210
loans_latest_day                           235
latest_query_time_days                     207
loans_latest_time_days                     232
Length: 82, dtype: int64
var_char = org_data1.dtypes[org_data1.dtypes == 'object'].index
print(len(var_char))
print(var_char)
1
Index(['reg_preference_for_trad'], dtype='object')
# 
org_data1_count = pd.DataFrame(org_data1.nunique())
var_char1 = org_data1_count[org_data1_count.iloc[:,0] <= 20].index
var_char1 = list(set(var_char1) - set(var_y))
var_char = list(var_char) + list(var_char1)
var_char
['reg_preference_for_trad',
 'is_high_user',
 'consfin_product_count',
 'cross_consume_count_last_1_month',
 'latest_one_month_loan',
 'trans_fail_top_count_enum_last_1_month',
 'reg_preference_for_trad',
 'consfin_org_count_behavior',
 'regional_mobility',
 'query_cash_count',
 'latest_one_month_suc',
 'top_trans_count_last_1_month',
 'railway_consume_count_last_12_month',
 'avg_consume_less_12_valid_month',
 'consfin_org_count_current',
 'rank_trad_1_month',
 'avg_price_top_last_12_valid_month',
 'jewelry_consume_count_last_6_month']
print(len(var_char))
print(var_char)
18
['reg_preference_for_trad', 'is_high_user', 'consfin_product_count', 'cross_consume_count_last_1_month', 'latest_one_month_loan', 'trans_fail_top_count_enum_last_1_month', 'reg_preference_for_trad', 'consfin_org_count_behavior', 'regional_mobility', 'query_cash_count', 'latest_one_month_suc', 'top_trans_count_last_1_month', 'railway_consume_count_last_12_month', 'avg_consume_less_12_valid_month', 'consfin_org_count_current', 'rank_trad_1_month', 'avg_price_top_last_12_valid_month', 'jewelry_consume_count_last_6_month']
type(var_char)
list
# 重新生成数值变量
var_num = org_data1.dtypes[org_data1.dtypes != 'object'].index
var_num = list(set(var_num) - set(var_y))
var_num = list(set(var_num) - set(var_char))
print(var_y)
print(len(var_num))
print(var_num)
['status']
64
['loans_latest_time_days', 'max_cumulative_consume_later_1_month', 'trans_day_last_12_month', 'query_org_count', 'number_of_trans_from_2011', 'trans_amount_3_month', 'loans_score', 'loans_cash_count', 'pawns_auctions_trusts_consume_last_6_month', 'apply_credibility', 'loans_credit_limit', 'max_consume_count_later_6_month', 'latest_three_month_apply', 'loans_avg_limit', 'consume_mini_time_last_1_month', 'loans_credibility_behavior', 'first_transaction_day', 'loans_credibility_limit', 'transd_mcc', 'trans_fail_top_count_enum_last_6_month', 'latest_six_month_loan', 'loans_settle_count', 'loans_max_limit', 'historical_trans_amount', 'avg_price_last_12_month', 'latest_six_month_apply', 'trans_days_interval_filter', 'consfin_credit_limit', 'history_suc_fee', 'trans_amount_increase_rate_lately', 'trans_activity_month', 'trans_top_time_last_1_month', 'loans_org_count_current', 'repayment_capability', 'apply_score', 'trans_days_interval', 'query_sum_count', 'consume_top_time_last_6_month', 'latest_one_month_apply', 'loans_product_count', 'trans_top_time_last_6_month', 'loans_org_count_behavior', 'history_fail_fee', 'loans_latest_day', 'take_amount_in_later_12_month_highest', 'middle_volume_percent', 'loans_count', 'consfin_avg_limit', 'trans_fail_top_count_enum_last_12_month', 'latest_one_month_fail', 'low_volume_percent', 'trans_activity_day', 'historical_trans_day', 'latest_query_time_days', 'loans_overdue_count', 'abs', 'consume_top_time_last_1_month', 'latest_query_day', 'pawns_auctions_trusts_consume_last_1_month', 'query_finance_count', 'consfin_credibility', 'consfin_max_limit', 'loans_long_time', 'latest_three_month_loan']
# 对数值缺失变量采用均值替换,对分类变量采用众数替换
for i in var_num:
    org_data1[i].fillna(org_data1[i].mean(),inplace = True)
    
for i in var_char:
    org_data1[i].fillna(org_data1[i].mode()[0],inplace = True)

# 再次打印缺失占比
missing_rate_t1 = 1 - org_data1.describe().T['count'] / len(org_data)
missing_rate1 = missing_rate_t1[missing_rate_t1 > 0].sort_values(ascending = False)
print(missing_rate1)
Series([], Name: count, dtype: float64)

4、变量筛选

# 连续变量——相关系数,将相关系数较低的变量删除
org_data1_corr = abs(org_data1.corr()['status'])
org_data1_corr_re = org_data1_corr[org_data1_corr.values < 0.02].index
print(org_data1_corr_re)
Index(['take_amount_in_later_12_month_highest',
       'trans_amount_increase_rate_lately', 'trans_activity_month',
       'transd_mcc', 'trans_days_interval_filter', 'trans_days_interval',
       'repayment_capability', 'abs', 'cross_consume_count_last_1_month',
       'max_consume_count_later_6_month', 'jewelry_consume_count_last_6_month',
       'query_finance_count', 'latest_three_month_apply',
       'latest_six_month_apply', 'loans_credibility_behavior',
       'loans_long_time', 'loans_credibility_limit', 'loans_avg_limit'],
      dtype='object')
#数值型变量剩余48个,依然较多
var_num_f =list(set(var_num)- set(org_data1_corr_re))
print(len(var_num_f))
48
org_data1.head()
low_volume_percentmiddle_volume_percenttake_amount_in_later_12_month_highesttrans_amount_increase_rate_latelytrans_activity_monthtrans_activity_daytransd_mcctrans_days_interval_filtertrans_days_intervalregional_mobility...consfin_credit_limitconsfin_credibilityconsfin_org_count_currentconsfin_product_countconsfin_max_limitconsfin_avg_limitlatest_query_dayloans_latest_daylatest_query_time_daysloans_latest_time_days
00.010.9900.900.550.31317.027.026.03.0...1200.075.01.02.01200.01200.012.018.0383.0389.0
10.020.9420001.281.000.45819.030.014.04.0...15100.080.05.06.022800.09360.04.02.0375.0373.0
20.040.9601.001.000.11413.068.022.01.0...4200.087.01.01.04200.04200.02.06.0373.0377.0
30.000.9620000.130.570.77722.014.06.03.0...16300.080.05.05.030000.012180.02.04.0373.0375.0
40.010.9900.461.000.17513.066.042.01.0...8300.079.02.02.08400.08250.022.0120.0393.0491.0

5 rows × 82 columns

org_data1.groupby(['reg_preference_for_trad'])['reg_preference_for_trad'].count()
reg_preference_for_trad
一线城市    3405
三线城市    1064
二线城市     131
其他城市       4
境外       150
Name: reg_preference_for_trad, dtype: int64
var_char1 = org_data1.dtypes[org_data1.dtypes == 'object'].index
print(len(var_char1))
print(var_char1)
1
Index(['reg_preference_for_trad'], dtype='object')
# 对reg_preference_for_trad进行编码
city_code = {'一线城市':1, '二线城市':2, '三线城市':3, '其他城市':4, '境外':5}
org_data1['city_code'] = org_data1.reg_preference_for_trad.map(city_code)
org_data1[['city_code']].head()
org_data1 = org_data1.drop(['reg_preference_for_trad'], axis = 1)
# 重新生成分类变量列表
var_char = list(set(var_char)- set(['reg_preference_for_trad']))
var_char += list(['city_code'])
print(len(var_char))
print(var_char)
17
['is_high_user', 'consfin_product_count', 'cross_consume_count_last_1_month', 'latest_one_month_loan', 'trans_fail_top_count_enum_last_1_month', 'consfin_org_count_behavior', 'regional_mobility', 'query_cash_count', 'latest_one_month_suc', 'top_trans_count_last_1_month', 'railway_consume_count_last_12_month', 'avg_consume_less_12_valid_month', 'consfin_org_count_current', 'rank_trad_1_month', 'avg_price_top_last_12_valid_month', 'jewelry_consume_count_last_6_month', 'city_code']
# 根据iv值筛选变量-分类变量
char_iv_d = {}
for i in var_char:
    char_iv_d[i] = WoE(v_type='d').fit(pd.Series(org_data1[i]), pd.Series(org_data1[var_y[0]])).iv

char_sort_iv_d = pd.Series(char_iv_d).sort_values(ascending = False)
print(char_sort_iv_d)

# 以 2% 作为选取变量的阈值
var_char_s = list(char_sort_iv_d[char_sort_iv_d > 0.02].index)
trans_fail_top_count_enum_last_1_month    0.605834
latest_one_month_suc                      0.144222
rank_trad_1_month                         0.127129
top_trans_count_last_1_month              0.072685
avg_price_top_last_12_valid_month         0.044958
query_cash_count                          0.031850
consfin_product_count                     0.025901
consfin_org_count_current                 0.025889
consfin_org_count_behavior                0.025889
latest_one_month_loan                     0.018087
cross_consume_count_last_1_month          0.016866
avg_consume_less_12_valid_month           0.013734
regional_mobility                         0.007948
railway_consume_count_last_12_month       0.007215
city_code                                 0.006084
is_high_user                              0.005614
jewelry_consume_count_last_6_month        0.003111
dtype: float64
# 初步筛选的分类变量汇总
print("保留的分类变量有:%s个"%len(var_char_s))
print(var_char_s)
保留的分类变量有:9个
['trans_fail_top_count_enum_last_1_month', 'latest_one_month_suc', 'rank_trad_1_month', 'top_trans_count_last_1_month', 'avg_price_top_last_12_valid_month', 'query_cash_count', 'consfin_product_count', 'consfin_org_count_current', 'consfin_org_count_behavior']
# 初步筛选的连续变量汇总
print("保留的连续变量有:%s个"%len(var_num_f))
print(var_num_f)
保留的连续变量有:48个
['history_suc_fee', 'loans_latest_time_days', 'max_cumulative_consume_later_1_month', 'trans_day_last_12_month', 'query_org_count', 'number_of_trans_from_2011', 'trans_amount_3_month', 'trans_top_time_last_1_month', 'loans_org_count_current', 'loans_score', 'loans_cash_count', 'pawns_auctions_trusts_consume_last_6_month', 'apply_score', 'query_sum_count', 'consume_top_time_last_6_month', 'apply_credibility', 'latest_one_month_apply', 'loans_product_count', 'trans_top_time_last_6_month', 'loans_org_count_behavior', 'loans_credit_limit', 'history_fail_fee', 'loans_latest_day', 'middle_volume_percent', 'consume_mini_time_last_1_month', 'first_transaction_day', 'loans_count', 'consfin_avg_limit', 'trans_fail_top_count_enum_last_12_month', 'latest_one_month_fail', 'trans_fail_top_count_enum_last_6_month', 'latest_six_month_loan', 'low_volume_percent', 'loans_settle_count', 'trans_activity_day', 'historical_trans_day', 'latest_query_time_days', 'loans_max_limit', 'loans_overdue_count', 'consume_top_time_last_1_month', 'historical_trans_amount', 'latest_query_day', 'avg_price_last_12_month', 'pawns_auctions_trusts_consume_last_1_month', 'consfin_credibility', 'consfin_max_limit', 'consfin_credit_limit', 'latest_three_month_loan']
# # 数据拆分
y = org_data1[var_y]
x = org_data1[var_char_s + var_num_f]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
# 通过随机森林进一步筛选变量
tf = RandomForestClassifier(criterion='gini')
# tf = RandomForestClassifier(criterion='entropy', n_estimators=3, max_features=0.5, min_samples_split=5)
tf_model = tf.fit(x_train, y_train)
tf_model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
importance_dict = pd.DataFrame(tf_model.feature_importances_, list(x_train.columns))


importance_dict = pd.DataFrame()
importance_dict["features"] = list(x_train.columns)
importance_dict["importance"] = list(tf_model.feature_importances_)
importance_dict=importance_dict.set_index("features",drop=True)
var_sort = importance_dict.sort_values(by="importance",ascending=False)
# var_sort.plot(kind="bar")
print(var_sort)
                                            importance
features                                              
trans_fail_top_count_enum_last_1_month        0.059021
history_fail_fee                              0.048917
loans_score                                   0.048562
apply_score                                   0.028175
loans_overdue_count                           0.027679
first_transaction_day                         0.026301
trans_amount_3_month                          0.024917
latest_one_month_fail                         0.023102
historical_trans_day                          0.022316
historical_trans_amount                       0.021682
trans_day_last_12_month                       0.020914
trans_activity_day                            0.020426
trans_fail_top_count_enum_last_12_month       0.020374
pawns_auctions_trusts_consume_last_6_month    0.020349
latest_query_day                              0.019935
latest_query_time_days                        0.019858
trans_fail_top_count_enum_last_6_month        0.019220
number_of_trans_from_2011                     0.019115
avg_price_last_12_month                       0.018772
consume_top_time_last_1_month                 0.018292
loans_latest_day                              0.018125
rank_trad_1_month                             0.017332
loans_credit_limit                            0.017324
loans_latest_time_days                        0.016844
trans_top_time_last_6_month                   0.016413
history_suc_fee                               0.016376
loans_settle_count                            0.015711
loans_max_limit                               0.015526
max_cumulative_consume_later_1_month          0.014721
consume_mini_time_last_1_month                0.014599
apply_credibility                             0.014518
latest_six_month_loan                         0.014449
consume_top_time_last_6_month                 0.014368
consfin_avg_limit                             0.014316
latest_one_month_apply                        0.014299
consfin_credit_limit                          0.014110
loans_org_count_behavior                      0.014042
query_org_count                               0.013130
query_sum_count                               0.012740
trans_top_time_last_1_month                   0.012465
avg_price_top_last_12_valid_month             0.012196
loans_org_count_current                       0.011805
loans_count                                   0.011471
consfin_max_limit                             0.011209
consfin_credibility                           0.011141
middle_volume_percent                         0.010836
loans_product_count                           0.010680
latest_three_month_loan                       0.010666
top_trans_count_last_1_month                  0.010497
consfin_product_count                         0.009988
low_volume_percent                            0.009410
latest_one_month_suc                          0.008959
consfin_org_count_behavior                    0.008460
consfin_org_count_current                     0.008409
query_cash_count                              0.008399
loans_cash_count                              0.008302
pawns_auctions_trusts_consume_last_1_month    0.008237
# 以 2% 作为选取变量的阈值
var_x = list(var_sort.importance[var_sort.importance > 0.02].index)
var_x
['trans_fail_top_count_enum_last_1_month',
 'history_fail_fee',
 'loans_score',
 'apply_score',
 'loans_overdue_count',
 'first_transaction_day',
 'trans_amount_3_month',
 'latest_one_month_fail',
 'historical_trans_day',
 'historical_trans_amount',
 'trans_day_last_12_month',
 'trans_activity_day',
 'trans_fail_top_count_enum_last_12_month',
 'pawns_auctions_trusts_consume_last_6_month']
# # 数据拆分
# y = org_data1[var_y]
# x = org_data1[var_char_s + var_num_f]
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
# 数据拆分
y = org_data1[var_y]
x = org_data1[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
print(x.shape)
print(y.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(4754, 14)
(4754, 1)
(3327, 14)
(3327, 1)
(1427, 14)
(1427, 1)
# 5、正负样本转换
# 正样本比例25%,占比较低
# org_data.groupby(var_y)[var_y].count()

5、模型训练和评估

tf = RandomForestClassifier(n_estimators=500, min_samples_leaf = 5)
tf_model = tf.fit(x_train, y_train)
tf_model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
# 查看拟合和预测结果
print("拟合效果:")
print(tf_model.score(x_train, y_train))
print(classification_report(y_train, tf_model.predict(x_train)))
print("预测效果:")
print(tf_model.score(x_test, y_test))
print(classification_report(y_test, tf_model.predict(x_test)))
拟合效果:
0.8941989780583108
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      2493
           1       0.96      0.61      0.74       834

   micro avg       0.89      0.89      0.89      3327
   macro avg       0.92      0.80      0.84      3327
weighted avg       0.90      0.89      0.89      3327

预测效果:
0.7806587245970568
              precision    recall  f1-score   support

           0       0.80      0.94      0.87      1068
           1       0.64      0.30      0.41       359

   micro avg       0.78      0.78      0.78      1427
   macro avg       0.72      0.62      0.64      1427
weighted avg       0.76      0.78      0.75      1427
gbc = GradientBoostingClassifier(n_estimators=200)
gbc_model = gbc.fit(x_train, y_train)
gbc_model
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
# 查看拟合和预测结果
print("拟合效果:")
print(gbc_model.score(x_train, y_train))
print(classification_report(y_train, gbc_model.predict(x_train)))
print("预测效果:")
print(gbc_model.score(x_test, y_test))
print(classification_report(y_test, gbc_model.predict(x_test)))
拟合效果:
0.868951006913135
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      2493
           1       0.88      0.55      0.68       834

   micro avg       0.87      0.87      0.87      3327
   macro avg       0.88      0.76      0.80      3327
weighted avg       0.87      0.87      0.86      3327

预测效果:
0.775052557813595
              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1068
           1       0.59      0.35      0.44       359

   micro avg       0.78      0.78      0.78      1427
   macro avg       0.70      0.64      0.65      1427
weighted avg       0.75      0.78      0.75      1427
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值