import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cal import cal_woe_iv, feature_selection, vif_cal, replace_woe
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline
# Raise pandas' display limits to 200 rows and 200 columns for this session.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
这里 `cal` 模块中的函数(`cal_woe_iv`、`feature_selection`、`vif_cal`、`replace_woe`)是参考以下文章后自己实现的,代码写得不够整洁,暂时先不贴出来。
https://cloud.tencent.com/developer/article/1092198
https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
https://www.jianshu.com/p/b1b1344bd99f
1.特征选取
# Load the CSV into a DataFrame (presumably the output of an earlier
# cleaning step, judging by the file name -- confirm against that notebook).
df = pd.read_csv(filepath_or_buffer='../data/rough_clean_data.csv')
# Print the column list, non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238636 entries, 0 to 238635
Data columns (total 55 columns):
emp_length 238636 non-null object
home_ownership 238636 non-null object
annual_inc 238636 non-null float64
verification_status 238636 non-null object
purpose 238636 non-null object
dti 238636 non-null float64
inq_last_6mths 238636 non-null float64
open_acc 238636 non-null float64
revol_bal 238636 non-null float64
revol_util 238636 non-null float64
total_acc 238636 non-null float64
tot_cur_bal 238636 non-null float64
open_acc_6m 238636 non-null float64
open_act_il 238636 non-null float64
open_il_12m 238636 non-null float64
open_il_24m 238636 non-null float64
mths_since_rcnt_il 238636 non-null float64
total_bal_il 238636 non-null float64
il_util 238636 non-null float64
open_rv_12m 238636 non-null float64
open_rv_24m 238636 non-null float64
max_bal_bc 238636 non-null float64
all_util 238636 non-null float64
total_rev_hi_lim 238636 non-null float64
inq_fi 238636 non-null float64
total_cu_tl 238636 non-null float64
inq_last_12m 238636 non-null float64
acc_open_past_24mths 238636 non-null float64
avg_cur_bal 238636 non-null float64
bc_open_to_buy 238636 non-null float64
bc_util 238636 non-null float64
mo_sin_old_il_acct 238636 non-null float64
mo_sin_old_rev_tl_op 238636 non-null float64
mo_sin_rcnt_rev_tl_op 238636 non-null float64
mo_sin_rcnt_tl 238636 non-null float64
mort_acc 238636 non-null float64
mths_since_recent_bc 238636 non-null float64
mths_since_recent_inq 238636 non-null float64
num_actv_bc_tl 238636 non-null float64
num_actv_rev_tl 238636 non-null float64
num_bc_sats 238636 non-null float64
num_bc_tl 238636 non-null float64
num_il_tl 238636 non-null float64
num_op_rev_tl 238636 non-null float64
num_rev_accts 238636 non-null float64
num_rev_tl_bal_gt_0 238636 non-null float64
num_sats 238636 non-null float64
num_tl_op_past_12m 238636 non-null float64
pct_tl_nvr_dlq 238636 non-null float64
percent_bc_gt_75 238636 non-null float64
tot_hi_cred_lim 238636 non-null float64
total_bal_ex_mort 238636 non-null float64
total_bc_limit 238636 non-null float64
total_il_high_credit_limit 238636 non-null float64
class 238636 non-null int64
dtypes: float64(50), int64(1), object(4)
memory usage: 100.1+ MB
可以看到经过前一篇数据处理之后,变量数量从145个