import pandas as pd
# 忽略弹出的warnings
import warnings
warnings.filterwarnings('ignore')
text=pd.read_excel('data/LoanStats_securev1_2019Q4.xlsx')
text.head()
id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | ... | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 164027473 | 20000 | 20000 | 20000 | 36 months | 0.1240 | 668.12 | B | B4 | NaN | ... | 0 | 2 | 100.0 | 50.0 | 1 | 0 | 60800 | 42566 | 5200 | 40000.0 |
1 | 163984413 | 16500 | 16500 | 16500 | 60 months | 0.1033 | 353.27 | B | B1 | NaN | ... | 0 | 0 | 100.0 | 0.0 | 0 | 0 | 223390 | 40913 | 40500 | 39890.0 |
2 | 164193225 | 7500 | 7500 | 7500 | 36 months | 0.1240 | 250.55 | B | B4 | Rn | ... | 0 | 7 | 54.5 | 16.7 | 0 | 0 | 138468 | 102122 | 47700 | 90768.0 |
3 | 162948736 | 19000 | 19000 | 18975 | 36 months | 0.0646 | 581.99 | A | A1 | Tech Ops Analyst | ... | 0 | 0 | 100.0 | 40.0 | 0 | 0 | 184034 | 28461 | 38400 | 35000.0 |
4 | 164161686 | 10000 | 10000 | 10000 | 36 months | 0.2055 | 374.45 | D | D2 | Planner | ... | 0 | 2 | 100.0 | 16.7 | 0 | 0 | 639373 | 161516 | 24600 | 172818.0 |
5 rows × 114 columns
text['loan_status'].value_counts()
Current 122625
Fully Paid 3539
In Grace Period 1079
Late (31-120 days) 509
Late (16-30 days) 304
Charged Off 80
n 1
Name: loan_status, dtype: int64
#0为已经完成的
def function(x):
if 'Current' in x:
return 0
elif 'Fully Paid' in x:
return 0
else:
return 1
text['loan_status']=text.apply(lambda x:function(x['loan_status']),axis=1)
text['loan_status'].value_counts()
0 126164
1 1973
Name: loan_status, dtype: int64
pos_trainDf = text[text['loan_status'] == 1]
neg_trainDf = text[text['loan_status'] == 0].sample(n=5000, random_state=2018)
text = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0,random_state=2018)
text.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6973 entries, 110105 to 92872
Columns: 114 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(36), int64(50), object(27)
memory usage: 6.1+ MB
缺失值查看
check_null = text.isnull().sum(axis=0).sort_values(ascending=False)/float(len(text)) #查看缺失值比例
print(check_null[check_null >0.2]) # 查看缺失比例大于20%的属性。
desc 0.999857
mths_since_last_record 0.899613
verification_status_joint 0.882977
annual_inc_joint 0.864334
dti_joint 0.864334
mths_since_recent_bc_dlq 0.794206
mths_since_last_major_derog 0.771691
mths_since_recent_revol_delinq 0.704145
mths_since_last_delinq 0.551556
dtype: float64
thresh_count = len(text)*0.4 # 设定阀值
data = text.dropna(thresh=thresh_count, axis=1 ) #若某一列数据缺失的数量超过阀值就会被删除
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6973 entries, 110105 to 92872
Columns: 106 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(30), int64(50), object(25)
memory usage: 5.7+ MB
删除无意义的列
sub_grade:与Grade的信息重复
emp_title :缺失值较多,同时不能反映借款人收入或资产的真实情况
zip_code:地址邮编,邮编显示不全,没有意义
addr_state:申请地址所属州,不能反映借款人的偿债能力
last_credit_pull_d :LendingClub平台最近一个提供贷款的时间,没有意义
policy_code : 变量信息全为1
pymnt_plan 基本是n
title: title与purpose的信息重复,同时title的分类信息更加离散
next_pymnt_d : 下一个付款时间,没有意义
policy_code : 没有意义
collection_recovery_fee: 全为0,没有意义
earliest_cr_line : 记录的是借款人发生第一笔借款的时间
issue_d : 贷款发行时间,这里提前向模型泄露了信息
last_pymnt_d、collection_recovery_fee、last_pymnt_amnt: 预测贷款违约模型是贷款前的风险控制手段,这些贷后信息都会影响我们训练模型的效果,在此将这些信息删除
drop_list = ['sub_grade', 'emp_title', 'title', 'zip_code', 'addr_state',
'mths_since_last_delinq' ,'initial_list_status','title','issue_d','last_pymnt_d','last_pymnt_amnt',
'next_pymnt_d','last_credit_pull_d','policy_code','collection_recovery_fee', 'earliest_cr_line']
data.drop(drop_list, axis=1, inplace = True)
data.head()
id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | emp_length | home_ownership | ... | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
110105 | 160041532 | 5000 | 5000 | 5000 | 36 months | 0.1102 | 163.75 | B | NaN | RENT | ... | 0 | 0 | 66.7 | 100.0 | 0 | 0 | 4900 | 3313 | 2800 | 0.0 |
36272 | 162224912 | 9000 | 9000 | 9000 | 36 months | 0.0819 | 282.82 | A | 10+ years | MORTGAGE | ... | 1 | 1 | 89.7 | 0.0 | 0 | 0 | 443788 | 77239 | 45400 | 84288.0 |
66408 | 161687267 | 15950 | 15950 | 15950 | 36 months | 0.1171 | 527.57 | B | 10+ years | MORTGAGE | ... | 0 | 2 | 100.0 | 0.0 | 0 | 0 | 233421 | 115483 | 62100 | 90541.0 |
121280 | 159844563 | 5600 | 5600 | 5600 | 36 months | 0.1171 | 185.23 | B | 10+ years | RENT | ... | 0 | 7 | 100.0 | 0.0 | 0 | 0 | 64866 | 5486 | 51000 | 4166.0 |
20195 | 163365512 | 28000 | 28000 | 28000 | 60 months | 0.0819 | 570.29 | A | < 1 year | MORTGAGE | ... | 0 | 4 | 100.0 | 10.0 | 0 | 0 | 115641 | 66401 | 33800 | 75041.0 |
5 rows × 91 columns
分类变量
objectColumns = data.select_dtypes(include=["object"]).columns
data[objectColumns].isnull().sum().sort_values(ascending=False)
emp_length 651
application_type 1
url 1
total_acc 0
delinq_2yrs 0
purpose 0
pymnt_plan 0
verification_status 0
annual_inc 0
home_ownership 0
grade 0
term 0
dtype: int64
# data['int_rate'] = data['int_rate'].str.rstrip('%').astype('float')
# data['revol_util'] = data['revol_util'].str.rstrip('%').astype('float')
# data['annual_inc'] = data['annual_inc'].str.replace(",","").astype('float')
import numpy as np
objectColumns = data.select_dtypes(include=["object"]).columns # 筛选数据类型为object的数据
data[objectColumns] = data[objectColumns].fillna("Unknown") #以分类“Unknown”填充缺失值
import missingno as msno
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
%matplotlib inline
msno.bar(data[objectColumns]) #可视化
<matplotlib.axes._subplots.AxesSubplot at 0x238b1e15a58>
mapping_dict = {
"emp_length": {
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
},
"grade":{
"A": 1,
"B": 2,
"C": 3,
"D": 4,
"E": 5,
"F": 6,
"G": 7
}
}
data = data.replace(mapping_dict) #变量映射
数值类型缺失值
data.select_dtypes(include=[np.number]).isnull().sum().sort_values(ascending=False)
il_util 1011
mths_since_recent_inq 773
mo_sin_old_il_acct 234
mths_since_rcnt_il 234
bc_util 116
...
total_cu_tl 0
inq_fi 0
total_rev_hi_lim 0
total_bc_limit 0
id 0
Length: 80, dtype: int64
numColumns = data.select_dtypes(include=[np.number]).columns
msno.matrix(data[numColumns]) #缺失值可视化
<matplotlib.axes._subplots.AxesSubplot at 0x238d13c17f0>
data.isnull().sum().sum()
mean_cols=data.mean()
data= data.fillna(mean_cols)
目标变量
y=data['int_rate']
x=data.drop(['int_rate'],axis=1)
#使用pandas库将类别变量编码
x=pd.get_dummies(x)
特征工程
#数据进行分割(训练数据和测试数据)
from sklearn.model_selection import train_test_split#测试集和训练集
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print ("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (x_train.shape[0], x_test.shape[0]))
y_train = y_train.astype(np.int)
y_test = y_test.astype(np.int)
训练数据集样本数目:5578, 测试数据集样本数目:1395
#标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))
x_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5578 entries, 0 to 5577
Columns: 8267 entries, 0 to 8266
dtypes: float64(8267)
memory usage: 351.8 MB
#降维
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)
[0.00212175 0.00138494]
y_train.shape
(5578,)
神经网络
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()
Using Theano backend.
WARNING (theano.configdefaults): g++ not available, if using conda: `conda install m2w64-toolchain`
D:\sofewore\anaconda\lib\site-packages\theano\configdefaults.py:560: UserWarning: DeprecationWarning: there is no c++ compiler.This is deprecated and with Theano 0.11 a c++ compiler will be mandatory
warnings.warn("DeprecationWarning: there is no c++ compiler."
WARNING (theano.configdefaults): g++ not detected ! Theano will be unable to execute optimized C-implementations (for both CPU and GPU) and will default to Python implementations. Performance will be severely degraded. To remove this warning, set Theano flags cxx to an empty string.
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=2))
model.add(Dense(1, activation='relu'))
model.compile(optimizer='rmsprop',
loss='mse',
metrics=['accuracy'])
model.fit(x_train, y_train, batch_size =1000,epochs=1)
y_true=model.predict(x_test)
Epoch 1/1
5578/5578 [==============================] - 13s 2ms/step - loss: 0.2186 - accuracy: 1.0000