import os
import pandas as pd
os.chdir('E:\BaiduNetdiskDownload\违约贷款案例')
# ## 1.1导入数据
loanfile = os.listdir()
createVar = locals()
for i in loanfile:
if i.endswith("csv"):
createVar[i.split('.')[0]] = pd.read_csv(i, encoding = 'gbk')
print(i.split('.')[0])# 将数据表依次导入并写出表名
counts card clients disp district loans order RFM_TRAD_FLOW trans
# ## 1.2、生成被解释变量bad_good
bad_good = {'B':1, 'D':1, 'A':0, 'C': 2} #A为正常还款,BD违约,C还在还款
loans['bad_good'] = loans.status.map(bad_good)
loans.head()
loan_id | account_id | date | amount | duration | payments | status | bad_good | |
---|---|---|---|---|---|---|---|---|
0 | 5314 | 1787 | 1993-07-05 | 96396 | 12 | 8033 | B | 1 |
1 | 5316 | 1801 | 1993-07-11 | 165960 | 36 | 4610 | A | 0 |
2 | 6863 | 9188 | 1993-07-28 | 127080 | 60 | 2118 | A | 0 |
3 | 5325 | 1843 | 1993-08-03 | 105804 | 36 | 2939 | A | 0 |
4 | 7240 | 11013 | 1993-09-06 | 274740 | 60 | 4579 | A | 0 |
# ## 1.3、借款人的年龄、性别
data2 = pd.merge(loans, disp, on = 'account_id', how = 'left') #表之间关联起来,用left连接
data2 = pd.merge(data2, clients, on = 'client_id', how = 'left')
data2=data2[data2.type=='所有者']
data2.head()
loan_id | account_id | date | amount | duration | payments | status | bad_good | disp_id | client_id | type | sex | birth_date | district_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5314 | 1787 | 1993-07-05 | 96396 | 12 | 8033 | B | 1 | 2166 | 2166 | 所有者 | 女 | 1947-07-22 | 30 |
1 | 5316 | 1801 | 1993-07-11 | 165960 | 36 | 4610 | A | 0 | 2181 | 2181 | 所有者 | 男 | 1968-07-22 | 46 |
2 | 6863 | 9188 | 1993-07-28 | 127080 | 60 | 2118 | A | 0 | 11006 | 11314 | 所有者 | 男 | 1936-06-02 | 45 |
3 | 5325 | 1843 | 1993-08-03 | 105804 | 36 | 2939 | A | 0 | 2235 | 2235 | 所有者 | 女 | 1940-04-20 | 14 |
4 | 7240 | 11013 | 1993-09-06 | 274740 | 60 | 4579 | A | 0 | 13231 | 13539 | 所有者 | 男 | 1978-09-07 | 63 |
# ## 1.4、借款人居住地的经济状况
data3 = pd.merge(data2, district, left_on = 'district_id', right_on = 'A1', how = 'left')
data3.head()
loan_id | account_id | date | amount | duration | payments | status | bad_good | disp_id | client_id | ... | A1 | GDP | A4 | A10 | A11 | A12 | A13 | A14 | A15 | a16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5314 | 1787 | 1993-07-05 | 96396 | 12 | 8033 | B | 1 | 2166 | 2166 | ... | 30 | 16979 | 94812 | 81.8 | 9650 | 3.38 | 3.67 | 100 | 15.7 | 14.8 |
1 | 5316 | 1801 | 1993-07-11 | 165960 | 36 | 4610 | A | 0 | 2181 | 2181 | ... | 46 | 14111 | 112709 | 73.5 | 8369 | 1.79 | 2.31 | 117 | 12.7 | 11.6 |
2 | 6863 | 9188 | 1993-07-28 | 127080 | 60 | 2118 | A | 0 | 11006 | 11314 | ... | 45 | 12888 | 77917 | 53.5 | 8390 | 2.28 | 2.89 | 132 | 13.3 | 13.6 |
3 | 5325 | 1843 | 1993-08-03 | 105804 | 36 | 2939 | A | 0 | 2235 | 2235 | ... | 14 | 31891 | 177686 | 74.8 | 10045 | 1.42 | 1.71 | 135 | 18.6 | 17.7 |
4 | 7240 | 11013 | 1993-09-06 | 274740 | 60 | 4579 | A | 0 | 13231 | 13539 | ... | 63 | 11322 | 86513 | 50.5 | 8288 | 3.79 | 4.52 | 110 | 9.0 | 8.4 |
# ## 1.5、贷款前一年内的账户平均余额、余额的标准差、变异系数、平均收入和平均支出的比例
data_4temp1 = pd.merge(loans[['account_id', 'date']],trans[['account_id','type','amount','balance','date']], on = 'account_id') data_4temp1.columns = ['account_id', 'date', 'type', 'amount', 'balance', 't_date'] data_4temp1 = data_4temp1.sort_values(by = ['account_id','t_date']) data_4temp1['date']=pd.to_datetime(data_4temp1['date']) data_4temp1['t_date']=pd.to_datetime(data_4temp1['t_date']) data_4temp1.tail()
account_id | date | type | amount | balance | t_date | |
---|---|---|---|---|---|---|
127263 | 11362 | 1996-12-27 | 借 | $56 | $51420 | 1998-12-08 |
127264 | 11362 | 1996-12-27 | 借 | $4,780 | $46640 | 1998-12-10 |
127265 | 11362 | 1996-12-27 | 借 | $5,392 | $41248 | 1998-12-12 |
127266 | 11362 | 1996-12-27 | 借 | $2,880 | $38368 | 1998-12-19 |
127267 | 11362 | 1996-12-27 | 贷 | $163 | $38531 | 1998-12-31 |
# ## 将对账户余额进行清洗
data_4temp1['balance2'] = data_4temp1['balance'].map(lambda x: int(''.join(x[1:].split(','))))
data_4temp1['amount2'] = data_4temp1['amount'].map(lambda x: int(''.join(x[1:].split(','))))
data_4temp1.tail()
account_id | date | type | amount | balance | t_date | balance2 | amount2 | |
---|---|---|---|---|---|---|---|---|
127263 | 11362 | 1996-12-27 | 借 | $56 | $51420 | 1998-12-08 | 51420 | 56 |
127264 | 11362 | 1996-12-27 | 借 | $4,780 | $46640 | 1998-12-10 | 46640 | 4780 |
127265 | 11362 | 1996-12-27 | 借 | $5,392 | $41248 | 1998-12-12 | 41248 | 5392 |
127266 | 11362 | 1996-12-27 | 借 | $2,880 | $38368 | 1998-12-19 | 38368 | 2880 |
127267 | 11362 | 1996-12-27 | 贷 | $163 | $38531 | 1998-12-31 | 38531 | 163 |
len(data_4temp1)#查看交易数据量
191556
# ## 根据取数窗口提取交易数据,只要贷款前1年时间内的数据
import datetime
data_4temp2 = data_4temp1[data_4temp1.date>data_4temp1.t_date][
data_4temp1.date<data_4temp1.t_date+datetime.timedelta(days=365)]
data_4temp2.tail()
account_id | date | type | amount | balance | t_date | balance2 | amount2 | |
---|---|---|---|---|---|---|---|---|
127026 | 11362 | 1996-12-27 | 借 | $129 | $39766 | 1996-12-06 | 39766 | 129 |
127027 | 11362 | 1996-12-27 | 借 | $10400 | $29366 | 1996-12-07 | 29366 | 10400 |
127028 | 11362 | 1996-12-27 | 借 | $330 | $29036 | 1996-12-07 | 29036 | 330 |
127029 | 11362 | 1996-12-27 | 借 | $56 | $28980 | 1996-12-08 | 28980 | 56 |
127030 | 11362 | 1996-12-27 | 借 | $4,780 | $24200 | 1996-12-10 | 24200 | 4780 |
# ### 1.5.1账户平均余额、余额的标准差、变异系数
data_4temp3 = data_4temp2.groupby('account_id')['balance2'].agg([('avg_balance','mean'), ('stdev_balance','std')])
data_4temp3['cv_balance'] = data_4temp3[['avg_balance','stdev_balance']].apply(lambda x: x[1]/x[0],axis = 1)
data_4temp3.head()
avg_balance | stdev_balance | cv_balance | |
---|---|---|---|
account_id | |||
2 | 32590.759259 | 12061.802206 | 0.370099 |
19 | 25871.223684 | 15057.521648 | 0.582018 |
25 | 56916.984496 | 21058.667949 | 0.369989 |
37 | 36658.981308 | 20782.996690 | 0.566928 |
38 | 31383.581818 | 10950.723180 | 0.348932 |
# ### 1.5.2 平均支出和平均收入的比例 type_dict = {'借':'out','贷':'income'} data_4temp2['type1'] = data_4temp2.type.map(type_dict) data_4temp4 = data_4temp2.groupby(['account_id','type1'])[['amount2']].sum() data_4temp4.head()
amount2 | ||
---|---|---|
account_id | type1 | |
2 | income | 276514 |
out | 153020 | |
19 | income | 254255 |
out | 198020 | |
25 | income | 726479 |
#转置 data_4temp5 = pd.pivot_table(data_4temp4, values = 'amount2',index = 'account_id', columns = 'type1') data_4temp5.fillna(0, inplace = True) data_4temp5['r_out_in'] = data_4temp5[['out','income']].apply(lambda x: x[0]/x[1], axis = 1)data_4temp5.head()
type1 | income | out | r_out_in |
---|---|---|---|
account_id | |||
2 | 276514.0 | 153020.0 | 0.553390 |
19 | 254255.0 | 198020.0 | 0.778824 |
25 | 726479.0 | 629108.0 | 0.865969 |
37 | 386357.0 | 328541.0 | 0.850356 |
38 | 154300.0 | 105091.0 | 0.681082 |
#添加到预测变量
data4 = pd.merge(data3, data_4temp3, left_on='account_id', right_index= True, how = 'left')
data4 = pd.merge(data4, data_4temp5, left_on='account_id', right_index= True, how = 'left')
data4.head()
loan_id | account_id | date | amount | duration | payments | status | bad_good | disp_id | client_id | ... | A13 | A14 | A15 | a16 | avg_balance | stdev_balance | cv_balance | income | out | r_out_in | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5314 | 1787 | 1993-07-05 | 96396 | 12 | 8033 | B | 1 | 2166 | 2166 | ... | 3.67 | 100 | 15.7 | 14.8 | 12250.000000 | 8330.866301 | 0.680071 | 20100.0 | 0.0 | 0.000000 |
1 | 5316 | 1801 | 1993-07-11 | 165960 | 36 | 4610 | A | 0 | 2181 | 2181 | ... | 2.31 | 117 | 12.7 | 11.6 | 43975.810811 | 25468.748605 | 0.579154 | 243576.0 | 164004.0 | 0.673318 |
2 | 6863 | 9188 | 1993-07-28 | 127080 | 60 | 2118 | A | 0 | 11006 | 11314 | ... | 2.89 | 132 | 13.3 | 13.6 | 30061.041667 | 11520.127013 | 0.383224 | 75146.0 | 54873.0 | 0.730219 |
3 | 5325 | 1843 | 1993-08-03 | 105804 | 36 | 2939 | A | 0 | 2235 | 2235 | ... | 1.71 | 135 | 18.6 | 17.7 | 41297.640000 | 14151.357776 | 0.342667 | 120310.0 | 86018.0 | 0.714970 |
4 | 7240 | 11013 | 1993-09-06 | 274740 | 60 | 4579 | A | 0 | 13231 | 13539 | ... | 4.52 | 110 | 9.0 | 8.4 | 49780.777778 | 22172.541600 | 0.445404 | 276327.0 | 235214.0 | 0.851216 |
# ## 1.6、计算贷存比,贷收比
data4['r_lb'] = data4[['amount','avg_balance']].apply(lambda x: x[0]/x[1],axis = 1)
data4['r_lincome'] = data4[['amount','income']].apply(lambda x: x[0]/x[1],axis = 1)
data4.head()
loan_id | account_id | date | amount | duration | payments | status | bad_good | disp_id | client_id | ... | A15 | a16 | avg_balance | stdev_balance | cv_balance | income | out | r_out_in | r_lb | r_lincome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5314 | 1787 | 1993-07-05 | 96396 | 12 | 8033 | B | 1 | 2166 | 2166 | ... | 15.7 | 14.8 | 12250.000000 | 8330.866301 | 0.680071 | 20100.0 | 0.0 | 0.000000 | 7.869061 | 4.795821 |
1 | 5316 | 1801 | 1993-07-11 | 165960 | 36 | 4610 | A | 0 | 2181 | 2181 | ... | 12.7 | 11.6 | 43975.810811 | 25468.748605 | 0.579154 | 243576.0 | 164004.0 | 0.673318 | 3.773893 | 0.681348 |
2 | 6863 | 9188 | 1993-07-28 | 127080 | 60 | 2118 | A | 0 | 11006 | 11314 | ... | 13.3 | 13.6 | 30061.041667 | 11520.127013 | 0.383224 | 75146.0 | 54873.0 | 0.730219 | 4.227398 | 1.691108 |
3 | 5325 | 1843 | 1993-08-03 | 105804 | 36 | 2939 | A | 0 | 2235 | 2235 | ... | 18.6 | 17.7 | 41297.640000 | 14151.357776 | 0.342667 | 120310.0 | 86018.0 | 0.714970 | 2.561987 | 0.879428 |
4 | 7240 | 11013 | 1993-09-06 | 274740 | 60 | 4579 | A | 0 | 13231 | 13539 | ... | 9.0 | 8.4 | 49780.777778 | 22172.541600 | 0.445404 | 276327.0 | 235214.0 | 0.851216 | 5.518998 | 0.994257 |
# # 2 构建Logistic模型
data4.columns #查看预测变量
Index(['loan_id', 'account_id', 'date', 'amount', 'duration', 'payments', 'status', 'bad_good', 'disp_id', 'client_id', 'type', 'sex', 'birth_date', 'district_id', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance', 'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome'], dtype='object')
# •提取状态为C的用于预测。其它样本随机抽样,建立训练集与测试集
data_model=data4[data4.status!='C']
for_predict=data4[data4.status=='C']
train = data_model.sample(frac=0.7, random_state=1235).copy()# 随机抽样 70%
test = data_model[~ data_model.index.isin(train.index)].copy()
print(' 训练集样本量: %i \n 测试集样本量: %i' %(len(train), len(test)))
训练集样本量: 195 测试集样本量: 84
# 向前法 aic最小为标准,编写函数来遍历变量筛选变量
def forward_select(data, response):
import statsmodels.api as sm
import statsmodels.formula.api as smf
remaining = set(data.columns)
remaining.remove(response)
selected = []
current_score, best_new_score = float('inf'), float('inf')
while remaining:
aic_with_candidates=[]
for candidate in remaining:
formula = "{} ~ {}".format(
response,' + '.join(selected + [candidate]))
aic = smf.glm(
formula=formula, data=data,
family=sm.families.Binomial(sm.families.links.logit)
).fit().aic
aic_with_candidates.append((aic, candidate))
aic_with_candidates.sort(reverse=True)
best_new_score, best_candidate=aic_with_candidates.pop()
if current_score > best_new_score:
remaining.remove(best_candidate)
selected.append(best_candidate)
current_score = best_new_score
print ('aic is {},continuing!'.format(current_score))
else:
print ('forward selection over!')
break
formula = "{} ~ {} ".format(response,' + '.join(selected))
print('final formula is {}'.format(formula))
model = smf.glm(
formula=formula, data=data,
family=sm.families.Binomial(sm.families.links.logit)
).fit()
return(model)
candidates = ['bad_good', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12','amount', 'duration',
'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance',
'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome']
data_for_select = train[candidates]
lg_m1 = forward_select(data=data_for_select, response='bad_good')
lg_m1.summary().tables[1]
aic is 167.43311432504638,continuing! aic is 135.82435856041837,continuing! forward selection over! final formula is bad_good ~ r_lb + cv_balance #结论显示贷存比,存款的变异系数 对预测变量影响最大
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -7.4260 | 1.125 | -6.599 | 0.000 | -9.632 | -5.220 |
r_lb | 0.4391 | 0.094 | 4.651 | 0.000 | 0.254 | 0.624 |
cv_balance | 10.1355 | 2.094 | 4.841 | 0.000 | 6.032 | 14.239 |
import sklearn.metrics as metrics#验证,查看roc曲线
import matplotlib.pyplot as plt
fpr, tpr, th = metrics.roc_curve(test.bad_good, lg_m1.predict(test))
plt.figure(figsize=[6, 6])
plt.plot(fpr, tpr, 'b--')
plt.title('ROC curve')
plt.show()
print('AUC = %.4f' %metrics.auc(fpr, tpr))#信用评级,一般大于0.8就可以了 AUC = 0.8846
for_predict['prob']=lg_m1.predict(for_predict) #使用模型预测用户的还款概率
for_predict[['account_id','prob']].head()
account_id | prob | |
---|---|---|
23 | 1071 | 0.704914 |
30 | 5313 | 0.852249 |
38 | 10079 | 0.118128 |
39 | 5385 | 0.177591 |
42 | 8321 | 0.024302 |