个人贷款违约预测模型

import os
import pandas as pd

# Work inside the folder that holds the exported CSV tables. The raw string
# keeps the Windows backslashes literal instead of relying on invalid escape
# sequences such as '\B', which newer Python versions warn about.
os.chdir(r'E:\BaiduNetdiskDownload\违约贷款案例')

# ## 1.1 Load the data
loanfile = os.listdir()
# Each CSV becomes a module-level DataFrame named after the file (text before
# the first dot). globals() is the mapping that module-level names live in.
createVar = globals()
for i in loanfile:
    # Match the real extension so a name merely ending in "csv" is not loaded.
    if i.endswith(".csv"):
        table_name = i.split('.')[0]
        createVar[table_name] = pd.read_csv(i, encoding='gbk')
        print(table_name)  # echo each table name as it is loaded

accounts
card
clients
disp
district
loans
order
RFM_TRAD_FLOW
trans

# ## 1.2 Build the response variable bad_good
# Status codes: A = repaid normally, B and D = default, C = still repaying.
bad_good = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 1,
}
loans['bad_good'] = loans['status'].map(bad_good)
loans.head()

   loan_id  account_id        date  amount  duration  payments status  bad_good
0     5314        1787  1993-07-05   96396        12      8033      B         1
1     5316        1801  1993-07-11  165960        36      4610      A         0
2     6863        9188  1993-07-28  127080        60      2118      A         0
3     5325        1843  1993-08-03  105804        36      2939      A         0
4     7240       11013  1993-09-06  274740        60      4579      A         0

# ## 1.3 Borrower age and sex

# Chain two left joins: loans -> disp (on account_id) -> clients (on client_id).
data2 = (
    loans.merge(disp, on='account_id', how='left')
         .merge(clients, on='client_id', how='left')
)

# Keep only the rows whose disposition type is '所有者' (owner).
data2 = data2[data2['type'] == '所有者']

data2.head()

loan_idaccount_iddateamountdurationpaymentsstatusbad_gooddisp_idclient_idtypesexbirth_datedistrict_id
0531417871993-07-0596396128033B121662166所有者1947-07-2230
1531618011993-07-11165960364610A021812181所有者1968-07-2246
2686391881993-07-28127080602118A01100611314所有者1936-06-0245
3532518431993-08-03105804362939A022352235所有者1940-04-2014
47240110131993-09-06274740604579A01323113539所有者1978-09-0763

# ## 1.4 Economic indicators of the borrower's district
# district is keyed by column A1, which is matched against district_id.
data3 = data2.merge(district, how='left', left_on='district_id', right_on='A1')
data3.head()

loan_idaccount_iddateamountdurationpaymentsstatusbad_gooddisp_idclient_id...A1GDPA4A10A11A12A13A14A15a16
0531417871993-07-0596396128033B121662166...30169799481281.896503.383.6710015.714.8
1531618011993-07-11165960364610A021812181...461411111270973.583691.792.3111712.711.6
2686391881993-07-28127080602118A01100611314...45128887791753.583902.282.8913213.313.6
3532518431993-08-03105804362939A022352235...143189117768674.8100451.421.7113518.617.7
47240110131993-09-06274740604579A01323113539...63113228651350.582883.794.521109.08.4

# ## 1.5 Within the year before the loan: mean account balance, its standard
# ##     deviation and coefficient of variation, and the expenditure/income ratio

# Pair every loan with all transactions of the same account. The transaction
# date is renamed up front so it does not collide with the loan date.
loan_dates = loans[['account_id', 'date']]
trans_rows = trans[['account_id', 'type', 'amount', 'balance', 'date']].rename(
    columns={'date': 't_date'})
data_4temp1 = loan_dates.merge(trans_rows, on='account_id')
data_4temp1 = data_4temp1.sort_values(by=['account_id', 't_date'])
# Parse both date columns so they can be compared and shifted later.
for col in ('date', 't_date'):
    data_4temp1[col] = pd.to_datetime(data_4temp1[col])
data_4temp1.tail()
account_iddatetypeamountbalancet_date
127263113621996-12-27$56$514201998-12-08
127264113621996-12-27$4,780$466401998-12-10
127265113621996-12-27$5,392$412481998-12-12
127266113621996-12-27$2,880$383681998-12-19
127267113621996-12-27$163$385311998-12-31

# ## Clean the account balance (and amount) columns

# Each value is text: drop its first character, remove the comma separators
# and convert the rest to int (the sample output shows values like '$4,780').
for raw_col, clean_col in (('balance', 'balance2'), ('amount', 'amount2')):
    data_4temp1[clean_col] = data_4temp1[raw_col].map(
        lambda text: int(text[1:].replace(',', '')))

data_4temp1.tail()

account_iddatetypeamountbalancet_datebalance2amount2
127263113621996-12-27$56$514201998-12-085142056
127264113621996-12-27$4,780$466401998-12-10466404780
127265113621996-12-27$5,392$412481998-12-12412485392
127266113621996-12-27$2,880$383681998-12-19383682880
127267113621996-12-27$163$385311998-12-3138531163

len(data_4temp1)  # check how many loan/transaction rows there are

191556

# ## Keep only transactions inside the observation window:
# ## those dated before the loan and less than one year before it
import datetime

# Build ONE combined mask. The original chained data_4temp1[a][b], which
# applied a full-length mask to an already filtered frame and made pandas
# re-align it with a warning.
one_year = datetime.timedelta(days=365)
in_window = (
    (data_4temp1.date > data_4temp1.t_date)
    & (data_4temp1.date < data_4temp1.t_date + one_year)
)
# .copy() makes an independent frame, so columns can be added to it later
# without a SettingWithCopyWarning.
data_4temp2 = data_4temp1[in_window].copy()
data_4temp2.tail()

account_iddatetypeamountbalancet_datebalance2amount2
127026113621996-12-27$129$397661996-12-0639766129
127027113621996-12-27$10400$293661996-12-072936610400
127028113621996-12-27$330$290361996-12-0729036330
127029113621996-12-27$56$289801996-12-082898056
127030113621996-12-27$4,780$242001996-12-10242004780

# ### 1.5.1 Mean balance, standard deviation of balance, coefficient of variation

# One row per account: mean and standard deviation of the cleaned balance.
data_4temp3 = data_4temp2.groupby('account_id')['balance2'].agg([('avg_balance','mean'), ('stdev_balance','std')])

# Coefficient of variation = std / mean. Columns are divided by label; the
# original row-wise lambda used x[1]/x[0], i.e. positional access on a
# label-indexed Series, which pandas has deprecated.
data_4temp3['cv_balance'] = data_4temp3['stdev_balance'] / data_4temp3['avg_balance']

data_4temp3.head()

avg_balancestdev_balancecv_balance
account_id   
232590.75925912061.8022060.370099
1925871.22368415057.5216480.582018
2556916.98449621058.6679490.369989
3736658.98130820782.9966900.566928
3831383.58181810950.7231800.348932
# ### 1.5.2 Ratio of expenditure to income
# (the code below works with per-account totals, not averages)
type_dict = {'借':'out','贷':'income'}
# assign() returns a new frame, so the column is not written into what may be
# a filtered slice of data_4temp1 (avoids SettingWithCopyWarning).
data_4temp2 = data_4temp2.assign(type1=data_4temp2['type'].map(type_dict))
# Total amount per account and direction (income / out).
data_4temp4 = data_4temp2.groupby(['account_id','type1'])[['amount2']].sum()
data_4temp4.head()
 amount2
account_idtype1 
2income276514
out153020
19income254255
out198020
25income726479
# Pivot: one row per account, one column per direction (income / out)
data_4temp5 = pd.pivot_table(data_4temp4, values = 'amount2',index = 'account_id', columns = 'type1')
# A direction with no transactions for an account counts as 0.
data_4temp5.fillna(0, inplace = True)
# Expenditure / income, divided by column label. The original fused this
# statement with the following .head() call on one line (a syntax error) and
# used positional x[0]/x[1] access inside a row-wise lambda.
data_4temp5['r_out_in'] = data_4temp5['out'] / data_4temp5['income']
data_4temp5.head()
type1incomeoutr_out_in
account_id   
2276514.0153020.00.553390
19254255.0198020.00.778824
25726479.0629108.00.865969
37386357.0328541.00.850356
38154300.0105091.00.681082

# Add the derived account features to the predictor table

# Both feature tables are indexed by account_id, so each left join matches the
# account_id column on the left against the index on the right.
data4 = (
    data3.merge(data_4temp3, how='left', left_on='account_id', right_index=True)
         .merge(data_4temp5, how='left', left_on='account_id', right_index=True)
)

data4.head()

loan_idaccount_iddateamountdurationpaymentsstatusbad_gooddisp_idclient_id...A13A14A15a16avg_balancestdev_balancecv_balanceincomeoutr_out_in
0531417871993-07-0596396128033B121662166...3.6710015.714.812250.0000008330.8663010.68007120100.00.00.000000
1531618011993-07-11165960364610A021812181...2.3111712.711.643975.81081125468.7486050.579154243576.0164004.00.673318
2686391881993-07-28127080602118A01100611314...2.8913213.313.630061.04166711520.1270130.38322475146.054873.00.730219
3532518431993-08-03105804362939A022352235...1.7113518.617.741297.64000014151.3577760.342667120310.086018.00.714970
47240110131993-09-06274740604579A01323113539...4.521109.08.449780.77777822172.5416000.445404276327.0235214.00.851216

# ## 1.6 Loan-to-balance ratio and loan-to-income ratio

# Divide whole columns by label instead of a row-wise lambda with positional
# x[0]/x[1] access, which pandas has deprecated for label-indexed Series.
# r_lb: loan amount relative to the mean balance in the observation window.
data4['r_lb'] = data4['amount'] / data4['avg_balance']

# r_lincome: loan amount relative to the total income in the same window.
data4['r_lincome'] = data4['amount'] / data4['income']

data4.head()

loan_idaccount_iddateamountdurationpaymentsstatusbad_gooddisp_idclient_id...A15a16avg_balancestdev_balancecv_balanceincomeoutr_out_inr_lbr_lincome
0531417871993-07-0596396128033B121662166...15.714.812250.0000008330.8663010.68007120100.00.00.0000007.8690614.795821
1531618011993-07-11165960364610A021812181...12.711.643975.81081125468.7486050.579154243576.0164004.00.6733183.7738930.681348
2686391881993-07-28127080602118A01100611314...13.313.630061.04166711520.1270130.38322475146.054873.00.7302194.2273981.691108
3532518431993-08-03105804362939A022352235...18.617.741297.64000014151.3577760.342667120310.086018.00.7149702.5619870.879428
47240110131993-09-06274740604579A01323113539...9.08.449780.77777822172.5416000.445404276327.0235214.00.8512165.5189980.994257

# # 2 Build the logistic model

data4.columns  # list the columns available as predictor variables

Index(['loan_id', 'account_id', 'date', 'amount', 'duration', 'payments',
       'status', 'bad_good', 'disp_id', 'client_id', 'type', 'sex',
       'birth_date', 'district_id', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
       'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance',
       'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome'],
      dtype='object')

# Rows with status 'C' are set aside for prediction; the remaining rows are
# randomly split into a training set and a test set.
is_status_c = data4['status'] == 'C'
for_predict = data4[is_status_c]
data_model = data4[~is_status_c]

# Random 70% sample for training; every row not drawn goes to the test set.
train = data_model.sample(frac=0.7, random_state=1235).copy()
not_in_train = ~data_model.index.isin(train.index)
test = data_model[not_in_train].copy()
print(' 训练集样本量: %i \n 测试集样本量: %i' %(len(train), len(test)))

 训练集样本量: 195 
 测试集样本量: 84

# Forward selection with minimum AIC as the criterion: a helper that walks
# through the candidate variables and picks them one at a time.
def forward_select(data, response):
    """Fit a binomial GLM whose predictors are chosen by forward selection.

    Starting from no predictors, every round fits one model per remaining
    column (the already selected columns plus that candidate) and keeps the
    candidate with the lowest AIC. Selection stops when the best candidate no
    longer lowers the AIC or when no columns remain.

    Args:
        data: DataFrame containing the response column and the candidate
            predictors; every column other than ``response`` is a candidate.
        response: Name of the response column in ``data``.

    Returns:
        The fitted statsmodels GLM results object for the final formula.
    """
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    def _fit(formula):
        # Binomial() uses the logit link by default. The original passed the
        # link *class* (links.logit), which newer statsmodels releases
        # deprecate or reject in favour of an instance.
        return smf.glm(
            formula=formula, data=data,
            family=sm.families.Binomial()
        ).fit()

    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = float('inf')
    while remaining:
        aic_with_candidates = [
            (
                _fit("{} ~ {}".format(
                    response, ' + '.join(selected + [candidate]))).aic,
                candidate,
            )
            for candidate in remaining
        ]
        # Lowest AIC wins; ties fall back to the candidate name.
        best_new_score, best_candidate = min(aic_with_candidates)
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print ('aic is {},continuing!'.format(current_score))
        else:
            print ('forward selection over!')
            break

    # With nothing selected, fall back to an intercept-only model rather than
    # building a formula with an empty right-hand side.
    formula = "{} ~ {} ".format(response, ' + '.join(selected) or '1')
    print('final formula is {}'.format(formula))
    return _fit(formula)

 

# Response column plus every column offered to the forward selection.
candidates = [
    'bad_good',
    'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
    'amount', 'duration',
    'A13', 'A14', 'A15', 'a16',
    'avg_balance', 'stdev_balance', 'cv_balance',
    'income', 'out', 'r_out_in',
    'r_lb', 'r_lincome',
]
data_for_select = train.loc[:, candidates]

# Run the selection on the training rows and show the coefficient table.
lg_m1 = forward_select(response='bad_good', data=data_for_select)
lg_m1.summary().tables[1]

aic is 167.43311432504638,continuing!
aic is 135.82435856041837,continuing!
forward selection over!
final formula is bad_good ~ r_lb + cv_balance  #结论显示贷存比(r_lb)和存款余额的变异系数(cv_balance)对被解释变量(是否违约)影响最大

 coefstd errzP>|z|[0.0250.975]
Intercept-7.42601.125-6.5990.000-9.632-5.220
r_lb0.43910.0944.6510.0000.2540.624
cv_balance10.13552.0944.8410.0006.03214.239

 

import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Validation: score the test set and draw the ROC curve.
test_scores = lg_m1.predict(test)
fpr, tpr, th = metrics.roc_curve(test.bad_good, test_scores)
plt.figure(figsize=[6, 6])
plt.plot(fpr, tpr, 'b--')
plt.title('ROC curve')
plt.show()

# Author's rule of thumb: for credit rating an AUC above 0.8 is usually enough.
print('AUC = %.4f' %metrics.auc(fpr, tpr))

AUC = 0.8846

# Score the loans set aside for prediction. The model's response is bad_good
# with 1 = default, so 'prob' is the predicted probability of default, not of
# repayment. assign() returns a new frame, which avoids writing a column into
# what may be a filtered slice of data4 (SettingWithCopyWarning).
for_predict = for_predict.assign(prob=lg_m1.predict(for_predict))
for_predict[['account_id','prob']].head()

account_idprob
2310710.704914
3053130.852249
38100790.118128
3953850.177591
4283210.024302

 

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值