分箱、WOE、IV的计算

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
from scipy import stats
# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')
plt.style.use("seaborn")
plt.rc('font', family='SimHei', size=13)  # use the SimHei font so Chinese labels can be displayed
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
# Load the data set
data = pd.read_csv(r"./cs_training.csv",encoding='gbk')
# Inspect the data set
# data.head(10)

# Rename the feature columns to Chinese names
column={'SeriousDlqin2yrs':'好坏客户',
        'RevolvingUtilizationOfUnsecuredLines':'可用额度比值',
        'age':'年龄',
        'NumberOfTime30-59DaysPastDueNotWorse':'逾期30-59天笔数',
        'DebtRatio':'负债率',
        'MonthlyIncome':'月收入',
        'NumberOfOpenCreditLinesAndLoans':'信贷数量',
        'NumberOfTimes90DaysLate':'逾期90天笔数',
        'NumberRealEstateLoansOrLines':'固定资产贷款量',
        'NumberOfTime60-89DaysPastDueNotWorse':'逾期60-89天笔数',
        'NumberOfDependents':'家属数量'}
data.rename(columns=column,inplace=True)
data.head()
好坏客户可用额度比值年龄逾期30-59天笔数负债率月收入信贷数量逾期90天笔数固定资产贷款量逾期60-89天笔数家属数量
010.7661274520.8029829120.0130602.0
100.9571514000.1218762600.040001.0
200.6581803810.0851133042.021000.0
300.2338103000.0360503300.050000.0
400.9072394910.02492663588.070100.0
from sklearn.ensemble import RandomForestRegressor
# Impute missing values with predictions from a random-forest regressor
def set_missing(df):
    """Fill missing '月收入' values in ``df`` with random-forest predictions.

    The regressor is trained on the rows where '月收入' is present, using the
    columns at positions 0-4 and 6-9 as features, and the rounded predictions
    are written into the rows where '月收入' is missing.

    Args:
        df: DataFrame whose column at position 5 is expected to be '月收入'.
            It is modified in place.

    Returns:
        The same ``df`` object; it is returned untouched when no '月收入'
        value is missing.
    """
    missing = df['月收入'].isnull()
    # Nothing to impute: return early instead of fitting a model and asking it
    # to predict on a zero-row array, which scikit-learn rejects.
    if not missing.any():
        return df
    # Put the target (position 5) first, followed by the feature columns;
    # the column at position 10 is left out of the model.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # Split into rows with a known target and rows to be predicted
    # (.values yields plain NumPy arrays).
    known = process_df[~missing].values
    unknown = process_df[missing].values
    # X holds the features of the known rows, y their '月收入' values
    X = known[:, 1:]
    y = known[:, 0]
    rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the unknown targets and round them to whole numbers
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    # Write the predictions back into the originally missing cells
    df.loc[missing, '月收入'] = predicted
    return df
# Clean-up pipeline: impute the column with many missing values via the random
# forest, drop the rows that still contain the few remaining missing values,
# then drop duplicated rows.
data = set_missing(data).dropna().drop_duplicates()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 145563 entries, 0 to 149999
Data columns (total 11 columns):
好坏客户          145563 non-null int64
可用额度比值        145563 non-null float64
年龄            145563 non-null int64
逾期30-59天笔数    145563 non-null int64
负债率           145563 non-null float64
月收入           145563 non-null float64
信贷数量          145563 non-null int64
逾期90天笔数       145563 non-null int64
固定资产贷款量       145563 non-null int64
逾期60-89天笔数    145563 non-null int64
家属数量          145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.3 MB
# Keep only the rows whose three past-due counters are all below 80 and whose
# age is positive.
past_due_cols = ['逾期30-59天笔数', '逾期90天笔数', '逾期60-89天笔数']
valid_rows = (data[past_due_cols] < 80).all(axis=1) & (data['年龄'] > 0)
data = data[valid_rows]
col_list = data.columns.values
col_list
array(['好坏客户', '可用额度比值', '年龄', '逾期30-59天笔数', '负债率', '月收入', '信贷数量',
       '逾期90天笔数', '固定资产贷款量', '逾期60-89天笔数', '家属数量'], dtype=object)
# Every column except those at positions 0, 3, 7 and 9 gets its upper tail trimmed
excluded_positions = (0, 3, 7, 9)
new_col_list = [name for pos, name in enumerate(col_list) if pos not in excluded_positions]
# One-sided trimming: keep rows strictly below the 99% quantile of each column,
# recomputing the quantile on the rows that survived the previous columns
for item in new_col_list:
    cutoff = data[item].quantile(0.99)
    data = data[data[item] < cutoff]
    
import woe.feature_process as fp
import woe.eval as eval
data.columns
Index(['好坏客户', '可用额度比值', '年龄', '逾期30-59天笔数', '负债率', '月收入', '信贷数量', '逾期90天笔数',
       '固定资产贷款量', '逾期60-89天笔数', '家属数量'],
      dtype='object')
data.rename(columns={'好坏客户': 'target'}, inplace=True)
# WOE binning, IV computation and WOE transformation
# Work on a copy: a plain `data_woe = data` only aliases the frame, so writing
# the WOE values below would also overwrite the raw feature values in `data`.
data_woe = data.copy()  # holds the WOE-encoded value of every feature
civ_list = []
n_positive = sum(data['target'])
n_negtive = len(data) - n_positive
# Fifth positional argument of the binning helpers: 5% of the sample count
min_sample = 0.05 * len(data)
for column in list(data.columns[1:]):
    # Object-typed columns go through the discrete routine, all others through
    # the continuous one
    if data[column].dtypes == 'object':
        civ = fp.proc_woe_discrete(data, column, n_positive, n_negtive, min_sample, alpha=0.05)
    else:
        civ = fp.proc_woe_continuous(data, column, n_positive, n_negtive, min_sample, alpha=0.05)
    civ_list.append(civ)
    data_woe[column] = fp.woe_trans(data[column], civ)

civ_df = eval.eval_feature_detail(civ_list,'output_feature_detail_0315.csv')
# Keep only the variables whose IV exceeds the threshold
iv_thre = 0.001
iv = civ_df[['var_name','iv']].drop_duplicates()
x_columns = iv.var_name[iv.iv > iv_thre]

-------------process continuous variable:可用额度比值-------------
---------------process continuous variable:年龄---------------
-----------process continuous variable:逾期30-59天笔数-----------
--------------process continuous variable:负债率---------------
--------------process continuous variable:月收入---------------
--------------process continuous variable:信贷数量--------------
------------process continuous variable:逾期90天笔数-------------
------------process continuous variable:固定资产贷款量-------------
-----------process continuous variable:逾期60-89天笔数-----------
--------------process continuous variable:家属数量--------------
可用额度比值
年龄
逾期30-59天笔数
负债率
月收入
信贷数量
逾期90天笔数
固定资产贷款量
逾期60-89天笔数
家属数量
civ_df
var_namesplit_listsub_total_sample_numpositive_sample_numnegative_sample_numsub_total_num_percentagepositive_rate_in_sub_totalwoe_listiv_listiv
0可用额度比值(-INF,0.0]935228190710.0712130.030047-0.7574630.0296261.097527
1可用额度比值(0.0,0.04215617400000002]29188365288230.2222590.012505-1.6520110.3124541.097527
2可用额度比值(0.04215617400000002,0.0596119858]696211168510.0530140.015944-1.4055990.0590041.097527
3可用额度比值(0.0596119858,0.13857709429999995]17901375175260.1363120.020949-1.1274950.1084641.097527
4可用额度比值(0.13857709429999995,0.21535932080000003]1011329598180.0770080.029170-0.7879770.0342421.097527
5可用额度比值(0.21535932080000003,0.30067412204]827429679780.0630040.035775-0.5770630.0163861.097527
6可用额度比值(0.30067412204,0.3974544458]751037171390.0571870.049401-0.2401060.0029701.097527
7可用额度比值(0.3974544458,0.5331554074]850658679200.0647710.0688930.1131930.0008721.097527
8可用额度比值(0.5331554074,0.74050784496]998599689890.0760330.0997500.5170100.0255411.097527
9可用额度比值(0.74050784496,0.90349439404]7295110361920.0555500.1511990.9917960.0845551.097527
10可用额度比值(0.90349439404,+INF)162383360128780.1236480.2069221.3734410.4234111.097527
0年龄(-INF,32.0]135311392121390.1030350.1028750.5513380.0399640.046040
1年龄(32.0,+INF)11779367471110460.8969650.057278-0.0838270.0060760.046040
0逾期30-59天笔数(-INF,0.0]11111942791068400.8461440.038508-0.5005930.1709890.606073
1逾期30-59天笔数(0.0,+INF)202053860163450.1538560.1910421.2737650.4350840.606073
0负债率(-INF,0.018495376]10665466101990.0812110.043694-0.3688390.0094200.088009
1负债率(0.018495376,0.087064379]884357782660.0673370.0652490.0549560.0002080.088009
2负债率(0.087064379,0.138218834]753344970840.0573620.059604-0.0415510.0000970.088009
3负债率(0.138218834,0.191269577]914849386550.0696600.053892-0.1483630.0014370.088009
4负债率(0.191269577,0.229044637]689838365150.0525270.055523-0.1168070.0006810.088009
5负债率(0.229044637,0.26480176767999997]678031364670.0516280.046165-0.3112440.0043700.088009
6负债率(0.26480176767999997,0.33095571454]12054618114360.0917880.051269-0.2010130.0033980.088009
7负债率(0.33095571454,0.37664756308]755144071110.0574990.058270-0.0656030.0002400.088009
8负债率(0.37664756308,0.4237495164599999]669640662900.0509880.060633-0.0233430.0000280.088009
9负债率(0.4237495164599999,0.54743575044]12664918117460.0964330.0724890.1679490.0029290.088009
10负债率(0.54743575044,0.7263413320000001]911282582870.0693860.0905400.4099600.0139760.088009
11负债率(0.7263413320000001,2.6823588614000204]9111102680850.0693780.1126110.6526770.0394390.088009
12负债率(2.6823588614000204,1009.0]10925639102860.0831910.058490-0.0616140.0003070.088009
13负债率(1009.0,+INF)13344586127580.1016110.043915-0.3635740.0114780.088009
0月收入(-INF,1159.0]13281877124040.1011320.0660340.0677530.0004780.114078
.................................
7月收入(4831.0,5332.68]720147067310.0548340.0652690.0552740.0001720.114078
8月收入(5332.68,5917.0]738143269490.0562050.058529-0.0609070.0002030.114078
9月收入(5917.0,6667.0]864753381140.0658450.061640-0.0058050.0000020.114078
10月收入(6667.0,7916.0]1030651997870.0784780.050359-0.2198860.0034480.114078
11月收入(7916.0,8333.0]796732776400.0606670.041044-0.4341720.0094840.114078
12月收入(8333.0,10300.0]1034048998510.0787370.047292-0.2859460.0056870.114078
13月收入(10300.0,+INF)13361474128870.1017410.035476-0.5857470.0271650.114078
0信贷数量(-INF,3.0]186321864167680.1418780.1000430.5202720.0483330.067247
1信贷数量(3.0,4.0]1039662097760.0791630.059638-0.0409460.0001300.067247
2信贷数量(4.0,5.0]11689671110180.0890090.057404-0.0814960.0005700.067247
3信贷数量(5.0,6.0]12373651117220.0942170.052615-0.1736930.0026350.067247
4信贷数量(6.0,7.0]12102629114730.0921540.051975-0.1866000.0029580.067247
5信贷数量(7.0,8.0]11422518109040.0869760.045351-0.3298900.0082050.067247
6信贷数量(8.0,9.0]1021956896510.0778150.055583-0.1156750.0009900.067247
7信贷数量(9.0,10.0]874548882570.0665910.055803-0.1114810.0007880.067247
8信贷数量(10.0,11.0]743140570260.0565850.054501-0.1364660.0009930.067247
9信贷数量(11.0,13.0]11199615105840.0852780.054916-0.1284560.0013300.067247
10信贷数量(13.0,+INF)171161110160060.1303340.0648520.0484160.0003120.067247
0逾期90天笔数(-INF,0.0]12448854261190620.9479460.043587-0.3714220.1113760.800610
1逾期90天笔数(0.0,+INF)6836271341230.0520540.3968702.2984940.6892340.800610
0固定资产贷款量(-INF,0.0]494713805456660.3767100.0769140.2319820.0224540.043142
1固定资产贷款量(0.0,1.0]481532429457240.3666730.050443-0.2181240.0158670.043142
2固定资产贷款量(1.0,2.0]284131538268750.2163580.054130-0.1436940.0041960.043142
3固定资产贷款量(2.0,+INF)528736749200.0402590.0694160.1213180.0006250.043142
0逾期60-89天笔数(-INF,0.0]12516260531191090.9530780.048361-0.2624650.0585840.515526
1逾期60-89天笔数(0.0,+INF)6162208640760.0469220.3385262.0471520.4569420.515526
0家属数量(-INF,0.0]799544351756030.6088300.054419-0.1380700.0109280.028199
1家属数量(0.0,1.0]244731683227900.1863560.0687700.1112760.0024230.028199
2家属数量(1.0,2.0]181171377167400.1379570.0760060.2191260.0072950.028199
3家属数量(2.0,3.0]878072880520.0668580.0829160.3136450.0075530.028199

66 rows × 10 columns


x_columns
0        可用额度比值
0            年龄
0    逾期30-59天笔数
0           负债率
0           月收入
0          信贷数量
0       逾期90天笔数
0       固定资产贷款量
0    逾期60-89天笔数
0          家属数量
Name: var_name, dtype: object
iv
var_nameiv
0可用额度比值1.097527
0年龄0.046040
0逾期30-59天笔数0.606073
0负债率0.088009
0月收入0.114078
0信贷数量0.067247
0逾期90天笔数0.800610
0固定资产贷款量0.043142
0逾期60-89天笔数0.515526
0家属数量0.028199
data_woe.head()
target可用额度比值年龄逾期30-59天笔数负债率月收入信贷数量逾期90天笔数固定资产贷款量逾期60-89天笔数家属数量
101.373441-0.083827-0.500593-0.0415510.461028-0.040946-0.3714220.231982-0.2624650.111276
200.517010-0.0838271.2737650.0549560.4610280.5202722.2984940.231982-0.262465-0.138070
30-0.5770630.551338-0.5005930.0549560.461028-0.081496-0.3714220.231982-0.262465-0.138070
50-0.787977-0.083827-0.500593-0.0656030.2430140.520272-0.371422-0.218124-0.2624650.111276
700.991796-0.083827-0.500593-0.1168070.243014-0.329890-0.3714220.231982-0.262465-0.138070

模型建立

信用评分卡模型在国外是一种成熟的预测方法,尤其在信用风险评估以及金融风险控制领域得到了比较广泛的使用。其原理是将模型变量以WOE编码方式离散化之后,运用logistic回归模型进行二分类的一种广义线性模型。下面将目标变量为1的记为违约用户,目标变量为0的记为正常用户,采用sklearn中的LogisticRegression进行建模

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 模型评估
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import auc
# Separate label and features, then split into train and test sets:
# the first column is the label, every remaining column is a feature
col_names = data_woe.columns.values
label_name, feature_names = col_names[0], col_names[1:]
X = data_woe[feature_names]
y = data_woe[label_name]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Fit the logistic regression on the training part
lr = LogisticRegression(C=1000.0, random_state=0)
result = lr.fit(X_train, y_train)
result
LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
# Predict class labels for the test set
y_pred = lr.predict(X_test)
y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Predicted probability of being a bad customer (column 1 of predict_proba),
# rounded to 5 decimals
prob_pred = [round(u[1], 5) for u in lr.predict_proba(X_test)]
# Accuracy of the predicted labels
accuracy_score(y_test, y_pred)
0.9387532362048835
# The classes are imbalanced; the author considers PR ill-suited here and
# evaluates with the ROC curve instead
FPR, TPR, thresholds = metrics.roc_curve(y_test, prob_pred, pos_label=1)
metrics.auc(FPR, TPR)
0.8499778184241903
# Plot the ROC curve against the diagonal reference line
plt.plot(FPR, TPR, 'b', label='AUC = %0.2f' % metrics.auc(FPR, TPR)) # draw the ROC curve
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()

![ROC曲线](output_33_0.png)

从上图可知,AUC值为0.85,说明该模型的预测效果还是不错的,正确率较高

评分卡计算方法

odds为good用户概率(p)与bad用户概率(1-p)的比值

$$\operatorname{odds}=\frac{p}{1-p}$$

评分卡设定的分值刻度可以通过将分值表示为比率对数的线性表达式来定义。公式如下:

$$score_{总}=A+B{*}\ln(odds)$$

常数 A 和 B 通常被称为补偿和刻度,它们的值可以通过将两个已知或者假设的分值带入 $score_{总}=A+B{*}\ln(odds)$ 中得到。通常,需要两个假设:

  • 在某个特定的比率设定特定的预期分值 $P_{0}$
  • 指定比率翻番的分数(PDO,Point-to-Double Odds)

首先,设定比率为odds的特定点的分值为 $P_{0}$。然后,比率为 2odds的点分值为 $P_{0}-PDO$(即此处按 $score=A-B\ln(odds)$ 的约定推导,odds 越大分值越低,与下文最终评分公式的符号一致),带入可以得到
$$B=\frac{PDO}{\log (2)}$$

$$A=P_{0}+B \log \left(odds\right)$$

import math
# PDO: points to double the odds; P0: expected score at the reference odds;
# B: scale factor
PDO = 20
P0 = 600
B = PDO / math.log(2)
B
28.85390081777927
# A: offset, derived from P0 with reference odds of 1/60
A = P0 + B * math.log(1 / 60)
A
481.8621880878296

基于Logistic的评分卡构建


最终,评分卡的分值可以写成下列形式:

$$\text{Score}=A-B\left(\beta_{0}+\beta_{1} x_{1}+\cdots+\beta_{p} x_{p}\right)$$

变量 $x_{1},\cdots,x_{p}$ 为自变量对应的WOE,$\beta_{0},\cdots,\beta_{p}$ 为逻辑斯蒂回归方程的系数

# Coefficient list of the logistic regression: the intercept first, followed by
# the fitted weight of every feature
coef_list = [result.intercept_[0], *result.coef_[0]]
# Compute the credit score
def credit_socre(data, coef, offset=None, factor=None):
    """Return the scorecard score of every row of ``data``.

    Each score is
    ``offset - factor * (coef[0] + sum_j data[i, j] * coef[j + 1])``.

    Args:
        data: DataFrame of numeric feature values, one row per sample.
        coef: Sequence holding the intercept followed by one weight per
            column of ``data``.
        offset: Score offset; defaults to the module-level ``A``.
        factor: Score scale; defaults to the module-level ``B``.

    Returns:
        A list with one score per row, in row order.
    """
    # Fall back to the module-level scale constants when none are passed in
    if offset is None:
        offset = A
    if factor is None:
        factor = B
    values = data.to_numpy(dtype=float)
    weights = np.asarray(coef, dtype=float)
    # One matrix-vector product replaces the per-cell Python loop; only the
    # weights that line up with the columns of `data` are used.
    linear = weights[0] + values @ weights[1:values.shape[1] + 1]
    return (offset - factor * linear).tolist()
# Score every row from its WOE features and store the result as a new column
score_list = credit_socre(data_woe.iloc[:, 1:], coef_list)
data_woe.insert(11, 'credit_score', score_list)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# head-plus-tail preview on old and new pandas versions alike
pd.concat([data_woe.head(), data_woe.tail()])

# Insert the credit score into the original data as well
# NOTE(review): this column is spelled 'credit_socre', unlike 'credit_score'
# above. It is kept as-is: if `data` and `data_woe` refer to the same frame,
# inserting an identical column name would raise.
data.insert(11, 'credit_socre', score_list)
pd.concat([data.head(), data.tail()])




  • 2
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值