信用评分卡模型

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
from scipy import stats
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): the "seaborn" style name may be unavailable in newer matplotlib releases — confirm.
plt.style.use("seaborn")
plt.rc('font', family='SimHei', size=13)  # font used so Chinese labels can be displayed
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly
# Load the data set
data = pd.read_csv(r"./cs_training.csv",encoding='gbk')
# Inspect the data set
# data.head(10)

# Rename the feature columns from their English names to Chinese names
column={'SeriousDlqin2yrs':'好坏客户',
        'RevolvingUtilizationOfUnsecuredLines':'可用额度比值',
        'age':'年龄',
        'NumberOfTime30-59DaysPastDueNotWorse':'逾期30-59天笔数',
        'DebtRatio':'负债率',
        'MonthlyIncome':'月收入',
        'NumberOfOpenCreditLinesAndLoans':'信贷数量',
        'NumberOfTimes90DaysLate':'逾期90天笔数',
        'NumberRealEstateLoansOrLines':'固定资产贷款量',
        'NumberOfTime60-89DaysPastDueNotWorse':'逾期60-89天笔数',
        'NumberOfDependents':'家属数量'}
data.rename(columns=column,inplace=True)
data.head()
好坏客户可用额度比值年龄逾期30-59天笔数负债率月收入信贷数量逾期90天笔数固定资产贷款量逾期60-89天笔数家属数量
010.7661274520.8029829120.0130602.0
100.9571514000.1218762600.040001.0
200.6581803810.0851133042.021000.0
300.2338103000.0360503300.050000.0
400.9072394910.02492663588.070100.0

from sklearn.ensemble import RandomForestRegressor
# 用随机森林对缺失值预测填充函数
def set_missing(df):
    """Fill missing '月收入' values with random-forest predictions.

    A RandomForestRegressor is fitted on the rows where '月收入' is present,
    using the columns at positions 0-4 and 6-9 as predictors.  Its
    predictions, rounded to whole numbers, are written into the rows where
    '月收入' is missing.

    Args:
        df: DataFrame with at least ten columns.  Position 5 is expected to
            be '月收入' and the predictor columns are expected to be free of
            missing values (assumption — confirm against the loaded data).

    Returns:
        The same ``df`` object, modified in place.
    """
    # Nothing to impute: return early instead of asking the model to predict
    # on zero rows, which scikit-learn rejects with an error.
    if not df['月收入'].isnull().any():
        return df
    # Target column first, followed by the predictor columns.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # Split into rows with a known target and rows with a missing target;
    # ``.values`` turns each part into a plain array.
    known = process_df[process_df['月收入'].notnull()].values
    unknown = process_df[process_df['月收入'].isnull()].values
    # Predictors of the rows whose target is known
    X = known[:, 1:]
    # Known target values
    y = known[:, 0]
    # Fit the regressor on the known rows
    rfr = RandomForestRegressor(random_state=0, n_estimators=200,max_depth=3,n_jobs=-1)
    rfr.fit(X,y)
    # Predict the missing targets and round them to whole numbers
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    # Write the predictions back into the original frame
    df.loc[df['月收入'].isnull(), '月收入'] = predicted
    return df
# Impute the column with many missing values using the random forest
data = set_missing(data)   
# Drop the rows that still contain missing values
data = data.dropna()   
# Drop duplicated rows
data = data.drop_duplicates()      
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 145563 entries, 0 to 149999
Data columns (total 11 columns):
好坏客户          145563 non-null int64
可用额度比值        145563 non-null float64
年龄            145563 non-null int64
逾期30-59天笔数    145563 non-null int64
负债率           145563 non-null float64
月收入           145563 non-null float64
信贷数量          145563 non-null int64
逾期90天笔数       145563 non-null int64
固定资产贷款量       145563 non-null int64
逾期60-89天笔数    145563 non-null int64
家属数量          145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.3 MB
# Keep only the rows whose three past-due counts are each below 80
data = data[data['逾期30-59天笔数'] < 80]
data = data[data['逾期90天笔数'] < 80]
data = data[data['逾期60-89天笔数'] < 80]
# Keep only the rows with a positive age
data = data[data['年龄'] > 0]
# Column names as an array
col_list = data.columns.values
col_list
array(['好坏客户', '可用额度比值', '年龄', '逾期30-59天笔数', '负债率', '月收入', '信贷数量',
       '逾期90天笔数', '固定资产贷款量', '逾期60-89天笔数', '家属数量'], dtype=object)
# Columns at positions 0, 3, 7 and 9 are excluded from outlier trimming.
new_col_list = [name for position, name in enumerate(col_list)
                if position not in (0, 3, 7, 9)]
# One-sided trimming: for each remaining column in turn, keep only the rows
# strictly below that column's current 99% quantile.
for item in new_col_list:
    upper_bound = data[item].quantile(0.99)
    data = data[data[item] < upper_bound]

from sklearn.tree import DecisionTreeClassifier

def _optimal_binning_boundary(x, y):
    """
    Derive bin edges for one feature from the split points of a decision tree.

    A DecisionTreeClassifier (gini criterion, at most 6 leaves, at least 5
    samples per leaf) is fitted on the single feature ``x`` against the label
    ``y``.  The thresholds of its split nodes, sorted ascending, become the
    inner edges; the two outer edges sit just outside the observed range of
    ``x``.  Returns the edges as a list.
    """
    feature_matrix = x.values.reshape(-1, 1)
    labels = y.values

    tree_model = DecisionTreeClassifier(criterion='gini',
                                        max_leaf_nodes=6,
                                        min_samples_leaf=5)
    tree_model.fit(feature_matrix, labels)

    fitted_tree = tree_model.tree_
    # Nodes whose two child ids differ are the ones carrying a split threshold.
    is_split_node = fitted_tree.children_left != fitted_tree.children_right
    inner_edges = sorted(fitted_tree.threshold[is_split_node])

    lower_edge = feature_matrix.min() - 0.0001
    # The +0.1 margin keeps the rows holding the feature maximum inside the
    # last bin when the edges are used for grouping later on.
    upper_edge = feature_matrix.max() + 0.1
    return [lower_edge] + inner_edges + [upper_edge]
# Every column after the first, and the label column
x = data.iloc[:, 1:]
y = data['好坏客户']

def cut_func(data):
    """
    Bin every feature column and collect the bin labels in a dict.

    The first column of ``data`` is used as the label; each later column is
    binned with the edges from ``_optimal_binning_boundary`` and stored under
    its column name as integer bin codes (``pd.cut`` with ``labels=False``).
    """
    binned = {}
    for position, name in enumerate(data.columns[1:], start=1):
        edges = _optimal_binning_boundary(data.iloc[:, position], data[data.columns[0]])
        binned[name] = pd.cut(data[name], edges, labels=False)
    return binned
# Bin every feature column of `data`
cut_dict = cut_func(data)

# WOE值计算
def get_woe_data(cut, data):
    """
    Compute the WOE value of every bin.

    ``cut`` holds the bin of each sample and ``data`` the 0/1 label, where 1
    is counted as bad.  Per bin the result is
    ``ln((bad_i / good_i) * (good_total / bad_total))``, returned as a Series
    indexed by bin.
    """
    bad_total = data.sum()
    good_total = data.count() - data.sum()
    # Rows: bins; columns: label values (first column good, second bad).
    counts = data.groupby(cut, as_index=True).value_counts().unstack()
    bad_per_bin = counts.iloc[:, 1]
    good_per_bin = counts.iloc[:, 0]
    return np.log((bad_per_bin / good_per_bin) * (good_total / bad_total))
def cut_woe_func(src_dict, src_data):
    """
    Compute the per-bin WOE values of every binned feature.

    Args:
        src_dict: mapping of feature name to the Series of bin labels.
        src_data: DataFrame holding the 0/1 label in column '好坏客户'.

    Returns:
        dict mapping each feature name to the WOE Series of its bins.
    """
    # Use the bins passed in as ``src_dict``; the previous version read the
    # module-level ``cut_dict`` here and ignored the argument's values.
    return {
        key: get_woe_data(bins, src_data["好坏客户"])
        for key, bins in src_dict.items()
    }
# WOE values of every bin, per feature
cut_woe_dict = cut_woe_func(cut_dict, data)

# IV值计算
def get_IV_data(cut, cut_woe, data):
    """
    Compute the IV of one binned feature.

    ``cut`` holds the bin of each sample, ``cut_woe`` the WOE of each bin and
    ``data`` the 0/1 label (1 counted as bad).  The result is the sum over
    bins of ``(bad_i / bad_total - good_i / good_total) * woe_i``.
    """
    # Rows: bins; columns: label values (first column good, second bad).
    counts = data.groupby(cut, as_index=True).value_counts().unstack()
    bad_total = data.sum()
    good_total = data.count() - data.sum()
    bad_share = counts.iloc[:, 1] / bad_total
    good_share = counts.iloc[:, 0] / good_total
    return ((bad_share - good_share) * cut_woe).sum()
def cut_IV_func(src_dict, src_cut_woe_dict, src_data):
    """
    Compute the IV of every binned feature and return the values in a dict
    keyed by feature name.  The label is read from column '好坏客户' of
    ``src_data``.
    """
    return {
        name: get_IV_data(bins, src_cut_woe_dict[name], src_data['好坏客户'])
        for name, bins in src_dict.items()
    }

# IV value of every feature
cut_IV_dict = cut_IV_func(cut_dict, cut_woe_dict, data)
cut_IV_dict
{'可用额度比值': 1.0496267788824982,
 '年龄': 0.2283880128708045,
 '逾期30-59天笔数': 0.6890352866527477,
 '负债率': 0.0797191570453572,
 '月收入': 0.1018015502249017,
 '信贷数量': 0.09975008041753788,
 '逾期90天笔数': 0.8367128665446881,
 '固定资产贷款量': 0.0431415986925943,
 '逾期60-89天笔数': 0.5379534841785022,
 '家属数量': 0.028199260527371775}
# One-row frame: one column per feature, holding its IV value
IV_df = pd.DataFrame([cut_IV_dict])
IV_df
可用额度比值年龄逾期30-59天笔数负债率月收入信贷数量逾期90天笔数固定资产贷款量逾期60-89天笔数家属数量
01.0496270.2283880.6890350.0797190.1018020.099750.8367130.0431420.5379530.028199
# Bar chart of the IV value of every feature
iv = IV_df.plot.bar(rot=90, figsize=(10,5), fontsize=(10))
iv.set_title('特征变量与IV值分布图', fontsize=(15))
iv.set_xlabel('特征变量', fontsize=(15))
iv.set_ylabel('IV', fontsize=(15))
plt.show()

![特征变量与IV值分布图](output_33_0.png)

# Create woe_df to hold the WOE-transformed data
woe_df = pd.DataFrame()  
# Convert bin labels to WOE values
def replace_data(cut, cut_woe):
    """
    Replace every bin label in ``cut`` with the WOE value of that bin.

    The sorted distinct bin labels of ``cut`` are paired positionally with
    ``cut_woe.values`` (smallest label gets the first WOE value, and so on).
    All labels are substituted in one pass, so a WOE value that happens to
    equal a later bin label is never substituted a second time, and ``cut``
    itself is left unchanged.

    Args:
        cut: Series of bin labels.
        cut_woe: Series of WOE values, ordered by ascending bin label.

    Returns:
        A new Series with the index and name of ``cut`` holding WOE values.

    Raises:
        IndexError: if ``cut`` has more distinct bins than ``cut_woe`` has
            values.
    """
    bin_labels = sorted(cut.dropna().unique().tolist())
    woe_values = cut_woe.values
    if len(bin_labels) > len(woe_values):
        raise IndexError(
            "cut has %d distinct bins but only %d WOE values were given"
            % (len(bin_labels), len(woe_values))
        )
    label_to_woe = dict(zip(bin_labels, woe_values))
    return cut.map(label_to_woe)

def gen_data_func(src_data, src_cut_dict, src_cut_woe_dict):
    """
    Store the WOE-transformed features in ``src_data``.

    For every feature in ``src_cut_dict`` a column named ``<feature>WOE`` is
    written into ``src_data`` with the result of ``replace_data``; the same
    ``src_data`` object is returned.
    """
    for feature, bins in src_cut_dict.items():
        woe_column = feature + "WOE"
        src_data[woe_column] = replace_data(bins, src_cut_woe_dict[feature])
    return src_data

# Fill woe_df with the WOE columns, then put the label column first
woe_df = gen_data_func(woe_df, cut_dict, cut_woe_dict)
woe_df.insert(0, '好坏客户', data["好坏客户"])
woe_df.head()
好坏客户可用额度比值WOE年龄WOE逾期30-59天笔数WOE负债率WOE月收入WOE信贷数量WOE逾期90天笔数WOE固定资产贷款量WOE逾期60-89天笔数WOE家属数量WOE
101.2574820.265965-0.500593-0.1041190.470780-0.145441-0.3714220.231982-0.2624650.111276
200.4040430.2659650.897932-0.1041190.4707800.3622701.9968940.231982-0.262465-0.138070
30-1.1220390.455702-0.500593-0.1041190.470780-0.145441-0.3714220.231982-0.262465-0.138070
50-1.122039-0.920630-0.500593-0.1041190.1129170.135016-0.371422-0.218124-0.2624650.111276
700.8539120.265965-0.500593-0.1041190.112917-0.145441-0.3714220.231982-0.262465-0.138070

模型建立

信用评分卡模型在国外是一种成熟的预测方法,尤其在信用风险评估以及金融风险控制领域得到了比较广泛的使用。其原理是将模型变量以WOE编码方式离散化之后,运用logistic回归模型(一种用于二分类变量的广义线性模型)进行建模。下面将目标变量为1的样本记为违约用户,目标变量为0的样本记为正常用户,采用sklearn中的LogisticRegression进行建模

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# 模型评估
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import auc
# Extract features and label, then split into train and test sets
col_names = woe_df.columns.values
X = woe_df[col_names[1:]]  # feature columns
y = woe_df[col_names[0]]  # label column
# Hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=0)
lr = LogisticRegression(C=1000.0, random_state=0)
result = lr.fit(X_train, y_train)
result
LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
# Predict on the test set
y_pred = lr.predict(X_test)
y_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Predicted probability of being a bad customer (second predict_proba column), rounded to 5 decimals
prob_pred = [round(u[1], 5) for u in lr.predict_proba(X_test)]

模型评估

# Accuracy of the predictions on the test set
accuracy_score(y_test, y_pred)
0.9404538301436621
# The classes are imbalanced; per the author a PR curve is a poor fit here, so the ROC curve is used
FPR, TPR, thresholds = metrics.roc_curve(y_test, prob_pred, pos_label=1)
metrics.auc(FPR, TPR)
0.8537992492862908
# Plot to compare the predictions with the actual labels
plt.plot(FPR, TPR, 'b', label='AUC = %0.2f' % metrics.auc(FPR, TPR)) # draw the ROC curve
plt.legend(loc='lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()

![ROC曲线](output_51_0.png)

从上图可知,AUC值为0.85,说明该模型的预测效果还是不错的,正确率较高

信用评分

我们已经基本完成了建模相关的工作,并用ROC曲线验证了模型的预测能力。接下来的步骤,就是将Logistic模型转换为标准评分卡的形式

评分卡计算方法

odds为bad用户(违约)概率 $p$ 与good用户(正常)概率 $1-p$ 的比值,也就是逻辑回归所拟合的对数几率中的比率:

$$\operatorname{odds}=\frac{p}{1-p}$$

评分卡设定的分值刻度可以通过将分值表示为比率对数的线性表达式来定义。由于这里的比率是违约相对正常的比率,比率越高分值越低,公式如下:

$$score_{总}=A-B\ln(\text{odds})$$

常数 A 和 B 通常被称为补偿和刻度,它们的值可以通过将两个已知或者假设的分值带入 $score_{总}=A-B\ln(\text{odds})$ 中得到。通常,需要两个假设:

  • 在某个特定的比率设定特定的预期分值 $P_{0}$
  • 指定比率翻番的分数(PDO,Point-to-Double Odds)

首先,设定比率为 odds 的特定点的分值为 $P_{0}$。然后,比率为 2odds 的点分值为 $P_{0}-PDO$,带入可以得到

$$B=\frac{PDO}{\log(2)}$$

$$A=P_{0}+B\log(\text{odds})$$

$P_{0}$ 和 PDO 的值都是已知常数,我们可以设定评分卡刻度使得比率为 1:60(违约与正常)时的分值为600分,PDO = 20,从而计算出A和B

import math
# PDO: points that double the odds; P0: expected score at the chosen odds; B: scale
PDO = 20
P0 = 600
B = PDO / math.log(2)
B
28.85390081777927
# A is the offset, computed from P0, B and the reference odds 1/60
A = P0 + B * math.log(1 / 60)
A
481.8621880878296

基于Logistic的评分卡构建


最终,评分卡的分值可以写成下列形式:

$$\text{Score}=A-B\left(\beta_{0}+\beta_{1}x_{1}+\cdots+\beta_{p}x_{p}\right)$$

变量 $x_{1},\cdots,x_{p}$ 为自变量对应的WOE,$\beta_{0},\cdots,\beta_{p}$ 为逻辑斯蒂回归方程的系数

# Coefficient list of the logistic regression: intercept first, then the feature coefficients
coef_list = [result.intercept_[0], *result.coef_[0]]
# 计算信用评分
def credit_socre(data, coef, offset=None, factor=None):
    """Compute the credit score of every row of ``data``.

    The score of a row ``x`` is ``offset - factor * (coef[0] + sum(coef[j + 1] * x[j]))``.
    All rows are evaluated with one matrix-vector product instead of a
    per-cell Python loop.

    Args:
        data: DataFrame of numeric features, one row per sample.
        coef: sequence ``[intercept, w_1, ..., w_p]`` with at least one weight
            per column of ``data``; surplus trailing entries are ignored.
        offset: scorecard offset; defaults to the module-level ``A``.
        factor: scorecard scale; defaults to the module-level ``B``.

    Returns:
        list of float scores in row order.

    Raises:
        IndexError: if ``coef`` holds fewer weights than ``data`` has columns.
    """
    if offset is None:
        offset = A
    if factor is None:
        factor = B
    features = np.asarray(data, dtype=float)
    n_features = features.shape[1]
    weights = np.asarray(coef[1:n_features + 1], dtype=float)
    if weights.shape[0] < n_features:
        raise IndexError(
            "coef needs %d weights after the intercept, got %d"
            % (n_features, weights.shape[0])
        )
    linear_part = coef[0] + features @ weights
    return (offset - factor * linear_part).tolist()
# Score every sample from its WOE features (all columns after the label)
score_list = credit_socre(woe_df.iloc[:, 1:], coef_list)
# Add the score as the last column
woe_df.insert(len(woe_df.columns), 'credit_score', score_list)
# Preview the first and last rows; pd.concat is used because DataFrame.append
# is not available in pandas 2.0 and later
pd.concat([woe_df.head(), woe_df.tail()])
好坏客户可用额度比值WOE年龄WOE逾期30-59天笔数WOE负债率WOE月收入WOE信贷数量WOE逾期90天笔数WOE固定资产贷款量WOE逾期60-89天笔数WOE家属数量WOEcredit_score
101.2574820.265965-0.500593-0.1041190.470780-0.145441-0.3714220.231982-0.2624650.111276548.295866
200.4040430.2659650.897932-0.1041190.4707800.3622701.9968940.231982-0.262465-0.138070499.787457
30-1.1220390.455702-0.500593-0.1041190.470780-0.145441-0.3714220.231982-0.262465-0.138070588.316050
50-1.122039-0.920630-0.500593-0.1041190.1129170.135016-0.371422-0.218124-0.2624650.111276608.484060
700.8539120.265965-0.500593-0.1041190.112917-0.145441-0.3714220.231982-0.262465-0.138070559.845105
1499950-1.122039-0.920630-0.500593-0.1041190.323795-0.145441-0.371422-0.218124-0.262465-0.138070610.921871
1499960-1.1220390.265965-0.5005930.4086990.112917-0.145441-0.371422-0.218124-0.2624650.219126584.601393
1499970-1.122039-0.293016-0.500593-0.218218-0.3936280.048416-0.371422-0.218124-0.262465-0.138070609.993977
1499980-1.1220390.455702-0.500593-0.3764480.112917-0.145441-0.3714220.231982-0.262465-0.138070597.332740
14999900.853912-0.920630-0.500593-0.104119-0.380625-0.145441-0.371422-0.143694-0.262465-0.138070582.329013
data.shape
(131324, 11)
# Insert the credit score into the original data as the last column
data.insert(len(data.columns), 'credit_socre', score_list)
# Preview the first and last rows; pd.concat is used because DataFrame.append
# is not available in pandas 2.0 and later
pd.concat([data.head(), data.tail()])
好坏客户可用额度比值年龄逾期30-59天笔数负债率月收入信贷数量逾期90天笔数固定资产贷款量逾期60-89天笔数家属数量credit_socre
100.9571514000.1218762600.040001.0548.295866
200.6581803810.0851133042.021000.0499.787457
300.2338103000.0360503300.050000.0588.316050
500.2131797400.3756073500.030101.0608.484060
700.7544643900.2099403500.080000.0559.845105
14999500.0406747400.2251312100.040100.0610.921871
14999600.2997454400.7165625584.040102.0584.601393
14999700.2460445803870.0000002554.0180100.0609.993977
14999800.0000003000.0000005716.040000.0597.332740
14999900.8502836400.2499088158.080200.0582.329013

  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值