R 语言快速构建信用评分卡模型---scorecard包

最新推荐文章于 2024-08-03 18:07:58 发布

Bonus_F

最新推荐文章于 2024-08-03 18:07:58 发布

阅读量3.8k

点赞数 2

分类专栏：数据分析 R语言系列信用模型文章标签： R语言数据分析

数据分析同时被 3 个专栏收录

26 篇文章 7 订阅

订阅专栏

R语言系列

6 篇文章 0 订阅

订阅专栏

信用模型

5 篇文章 1 订阅

订阅专栏

前言

R 语言快速构建机器学习，基于某大佬的scorecard包。

# github主页 - R版: http://github.com/shichenxie/scorecard

# 加载[data.table](http://r-datatable.com)与scorecard包

library(data.table) # 一个超高性能的数据处理包

library(scorecard)

# 数据准备 ------

# 加载scorecard包中的德国信贷数据集。该数据集包含了1000个信贷样本

# 20个特征，其详细介绍请参考[UCI的德国信贷数据集]

链接：https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)

data("germancredit")

# dim(germancredit)

# 数据集的违约标签为creditability

# 将其中坏样本的标签赋值为1，好样本赋值为0。

dt = setDT(germancredit)[

, creditability := ifelse(creditability=="bad",1,0)]

# 样本粗筛 var_filter

# 默认删除信息值小于0.02、缺失率大于95%或单类别比例大于95%的变量

# 可通过iv_limit, missing_limit, identical_limit分别设定。

# var_rm与var_kp指定需要强制删除或强制保留的变量

dt_s = var_filter(dt, "creditability")

# dim(dt_s)

# 如果return_rm_reason=TRUE，则返回删除变量的原因

# dt_s = var_filter(dt, "creditability", return_rm_reason=TRUE)

# dt_s$rm # 删除变量的原因

# dt_s$dt # 粗筛之后的数据集

# 拆分数据集为训练集与测试集 split_df

# y为标签，如果不指定则随机拆分，反之则按照y值分层拆分

# ratio为拆分后两个数据集样本数比例

# seed为随机种子，用于重现样本拆分

dt_list = split_df(dt_s, y="creditability", ratio=0.6, seed=30)

train = dt_list$train; test = dt_list$test;

# 由于数据集样本较少，后面的分箱过程采用全样本进行

# 分箱与woe转换 ------

# 最优分箱 woebin，该函数通过决策树的形式寻找最优分箱点。

# 默认当stop_limit信息值增益率小于0.1, 或max_bin_num分箱数大于6(缺失值除外)时停止分箱。

bins = woebin(dt_s, "creditability", print_step=1)

# class(bins)

# 打印第一个变量的分箱

bins[[1]]

# 绘制变量分箱图woebin_plot

woebin_plot(bins[[1]])

# 手动调整分箱 woebin

# 通过breaks_list指定分箱点，其中类别变量通过 %,% 相连

breaks_adj = list(

age.in.years=c(26, 35, 40),

other.debtors.or.guarantors=c("none", "co-applicant%,%guarantor"))

bins_adj = woebin(dt_s, "creditability", breaks_list=breaks_adj, print_step=0)

# 交互式调整分箱woebin_adj

# breaks_adj = woebin_adj(dt_s, "creditability", bins)

# bins_adj = woebin(dt_s, "creditability", breaks_list=breaks_adj, print_step=0)

# 原始数据集转换为对应的woe值woebin_ply

train_woe = woebin_ply(train, bins_adj, print_step=0)

test_woe = woebin_ply(test, bins_adj, print_step=0)

# 逻辑回归 ------

m1 = glm( creditability ~ ., family = "binomial", data = train_woe)

# summary(m1)

# 基于AIC筛选变量

# 也可通过lasso实现变量筛选，具体参考上面提到的幻灯片

m_step = step(m1, direction="both", trace = FALSE)

m2 = eval(m_step$call)

# summary(m2)

# 模型评估 ------

# 预测违约概率

train_pred = predict(m2, train_woe, type='response')

test_pred = predict(m2, test_woe, type='response')

# ks & roc plot

# type可设定返回的模型评估指标，包括"ks", "lift", "roc", "pr"

perf_eva(train$creditability, train_pred, title = "train")

perf_eva(test$creditability, test_pred, title = "test")

# 评分卡与信用评分 ------

# 默认基础分points0为600,

# 对应的坏好比odds0为1/19,

# 坏好比翻倍的分数pdo为50分

card = scorecard(bins_adj, m2)

# 基于评分卡，计算相应的信用评分

# only_total_score 如果为TRUE只返回总评分，FALSE返回各个变量的评分

train_score = scorecard_ply(train, card, only_total_score = TRUE, print_step = 0)

test_score = scorecard_ply(test, card, only_total_score = TRUE, print_step = 0)

# 模型稳定性评估

# x_limits, x_tick_break分别指定计算psi时的评分范围与间隔

perf_psi(

score = list(train = train_score, test = test_score),

label = list(train = train$creditability, test = test$creditability),

x_limits = c(250, 700), x_tick_break = 50 )