用R语言scorecard包做一张标准评分卡

最新推荐文章于 2024-04-19 13:50:15 发布

wyzwyzwyzo

最新推荐文章于 2024-04-19 13:50:15 发布

阅读量2k

点赞数 2

文章标签： R语言评分卡风控算法

本文链接：https://blog.csdn.net/wyzwyzwyzo/article/details/107237779

版权

R语言的scorecard包可以满足标准评分卡的所有需求
以下利用Kaggle的Give Me Some Credit数据集进行逻辑回归建模（以熟悉scorecard包的功能为主）

# Load the scorecard package and read the training data.
# library() stops with an error immediately if the package is not installed,
# whereas require() only returns FALSE and lets the script fail further down.
library(scorecard)
df <- read.csv("cs-training.csv", header = TRUE)

# Variable screening: SeriousDlqin2yrs is the target; every other column is
# treated as a candidate predictor.
df_all <- var_filter(dt = df, y = "SeriousDlqin2yrs")

# Split into training and test sets at a 70/30 ratio.
df_list <- split_df(dt = df_all, ratio = c(0.7, 0.3))
test <- df_list[["test"]]
train <- df_list[["train"]]

 # woe 分箱 自动分箱
bins <- woebin(dt = train, y = "SeriousDlqin2yrs")
# For each variable, `bins` holds the bins, the bad rate, the WOE mapped to
# each bin, the variable IV, the break points, and a flag for special values
# (e.g. missing values kept as their own bin).
print(bins)

# Plot the bins, bad rate and IV of every variable.
woebin_plot(bins)

下图为 woebin_plot(bins$age) 的输出：
在这里插入图片描述

# Bins can be adjusted by hand when necessary.
# Example with the variable `age`. The return value is not assigned here, so
# the adjusted breaks are not saved and `bins` is left unchanged.
woebin_adj(dt = train, y = "SeriousDlqin2yrs", bins = bins[["age"]])

Adjust breaks for (1/1) age?
1: next
2: yes
3: back
Selection (1-3, goX, save): 2
Enter modified breaks: 40,50,60
Current breaks:
"40", "50", "60"

在这里插入图片描述

# Convert the training and test sets to WOE values, using the bins fitted on
# the training set for both.
train_woe <- woebin_ply(dt = train, bins = bins)
test_woe <- woebin_ply(dt = test, bins = bins)

# Fit a logistic regression on the WOE-transformed training data.
fit <- glm(
  formula = SeriousDlqin2yrs ~ .,
  family = binomial(link = "logit"),
  data = train_woe
)
summary(fit)

# Model performance validation: KS and ROC.
# Predicted probabilities on each data set.
train_pred <- predict(fit, newdata = train_woe, type = "response")
test_pred <- predict(fit, newdata = test_woe, type = "response")
# Evaluate the predictions against the observed labels.
train_perf <- perf_eva(
  pred = train_pred,
  label = train[["SeriousDlqin2yrs"]],
  title = "train"
)
test_perf <- perf_eva(
  pred = test_pred,
  label = test[["SeriousDlqin2yrs"]],
  title = "test"
)

在这里插入图片描述

# Build the scorecard from the bins and the fitted model, with user-chosen
# base points (points0) and odds (odds0).
card <- scorecard(
  bins = bins,
  model = fit,
  points0 = 600,
  odds0 = 1 / 19,
  pdo = 50,
  basepoints_eq0 = FALSE
)
print(card)

$basepoints
variable bin woe points
1: basepoints NA NA 575

$RevolvingUtilizationOfUnsecuredLines
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: RevolvingUtilizationOfUnsecuredLines [-Inf,0.2) 56904 0.5438073 55730 1174 0.02063124 -1.2363021 0.50107552 1.059351 0.2
2: RevolvingUtilizationOfUnsecuredLines [0.2,0.5) 18768 0.1793578 17834 934 0.04976556 -0.3255858 0.01653890 1.059351 0.5
3: RevolvingUtilizationOfUnsecuredLines [0.5,0.85) 13048 0.1246942 11545 1503 0.11519007 0.5850105 0.05503519 1.059351 0.85
4: RevolvingUtilizationOfUnsecuredLines [0.85, Inf) 15920 0.1521407 12455 3465 0.21765075 1.3443903 0.48670107 1.059351 Inf
is_special_values points
1: FALSE 54
2: FALSE 14
3: FALSE -26
4: FALSE -59

$age
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values points
1: age [-Inf,40) 22321 0.2133123 19957 2364 0.10590923 0.4905749 0.06354621 0.2458132 40 FALSE -16
2: age [40,56) 39135 0.3739966 35995 3140 0.08023508 0.1846426 0.01381426 0.2458132 56 FALSE -6
3: age [56,64) 19458 0.1859518 18502 956 0.04913146 -0.3390764 0.01849155 0.2458132 64 FALSE 11
4: age [64, Inf) 23726 0.2267393 23110 616 0.02596308 -1.0009739 0.14996124 0.2458132 Inf FALSE 32

$NumberOfTime30.59DaysPastDueNotWorse
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfTime30.59DaysPastDueNotWorse [-Inf,1) 87863 0.83966934 84343 3520 0.04006237 -0.5526310 0.2028337 0.757567 1
2: NumberOfTime30.59DaysPastDueNotWorse [1,2) 11191 0.10694763 9465 1726 0.15423108 0.9220056 0.1354517 0.757567 2
3: NumberOfTime30.59DaysPastDueNotWorse [2, Inf) 5586 0.05338303 3756 1830 0.32760473 1.9047612 0.4192815 0.757567 Inf
is_special_values points
1: FALSE 21
2: FALSE -36
3: FALSE -74

$DebtRatio
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values points
1: DebtRatio [-Inf,0.4) 56162 0.53671636 52852 3310 0.05893665 -0.14674757 0.010850193 0.07697975 0.4 FALSE 9
2: DebtRatio [0.4,0.55) 12471 0.11918005 11547 924 0.07409189 0.09833093 0.001202505 0.07697975 0.55 FALSE -6
3: DebtRatio [0.55,0.7) 6348 0.06066514 5717 631 0.09940139 0.41990619 0.012839573 0.07697975 0.7 FALSE -27
4: DebtRatio [0.7,2.75) 8355 0.07984518 7337 1018 0.12184321 0.64870968 0.044543437 0.07697975 2.75 FALSE -41
5: DebtRatio [2.75, Inf) 21304 0.20359327 20111 1193 0.05599887 -0.20099599 0.007544039 0.07697975 Inf FALSE 13

$MonthlyIncome
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: MonthlyIncome missing 20719 0.1980027 19542 1177 0.05680776 -0.1857974 0.006310120 0.06363238 missing TRUE
2: MonthlyIncome [-Inf,4000) 26827 0.2563742 24414 2413 0.08994670 0.3095138 0.028096469 0.06363238 4000 FALSE
3: MonthlyIncome [4000,5500) 16143 0.1542718 14892 1251 0.07749489 0.1469189 0.003549050 0.06363238 5500 FALSE
4: MonthlyIncome [5500,8000) 18397 0.1758123 17243 1154 0.06272762 -0.0803723 0.001096984 0.06363238 8000 FALSE
5: MonthlyIncome [8000, Inf) 22554 0.2155390 21473 1081 0.04792941 -0.3651100 0.024579753 0.06363238 Inf FALSE
points
1: 0
2: 0
3: 0
4: 0
5: 0

$NumberOfOpenCreditLinesAndLoans
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfOpenCreditLinesAndLoans [-Inf,3) 9105 0.08701261 7966 1139 0.12509610 0.67876800 0.05383830 0.06585617 3
2: NumberOfOpenCreditLinesAndLoans [3,14) 80236 0.76678135 75406 4830 0.06019742 -0.12424058 0.01121874 0.06585617 14
3: NumberOfOpenCreditLinesAndLoans [14, Inf) 15299 0.14620604 14192 1107 0.07235767 0.07277502 0.00079913 0.06585617 Inf
is_special_values points
1: FALSE -2
2: FALSE 0
3: FALSE 0

$NumberOfTimes90DaysLate
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: NumberOfTimes90DaysLate [-Inf,1) 98791 0.94410359 94139 4652 0.04708931 -0.3836754 0.1179653 0.8183982 1 FALSE
2: NumberOfTimes90DaysLate [1, Inf) 5849 0.05589641 3425 2424 0.41442982 2.2781174 0.7004328 0.8183982 Inf FALSE
points
1: 15
2: -87

$NumberRealEstateLoansOrLines
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberRealEstateLoansOrLines [-Inf,1) 39215 0.3747611 35964 3251 0.08290195 0.2202440 0.020002734 0.05179564 1
2: NumberRealEstateLoansOrLines [1,2) 36452 0.3483563 34533 1919 0.05264457 -0.2663114 0.022038294 0.05179564 2
3: NumberRealEstateLoansOrLines [2,3) 22083 0.2110378 20778 1305 0.05909523 -0.1438919 0.004106915 0.05179564 3
4: NumberRealEstateLoansOrLines [3, Inf) 6890 0.0658448 6289 601 0.08722787 0.2758374 0.005647699 0.05179564 Inf
is_special_values points
1: FALSE -10
2: FALSE 13
3: FALSE 7
4: FALSE -13

$NumberOfTime60.89DaysPastDueNotWorse
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks
1: NumberOfTime60.89DaysPastDueNotWorse [-Inf,1) 99334 0.94929281 94208 5126 0.05160368 -0.2873797 0.06931066 0.5711802 1
2: NumberOfTime60.89DaysPastDueNotWorse [1, Inf) 5306 0.05070719 3356 1950 0.36750848 2.0808794 0.50186959 0.5711802 Inf
is_special_values points
1: FALSE 8
2: FALSE -61

$NumberOfDependents
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
1: NumberOfDependents missing 2700 0.02580275 2576 124 0.04592593 -0.40991172 0.003639670 0.03289703 missing TRUE
2: NumberOfDependents [-Inf,1) 60553 0.57867928 56926 3627 0.05989794 -0.12954652 0.009184289 0.03289703 1 FALSE
3: NumberOfDependents [1,2) 18435 0.17617546 17078 1357 0.07360998 0.09128509 0.001527285 0.03289703 2 FALSE
4: NumberOfDependents [2,3) 13652 0.13046636 12568 1084 0.07940229 0.17330380 0.004224434 0.03289703 3 FALSE
5: NumberOfDependents [3, Inf) 9300 0.08887615 8416 884 0.09505376 0.37036692 0.014321353 0.03289703 Inf FALSE
points
1: 6
2: 2
3: -1
4: -2
5: -5

# Credit scores.
# Scores for the training set.
train_score <- scorecard_ply(dt = train, card = card, print_step = 0)
# Scores for the test (validation) set.
test_score <- scorecard_ply(dt = test, card = card, print_step = 0)

# Model stability: population stability index (PSI) between the training and
# test score distributions.
psi_result <- perf_psi(
  label = list(
    train = train[["SeriousDlqin2yrs"]],
    test = test[["SeriousDlqin2yrs"]]
  ),
  score = list(train = train_score, test = test_score)
)
psi_result

在这里插入图片描述