R 等宽分箱
等宽分箱
近期在做评分卡模型时,为等宽分箱自定义了一个函数,其输出结果包含:分箱区间、分箱节点,以及每个分箱下的样本总数、坏样本数、好样本数、坏样本占比、WOE 值、IV 值和总的 IV 值。
代码如下:
# Equal-width binning with WOE / IV statistics (scorecard modelling).
#
# Arguments:
#   data    Numeric feature to bin. NA values are put in their own "missing"
#           bin, which is not counted in `bins`.
#   target  Outcome vector, same length as `data`. Values equal to 0 are
#           counted in column `pos` (good) and values equal to 1 in column
#           `neg` (bad). NA is not allowed.
#   bins    Number of equal-width bins (a positive whole number).
#   pad     Amount added to the data range before it is divided into bins.
#           The default of 1 keeps the historical width
#           (max - min + 1) / bins, which suits unit-spaced integer data.
#           Use pad = 0 for bins that span exactly [min, max].
#
# Value:
#   A data frame (printed, and returned invisibly) with one row per bin and
#   the columns fx (interval label), breaks (upper edge), count, pos, neg,
#   negprob (neg / count), woe, IV and total_IV (sum of IV over all rows).
#   When a "missing" row is present, `breaks` is a character column.
#
# Notes:
#   * A bin that holds neither 0 nor 1 targets contributes 0 to total_IV
#     instead of NaN. A bin with only one of the two classes still yields an
#     infinite WOE / IV, as the formula dictates.
#   * With pad > 0 the upper bins may lie partly or wholly above max(data);
#     such bins are reported as empty.

# Kept only for backward compatibility with code that may reference this
# global; width_cut() no longer reads it.
width_dt <- data.frame()

width_cut <- function(data, target, bins, pad = 1) {
  stopifnot(
    is.numeric(data),
    length(data) > 0,
    length(data) == length(target),
    !anyNA(target),
    is.numeric(bins), length(bins) == 1L, !is.na(bins),
    bins >= 1, bins == floor(bins),
    is.numeric(pad), length(pad) == 1L, !is.na(pad), pad >= 0
  )

  is_missing <- is.na(data)
  if (all(is_missing)) {
    stop("`data` has no non-missing values to bin.", call. = FALSE)
  }

  good_total <- sum(target == 0)
  bad_total <- sum(target == 1)
  if (good_total == 0 || bad_total == 0) {
    warning(
      "`target` should contain both 0 and 1; WOE / IV are undefined otherwise.",
      call. = FALSE
    )
  }

  # Statistics for one bin, given the targets of the samples that fall in it.
  bin_stats <- function(bin_target) {
    n_all <- length(bin_target)
    n_pos <- sum(bin_target == 0)
    n_neg <- sum(bin_target == 1)
    bad_distr <- n_neg / bad_total
    good_distr <- n_pos / good_total
    woe <- log(bad_distr / good_distr)
    # 0 * log(0 / 0) is NaN; a bin without any 0/1 target carries no
    # information, so it must not poison the total.
    iv <- if (n_pos == 0 && n_neg == 0) 0 else (bad_distr - good_distr) * woe
    c(
      count = n_all, pos = n_pos, neg = n_neg,
      negprob = n_neg / n_all, woe = woe, IV = iv
    )
  }

  values <- data[!is_missing]
  value_target <- target[!is_missing]
  lo <- min(values)
  hi <- max(values)
  step <- (hi - lo + pad) / bins
  idx <- seq_len(bins)
  lower <- lo + (idx - 1) * step
  upper <- lo + idx * step

  # Bins are [lower, upper); a bin reaching the data maximum is labelled as
  # closed at that maximum. The last bin is always closed so that max(data)
  # is never dropped (this matters when pad = 0).
  closed <- upper >= hi | idx == bins
  labels <- ifelse(
    lower == hi,
    as.character(round(lower, 2)),
    ifelse(
      closed,
      paste0("[", round(lower, 2), ",", round(hi, 2), "]"),
      paste0("[", round(lower, 2), ",", round(upper, 2), ")")
    )
  )
  breaks <- round(upper, 2)

  groups <- lapply(idx, function(i) {
    in_bin <- values >= lower[i]
    if (i < bins) {
      in_bin <- in_bin & values < upper[i]
    }
    value_target[in_bin]
  })

  if (any(is_missing)) {
    groups <- c(list(target[is_missing]), groups)
    labels <- c("missing", labels)
    breaks <- c("missing", breaks) # coerces the numeric edges to character
  }

  stats <- vapply(groups, bin_stats, numeric(6), USE.NAMES = FALSE)

  result <- data.frame(
    fx = labels,
    breaks = breaks,
    as.data.frame(t(stats)),
    stringsAsFactors = FALSE
  )
  result$total_IV <- sum(result$IV)

  print(result)
  invisible(result)
}