2018.10.16 best-ks分箱

找了很久都没有找到best-ks的分箱代码,自己写了一个。

# -*- coding: utf-8 -*-
"""
Created on Mon Oct  8 09:56:00 2018

@author: Administrator

2018.10.8
创建KS分箱实验
"""
import pandas as pd

def _ks_best_cut(segment, limit):
    """Return the best KS cut point inside *segment*, or ``None``.

    *segment* is a two-column frame: column 0 holds the variable being
    binned, column 1 holds the 0/1 target.  Candidate cut points are the
    distinct variable values, tried in descending order of the KS statistic
    ``|cum_bad_rate - cum_good_rate|``.  The first candidate that leaves at
    least *limit* rows on each side (``<= cut`` and ``> cut``) is returned.

    ``None`` is returned when the segment lacks one of the two classes (KS is
    undefined) or when no candidate satisfies the size limit.
    """
    values = segment.iloc[:, 0]
    target = segment.iloc[:, 1]
    good_total = (target == 0).sum()
    bad_total = (target == 1).sum()
    if good_total == 0 or bad_total == 0:
        return None

    counts = pd.crosstab(values, target)
    cum_good = (counts[0] / good_total).cumsum()
    cum_bad = (counts[1] / bad_total).cumsum()
    ks = (cum_bad - cum_good).abs()

    # nlargest over the full length yields every candidate, best KS first.
    for cut in ks.nlargest(len(ks)).index:
        if (values <= cut).sum() >= limit and (values > cut).sum() >= limit:
            return cut
    return None


def _largest_segment_mask(values, cuts):
    """Return a boolean mask selecting the most populated segment.

    *cuts* splits *values* into left-open/right-closed segments; the
    leftmost segment has no lower bound and the rightmost no upper bound, so
    rows equal to the minimum or maximum are never dropped.  Ties are
    resolved in favour of the leftmost segment.
    """
    mask = pd.Series(True, index=values.index)
    ordered = sorted(cuts)
    if not ordered:
        return mask

    at_or_below = [int((values <= cut).sum()) for cut in ordered]
    sizes = [at_or_below[0]]
    sizes.extend(hi - lo for lo, hi in zip(at_or_below, at_or_below[1:]))
    # Count the tail directly instead of subtracting from a fixed row count.
    sizes.append(int((values > ordered[-1]).sum()))

    widest = sizes.index(max(sizes))
    if widest > 0:
        mask &= values > ordered[widest - 1]
    if widest < len(ordered):
        mask &= values <= ordered[widest]
    return mask


def best_ks_box(data, var_name, box_num, target='是否违约'):
    """Bin ``data[var_name]`` with the best-KS method and tabulate the bins.

    The data is split at the value with the highest KS statistic, then the
    most populated resulting segment is split again, and so on until
    ``box_num - 1`` cut points exist or no further valid split is possible.
    Every split must leave at least 5% of all rows on each side.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns *var_name* and *target*.
    var_name : str
        Name of the column to bin.
    box_num : int
        Desired number of bins; fewer are returned if splitting stops early.
    target : str, optional
        Name of the 0/1 label column (0 = good, 1 = bad).  Defaults to the
        column name the original script used.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by ``"<min>-<max>"`` of the values observed
        in that bin, with columns ``0`` and ``1`` holding the number of rows
        of each class (0 when a class is absent from a bin).
    """
    data = data[[var_name, target]]
    if data.empty:
        return pd.DataFrame(columns=[0, 1])

    values = data.iloc[:, 0]
    limit = data.shape[0] / 20  # minimum bin size: 5% of all rows

    cuts = []
    segment = data
    for _ in range(box_num - 1):
        cut = _ks_best_cut(segment, limit)
        if cut is None:
            # No admissible split in the current segment: stop rather than
            # record a cut that violates the size limit.
            break
        cuts.append(cut)
        segment = data[_largest_segment_mask(values, cuts)]

    # Build the bin detail table: first bin is closed on both sides, the
    # remaining bins are left-open/right-closed.
    edges = sorted(cuts + [values.min(), values.max()])
    labels = []
    rows = []
    for i, (low, high) in enumerate(zip(edges, edges[1:])):
        above_low = values >= low if i == 0 else values > low
        bin_rows = data[above_low & (values <= high)]
        counts = pd.crosstab(bin_rows.iloc[:, 0], bin_rows.iloc[:, 1])
        labels.append('{0}-{1}'.format(counts.index.min(), counts.index.max()))
        totals = counts.sum().reindex([0, 1], fill_value=0)
        rows.append([int(totals.loc[0]), int(totals.loc[1])])
    return pd.DataFrame(rows, index=labels, columns=[0, 1])

# Demo run: load the sample workbook and print a 5-bin best-KS table
# for the age column.
var_name = '年龄'
data = pd.read_excel('测试1.xlsx')
print(best_ks_box(data, var_name, box_num=5))

数据链接:https://pan.baidu.com/s/1XcyOhtGPuzL1oliyaHR1Mw 密码:us6c

  • 1
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值