Table of Contents
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams["font.sans-serif"] = ["FangSong"]
plt.rcParams["axes.unicode_minus"] = False
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeClassifier
数据准备
使用x作为待分箱数据
y为分箱的目标标签
# Load the breast-cancer dataset and expose its features as a DataFrame.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
# Name of the single feature that the binning examples operate on.
variable = "mean radius"
x = df[variable].values
# Target labels that guide the supervised binning methods.
y = data.target
# Two-column working frame (feature 'x', label 'y') shared by the later cells.
test = pd.DataFrame({'x':x,'y':y})
test.head()
|   | x | y |
|---|---|---|
| 0 | 17.99 | 0 |
| 1 | 20.57 | 0 |
| 2 | 19.69 | 0 |
| 3 | 11.42 | 0 |
| 4 | 20.29 | 0 |
# Kernel density estimate of the feature to inspect its distribution before binning.
sns.kdeplot(test.x)
<matplotlib.axes._subplots.AxesSubplot at 0x298b39e7390>
# Box plot of the feature. The series is passed as ``x=`` explicitly because
# the meaning of the first positional argument differs between seaborn
# releases (``x`` in older ones, ``data`` in newer ones).
sns.boxplot(x=test.x)
<matplotlib.axes._subplots.AxesSubplot at 0x298b3c79b00>
woe和iv的计算过程和理解可见:
https://blog.csdn.net/xiezhen_zheng/article/details/82888653
定义计算函数
def woe_iv(data, x_col='x', y_col='y'):
    """Compute the per-bin WOE and IV contribution for a 0/1 target.

    Args:
        data: DataFrame holding the bin-label column and the target column.
        x_col: Name of the column with the bin labels to group by.
        y_col: Name of the target column. Its per-bin sum is used as the
            "bad" count, so it is expected to be 0/1 coded.

    Returns:
        DataFrame indexed by bin label with the columns ``total``, ``bad``,
        ``good``, ``p_bad``, ``p_good``, ``iv`` and ``woe``.

    Note:
        No smoothing is applied: a bin without any "bad" or without any
        "good" rows ends up with an infinite ``woe`` and ``iv``.
    """
    # Group the frame supplied by the caller so the result never depends on
    # a module-level variable.
    rate_table = data.groupby(x_col)[y_col].agg(['count', 'sum']).rename(
        columns={'count': 'total', 'sum': 'bad'})
    rate_table['good'] = rate_table['total'] - rate_table['bad']

    total_bad = rate_table['bad'].sum()
    total_good = rate_table['good'].sum()

    # Share of all bad / good rows that falls into each bin.
    rate_table['p_bad'] = rate_table['bad'] / total_bad
    rate_table['p_good'] = rate_table['good'] / total_good

    woe = np.log(rate_table['p_bad'] / rate_table['p_good'])
    rate_table['iv'] = (rate_table['p_bad'] - rate_table['p_good']) * woe
    rate_table['woe'] = woe
    return rate_table
决策树分箱
def optimal_binning_boundary(data, x_col='x', y_col='y', criterion='gini',
                             max_leaf_nodes=6, min_samples_leaf=0.05) -> tuple:
    """Derive bin edges for one feature from a fitted decision tree.

    A single-feature ``DecisionTreeClassifier`` is fitted on ``x_col``
    against ``y_col``. The thresholds of its split nodes become the interior
    bin edges; the observed minimum and maximum close the outer bins.

    Args:
        data: DataFrame containing the feature and the target column. A
            ``'bins_dtc'`` column with the integer bin code of every row is
            written into this frame in place.
        x_col: Name of the feature column to bin.
        y_col: Name of the class-label column.
        criterion: Split criterion handed to the tree.
        max_leaf_nodes: Maximum number of leaves handed to the tree, which
            caps the number of resulting bins.
        min_samples_leaf: Minimum leaf size handed to the tree.

    Returns:
        ``(boundary, data)``: the sorted list of bin edges and the same
        ``data`` object, now carrying the ``'bins_dtc'`` column.
    """
    x0 = data[x_col]
    feature = x0.values
    target = data[y_col].values

    clf = DecisionTreeClassifier(criterion=criterion,
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(feature.reshape(-1, 1), target)

    tree = clf.tree_
    # Only nodes whose left and right child ids differ are kept; nodes with
    # identical child ids on both sides are skipped as they hold no split.
    is_split = tree.children_left != tree.children_right
    boundary = sorted(tree.threshold[is_split])
    boundary = [feature.min()] + boundary + [feature.max()]

    # include_lowest keeps the row(s) equal to the minimum inside the first bin.
    data['bins_dtc'] = pd.cut(x0, bins=boundary, include_lowest=True, labels=False)
    return boundary, data
# Fit the tree-based binning; the call also writes a 'bins_dtc' column into `test`.
boundary, bins = optimal_binning_boundary(test)
# WOE / IV table for the decision-tree bins.
woe_iv(bins,'bins_dtc')
| bins_dtc | total | bad | good | p_bad | p_good | iv | woe |
|---|---|---|---|---|---|---|---|
| 0 | 150 | 147 | 3 | 0.411765 | 0.014151 | 1.340225 | 3.370671 |
| 1 | 115 | 105 | 10 | 0.294118 | 0.047170 | 0.451970 | 1.830226 |
| 2 | 49 | 39 | 10 | 0.109244 | 0.047170 | 0.052131 | 0.839827 |
| 3 | 83 | 55 | 28 | 0.154062 | 0.132075 | 0.003385 | 0.153979 |
| 4 | 54 | 10 | 44 | 0.028011 | 0.207547 | 0.359566 | -2.002754 |
| 5 | 118 | 1 | 117 | 0.002801 | 0.551887 | 2.900997 | -5.283323 |
分位数分箱
使用pd.qcut函数
# Quantile binning: split the feature into 5 quantile-based bins with integer labels.
bins = pd.qcut(df[variable] ,5,labels=False,)
test['bins_q'] = bins
# WOE / IV table for the quantile bins.
woe_iv(test,'bins_q')
| bins_q | total | bad | good | p_bad | p_good | iv | woe |
|---|---|---|---|---|---|---|---|
| 0 | 114 | 112 | 2 | 0.313725 | 0.009434 | 1.066299 | 3.504202 |
| 1 | 114 | 106 | 8 | 0.296919 | 0.037736 | 0.534655 | 2.062848 |
| 2 | 113 | 91 | 22 | 0.254902 | 0.103774 | 0.135814 | 0.898668 |
| 3 | 114 | 47 | 67 | 0.131653 | 0.316038 | 0.161465 | -0.875695 |
| 4 | 114 | 1 | 113 | 0.002801 | 0.533019 | 2.782868 | -5.248537 |
等距和等距log映射
np.floor_divide(a, b):a 除以 b 后向下取整
np.floor(x):对 x 向下取整
# Equal-width binning: the bin id is the floored quotient of the value by a fixed width of 6.
bins = np.floor_divide(df[variable] ,6)
test['bins_divide'] = bins
# WOE / IV table for the equal-width bins.
woe_iv(test,'bins_divide')
| bins_divide | total | bad | good | p_bad | p_good | iv | woe |
|---|---|---|---|---|---|---|---|
| 1.0 | 169 | 163 | 6 | 0.456583 | 0.028302 | 1.190981 | 2.780841 |
| 2.0 | 308 | 194 | 114 | 0.543417 | 0.537736 | 0.000060 | 0.010510 |
| 3.0 | 85 | 0 | 85 | 0.000000 | 0.400943 | inf | -inf |
| 4.0 | 7 | 0 | 7 | 0.000000 | 0.033019 | inf | -inf |
# Log-scale binning: the bin id is floor(log2(value)), so each bin spans one doubling of the value.
bins = np.floor(np.log2(df[variable]))
test['bins_log'] = bins
# WOE / IV table for the log-scale bins.
woe_iv(test,'bins_log')
| bins_log | total | bad | good | p_bad | p_good | iv | woe |
|---|---|---|---|---|---|---|---|
| 2.0 | 4 | 4 | 0 | 0.011204 | 0.000000 | inf | inf |
| 3.0 | 424 | 347 | 77 | 0.971989 | 0.363208 | 0.599266 | 0.984370 |
| 4.0 | 141 | 6 | 135 | 0.016807 | 0.636792 | 2.253440 | -3.634665 |
卡方分箱
卡方分箱的具体思路为:
https://mp.weixin.qq.com/s?__biz=MzA5Njc1MDA2Ng%3D%3D&idx=1&mid=2651650083&sn=a24381efa404500ae96ccfcc3716a614
def Chi2(df, total_col, bad_col, overallRate):
    """Return the chi-square score of observed versus expected bad counts.

    Args:
        df: DataFrame with one row per value (or bin).
        total_col: Column holding the total number of records per row.
        bad_col: Column holding the number of bad records per row.
        overallRate: Overall bad rate used to derive the expected bad count
            of every row.

    Returns:
        The sum over all rows of ``(expected - observed) ** 2 / expected``.
    """
    # Expected bad count of a row = its total count scaled by the overall rate.
    expected_bad = df[total_col] * overallRate
    observed_bad = df[bad_col]
    return sum(
        (expected - observed) ** 2 / expected
        for expected, observed in zip(expected_bad, observed_bad)
    )
def chiMerge(data, x_col='x', y_col='y', max_bins=5):
    """Bin ``x_col`` by greedily merging adjacent values on a chi-square score.

    Every distinct value of ``x_col`` starts as its own interval with the
    score ``(expected_bad - bad) ** 2 / expected_bad``. While more than
    ``max_bins`` intervals remain, the adjacent pair with the smallest summed
    score is merged and the score of the merged interval is recomputed from
    its pooled observed / expected bad counts.

    Args:
        data: DataFrame containing the feature and the target column.
        x_col: Name of the feature column to bin.
        y_col: Name of the target column; its per-value mean times the count
            is used as the "bad" count, so it is expected to be 0/1 coded.
        max_bins: Number of intervals at which merging stops.

    Returns:
        ``(bins, interval_)``: the ``pd.cut`` result for ``data[x_col]`` with
        integer labels, and the list of bin edges (the minimum of every
        merged interval followed by the overall maximum).

    Raises:
        ValueError: If ``max_bins`` is smaller than 1.
    """
    if max_bins < 1:
        raise ValueError("max_bins must be at least 1")

    gro = data.groupby(x_col)[y_col].agg(['mean', 'count'])
    gro['bad'] = gro['count'] * gro['mean']
    total_rate_bad = gro['bad'].sum() / gro['count'].sum()
    gro['exp_bad'] = gro['count'] * total_rate_bad
    gro['chi2'] = (gro['exp_bad'] - gro['bad']) ** 2 / gro['exp_bad']

    # Per-interval state: member values, (observed, expected) bad counts, score.
    interval = [[value] for value in gro.index]
    chi_c = gro[['bad', 'exp_bad']].to_numpy(dtype=float)
    chi2 = gro['chi2'].tolist()

    while len(interval) > max_bins:
        pair_scores = [left + right for left, right in zip(chi2, chi2[1:])]
        idx = pair_scores.index(min(pair_scores))

        # Fold the right neighbour into interval ``idx`` in all three structures.
        interval[idx].extend(interval.pop(idx + 1))
        chi_c[idx] += chi_c[idx + 1]
        chi_c = np.delete(chi_c, idx + 1, axis=0)
        observed, expected = chi_c[idx]
        chi2[idx] = (observed - expected) ** 2 / expected
        del chi2[idx + 1]

    interval_ = [min(group) for group in interval] + [data[x_col].max()]
    # NOTE(review): pd.cut bins are right-closed by default, so a value equal
    # to an interior edge (the minimum of a merged interval) is labelled with
    # the bin below it - confirm this is intended.
    bins = pd.cut(data[x_col], interval_, labels=False, include_lowest=True)
    return bins, interval_
# chiMerge returns (bin codes, bin edges); only the codes are stored as a new column.
inter = chiMerge(test)
bins = inter[0]
test['bins_kafang'] = bins
# WOE / IV table for the chi-square merged bins.
woe_iv(test,'bins_kafang')
| bins_kafang | total | bad | good | p_bad | p_good | iv | woe |
|---|---|---|---|---|---|---|---|
| 0 | 425 | 351 | 74 | 0.983193 | 0.349057 | 0.656694 | 1.035572 |
| 1 | 55 | 6 | 49 | 0.016807 | 0.231132 | 0.561792 | -2.621210 |
| 2 | 29 | 0 | 29 | 0.000000 | 0.136792 | inf | -inf |
| 3 | 31 | 0 | 31 | 0.000000 | 0.146226 | inf | -inf |
| 4 | 29 | 0 | 29 | 0.000000 | 0.136792 | inf | -inf |