#对离散特征进行分箱(特征分箱需遵循分箱后样本随分箱结果有规律变化)
def get_dispersed_result_group(data, aim, label, label_loss):
    """Rank the categories of a discrete feature by their target-label rate.

    For every distinct value of ``data[aim]`` the share of rows whose
    ``label`` column equals ``label_loss`` is computed, and the categories
    are returned ordered from the lowest rate to the highest so they can
    later be split into bins whose rate changes monotonically.

    Args:
        data: DataFrame containing both the ``aim`` and ``label`` columns.
        aim: Name of the discrete feature column to group by.
        label: Name of the label column.
        label_loss: The label value whose rate is measured per category
            (e.g. the "churned" value).

    Returns:
        A tuple ``(aim, result_group, result_rate)`` where ``result_group``
        is the list of categories sorted by ascending rate and
        ``result_rate`` is the list of matching rates, in the same order.
    """
    feature = data[aim]
    rated_groups = []
    for group in feature.value_counts().index:
        labels = data.loc[feature == group, label]
        if len(labels) < 2:
            # Too few samples for a meaningful rate: record 0, but keep
            # processing the remaining categories so none of them is lost.
            loss_rate = 0
        else:
            # Counting matches directly yields 0 (instead of a KeyError)
            # when the category has no row with the target label value.
            loss_rate = float((labels == label_loss).sum()) / len(labels)
        rated_groups.append((group, loss_rate))

    # list.sort is stable, so categories with equal rates keep the
    # frequency order produced by value_counts().
    rated_groups.sort(key=lambda pair: pair[1])
    result_group = [group for group, _ in rated_groups]
    result_rate = [rate for _, rate in rated_groups]
    return aim, result_group, result_rate
def dispersed_box_data(data, aim, result_group):
    """Collapse a discrete feature into two bins (0 and 1), in place.

    The first half of ``result_group`` is mapped to 0 and the second half
    to 1. Values of ``data[aim]`` that do not appear in ``result_group``
    are left unchanged.

    Args:
        data: The full DataFrame; its ``aim`` column is overwritten.
        aim: Name of the feature column to bin.
        result_group: Categories of the feature, already sorted (for
            example by ``get_dispersed_result_group``).

    Returns:
        None. When ``result_group`` has fewer than three categories a
        message is printed and ``data`` is not modified.
    """
    if len(result_group) < 3:
        print("does not need cut !")
        return

    mid_index = len(result_group) // 2
    bin_of = {group: 0 for group in result_group[:mid_index]}
    bin_of.update({group: 1 for group in result_group[mid_index:]})
    # Map every value in a single pass: two sequential replaces would
    # re-map rows already set to 0 or 1 whenever the raw categories
    # themselves include 0 or 1. Assigning the column back (rather than a
    # chained in-place replace) guarantees that ``data`` is updated.
    data[aim] = data[aim].map(lambda value: bin_of.get(value, value))
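# Discrete binning (离散分箱)
# Web-page residue: "latest recommended article published 2022-06-04 21:11:05" (最新推荐文章于 2022-06-04 21:11:05 发布)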