Equal-frequency binning and equal-width binning are unsupervised; chi-square binning and minimum-entropy binning are supervised and therefore require a label column.
Equal-Frequency Binning
The interval boundaries are chosen so that each interval contains roughly the same number of instances. For example, with N = 10, each interval should hold about 10% of the instances.
When pd.qcut cannot be used (for example, when heavily duplicated values make its quantile edges non-unique), the Rank_qcut function below is used instead.
def Discretization_EqualFrequency(bins_number, data: pd.DataFrame):
    """
    Equal-frequency binning.
    :param bins_number: number of bins
    :param data: single-column DataFrame holding the values to bin
    :return: DataFrame with a "labels" column alongside the original data
    """
    # pd.qcut expects a 1-D input, so take the first (only) column of the DataFrame
    DisOneFeature = pd.qcut(data.iloc[:, 0], bins_number, labels=range(1, bins_number + 1))
    # DisOneFeature = Rank_qcut(data.iloc[:, 0], bins_number)
    DisData = pd.DataFrame(DisOneFeature)
    DisData.columns = ["labels"]
    DisData = pd.concat([DisData, data], axis=1)
    return DisData
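A minimal usage sketch (the DataFrame df and its column name below are made up for illustration): call the function with a single-column DataFrame and a bin count, and each resulting label should end up with roughly the same number of rows.

import numpy as np
import pandas as pd

df = pd.DataFrame({"score": np.random.rand(100)})   # hypothetical sample data
binned = Discretization_EqualFrequency(5, df)
print(binned["labels"].value_counts())              # roughly 20 rows per label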
def Rank_qcut(vector, bins_number):
    # target quantiles expressed as fractions: 0, 1/n, 2/n, ..., 1
    quantile = np.array([float(i) / bins_number for i in range(bins_number + 1)])
    # (quantile >= x).argmax() returns the index of the first True,
    # i.e. the bin that the percentile rank x falls into
    funBounder = lambda x: (quantile >= x).argmax()
    # rank(pct=True) gives each value's percentile rank; astype(float) guards against
    # object dtype, which makes rank() misbehave; apply() maps each rank to its bin number
    return vector.rank(pct=True).astype(float).apply(funBounder)
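To see why Rank_qcut is the fallback, consider a made-up series dominated by one repeated value: pd.qcut fails because the quantile edges are not unique, while the rank-based version still assigns every row a bin.

s = pd.Series([1, 1, 1, 1, 1, 2, 3, 4])   # hypothetical data with heavy ties
# pd.qcut(s, 4)                           # raises ValueError: Bin edges must be unique
print(Rank_qcut(s, 4).tolist())           # rank-based bins still assigned: [2, 2, 2, 2, 2, 3, 4, 4]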
Equal-Width Binning
The range from the minimum to the maximum is split into N equal parts. If A and B are the minimum and maximum, each interval has width W = (B − A) / N, and the cut points are A + W, A + 2W, …, A + (N − 1)W. Only the boundaries are fixed; the number of instances falling into each interval may differ.
def Discretization_EqualWidth(bins_number, data: pd.DataFrame):
    """
    Equal-width binning.
    :param bins_number: number of bins
    :param data: single-column DataFrame holding the values to bin
    :return: DataFrame with a "labels" column alongside the original data
    """
    # data.iloc[:, 0] (not data.iloc[:0]) selects the first column as a 1-D Series
    DisOneFeature = pd.cut(data.iloc[:, 0], bins_number, labels=range(1, bins_number + 1))
    DisData = pd.DataFrame(DisOneFeature)
    DisData.columns = ["labels"]
    DisData = pd.concat([DisData, data], axis=1)
    return DisData
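A small sketch with made-up ages: the three intervals have identical width, (60 − 18) / 3 = 14, but the counts per interval are free to differ (here 4, 1 and 3).

df = pd.DataFrame({"age": [18, 22, 25, 31, 40, 47, 53, 60]})   # hypothetical sample data
print(Discretization_EqualWidth(3, df))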
Chi-Square Binning (ChiMerge)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
@Time : 2019/9/5
@Author : yangjing gan
@File : Dependency_dsct.py
@Contact : ganyangjing95@qq.com
@License : (C)Copyright 2019-2020
'''
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.stats import chi2
from dsct_tools import data_describe
def calc_chi2(count, group1, group2):
    """
    Compute the chi-square statistic between two groups of values.
    :param count: DataFrame, positive/negative sample counts for each value of the variable being binned
    :param group1: list, values in the first group
    :param group2: list, values in the second group
    :return: chi-square value of the pair of groups
    """
    count_intv1 = count.loc[count.index.isin(group1)].sum(axis=0).values
    count_intv2 = count.loc[count.index.isin(group2)].sum(axis=0).values
    # build the 2 x n_classes contingency table
    count_intv = np.vstack((count_intv1, count_intv2))
    row_sum = count_intv.sum(axis=1)
    col_sum = count_intv.sum(axis=0)
    total_sum = count_intv.sum()
    # expected counts under independence
    count_exp = np.ones(count_intv.shape) * col_sum / total_sum
    count_exp = (count_exp.T * row_sum).T
    # chi-square statistic; cells with zero expected count contribute 0
    chi2_value = (count_intv - count_exp) ** 2 / count_exp
    chi2_value[count_exp == 0] = 0
    return chi2_value.sum()
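A toy check of calc_chi2 (all numbers below are made up): the index holds the distinct values of the variable and each column holds per-class sample counts, which is the shape the function reads. Adjacent values with a similar class mix yield a small chi-square and are the ones ChiMerge will merge first.

count = pd.DataFrame({0: [30, 25, 5], 1: [10, 15, 35]}, index=[1.0, 2.0, 3.0])
print(calc_chi2(count, [1.0], [2.0]))   # ~1.45: similar class mix, a good candidate to merge
print(calc_chi2(count, [2.0], [3.0]))   # ~21.3: very different class mix, keep the split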
def merge_adjacent_intervals(count, chi2_list, group):
    """
    Merge the pair of adjacent groups with the smallest chi-square value and update the chi-square list.
    :param count: DataFrame, positive/negative sample counts for each value of the variable being binned
    :param chi2_list: list, chi-square value of each pair of adjacent groups
    :param group: list, current grouping
    :return: updated chi-square list and grouping
    """
    min_idx = chi2_list.index(min(chi2_list))
    # merge the adjacent pair with the smallest chi-square value
    group[min_idx] = group[min_idx] + group[min_idx + 1]
    group.pop(min_idx + 1)
    # recompute the chi-square values of the merged group against its new neighbours
    if min_idx == 0:
        chi2_list.pop(min_idx)
        chi2_list[min_idx] = calc_chi2(count, group[min_idx], group[min_idx + 1])
    elif min_idx == len(group) - 1:
        chi2_list[min_idx - 1] = calc_chi2(count, group[min_idx - 1], group[min_idx])
        chi2_list.pop(min_idx)
    else:
        chi2_list[min_idx - 1] = calc_chi2(count, group[min_idx - 1], group[min_idx])
        chi2_list.pop(min_idx)
        chi2_list[min_idx] = calc_chi2(count, group[min_idx], group[min_idx + 1])
    return chi2_list, group
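One merge step on the same kind of toy table (again made-up numbers): the pair of adjacent groups with the smallest chi-square value is fused, and the surviving chi-square entry is recomputed against the new neighbour.

count = pd.DataFrame({0: [30, 25, 5], 1: [10, 15, 35]}, index=[1.0, 2.0, 3.0])
group = [[1.0], [2.0], [3.0]]
chi2_list = [calc_chi2(count, group[i], group[i + 1]) for i in range(len(group) - 1)]
chi2_list, group = merge_adjacent_intervals(count, chi2_list, group)
print(group)       # [[1.0, 2.0], [3.0]]: the lowest-chi2 pair was merged
print(chi2_list)   # a single value, recomputed between the merged group and [3.0]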
def calc_inconsistency_rate(count, group):
    """
    Compute the inconsistency rate of a grouping; see the paper "Feature Selection via Discretizations".
    :param count: DataFrame, class distribution for each value of the variable being binned
    :param group: list, grouping
    :return: float, inconsistency rate of the grouping
    """
    inconsistency_rate = 0.0
    for intv in group:
        count_intv = count.loc[count.index.isin(intv)].sum(axis=0)
        # instances in the interval that do not belong to its majority class are inconsistent
        inconsistency_rate += count_intv.sum() - max(count_intv)
    inconsistency_rate = inconsistency_rate / count.sum().sum()
    return inconsistency_rate
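A quick sanity check on the same toy counts (made-up data): keeping all three values separate gives an inconsistency rate of 0.25, while merging the two values whose majority classes differ raises it, i.e. that merge degrades class purity.

count = pd.DataFrame({0: [30, 25, 5], 1: [10, 15, 35]}, index=[1.0, 2.0, 3.0])
print(calc_inconsistency_rate(count, [[1.0], [2.0], [3.0]]))   # 0.25: every value kept separate
print(calc_inconsistency_rate(count, [[1.0], [2.0, 3.0]]))     # ~0.33: merging bins with different majority classes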
def Chi_Merge(count, max_interval=6, sig_level=0.05):
    """
    ChiMerge chi-square discretization.
    :param count: DataFrame, positive/negative sample counts for each value of the variable being binned
    :param max_interval: int, maximum number of bins
    :param sig_level: significance level = 1 - confidence level
    :return: grouping information (group)
    """
    print("ChiMerge binning started:")
    deg_freedom = len(count.columns) - 1  # degrees of freedom = number of classes - 1
    chi2_threshold = chi2.ppf(1 - sig_level, deg_freedom)  # chi-square threshold
    # start with every distinct value in its own group
    group = np.array(count.index).reshape(-1, 1).tolist()