python 数据分箱_PYTHON评分卡建模_分箱代码

import numpy as np
import pandas as pd

# Load the sample data set. It must contain at least the variable column(s)
# to be binned and a `target` column.
sample_set = pd.read_excel('/数据样本.xlsx')

def calc_score_median(sample_set, var):
    """Return the midpoints between adjacent distinct values of a variable.

    The midpoints are the candidate cut points for a binary (decision-tree
    style) split of ``sample_set`` on ``var``.

    :param sample_set: sample to be split (indexable by column name).
    :param var: name of the variable to split on.
    :return: list of floats, one per pair of adjacent distinct values, in
        ascending order; empty when fewer than two distinct values exist.
    """
    # Missing values are dropped first: a NaN midpoint can never separate
    # any rows, so it is not a usable cut point.
    distinct_values = np.unique(sample_set[var].dropna())
    midpoints = (distinct_values[:-1] + distinct_values[1:]) / 2
    return midpoints.tolist()

def _gini_impurity(positive_cnt, total_cnt):
    """Return the two-class Gini impurity for the given positive/total counts."""
    negative_cnt = total_cnt - positive_cnt
    return (
        1
        - np.square(positive_cnt / total_cnt)
        - np.square(negative_cnt / total_cnt)
    )


def choose_best_split(sample_set, var, min_sample):
    """Pick the cut point on ``var`` with the largest Gini gain (CART style).

    Candidate cut points come from ``calc_score_median``. A candidate is
    skipped when either side would hold fewer than ``min_sample`` rows.

    :param sample_set: sample to be split; must have a ``target`` column whose
        sum is used as the count of positive rows (assumes 0/1 labels).
    :param var: name of the variable to split on.
    :param min_sample: minimum number of rows required on each side.
    :return: ``(split_point, split_position)``. ``split_position`` is the
        index of the chosen candidate scaled to ``[0, 1]`` (0.0 = first
        candidate, 1.0 = last). ``(0.0, 0.0)`` is returned when no candidate
        qualifies; callers treat a split point of ``0.0`` as "no split".
    """
    best_gain = 0.0
    best_split_point = 0.0
    best_split_position = 0.0

    # Midpoints between adjacent distinct values of the variable.
    candidates = calc_score_median(sample_set, var)
    candidate_cnt = len(candidates)
    total_cnt = sample_set.shape[0]
    if total_cnt == 0 or candidate_cnt == 0:
        return best_split_point, best_split_position

    values = sample_set[var]
    target = sample_set['target']
    parent_gini = _gini_impurity(target.sum(), total_cnt)
    # With a single candidate the index is divided by 1 instead of 0.
    position_scale = candidate_cnt - 1 if candidate_cnt > 1 else candidate_cnt

    for i, split_point in enumerate(candidates):
        left_target = target[values < split_point]
        right_target = target[values > split_point]
        left_cnt = len(left_target)
        right_cnt = len(right_target)

        if left_cnt < min_sample or right_cnt < min_sample:
            continue
        # Only reachable when min_sample <= 0; avoids dividing by zero below.
        if left_cnt == 0 or right_cnt == 0:
            continue

        gini_left = _gini_impurity(left_target.sum(), left_cnt)
        gini_right = _gini_impurity(right_target.sum(), right_cnt)
        left_ratio = left_cnt / total_cnt
        right_ratio = right_cnt / total_cnt
        gain = parent_gini - (left_ratio * gini_left + right_ratio * gini_right)

        # Strict comparison: on ties the earliest candidate is kept.
        if gain > best_gain:
            best_gain = gain
            best_split_point = split_point
            best_split_position = i / position_scale

    return best_split_point, best_split_position

def bining_data_split(sample_set, var, min_sample, split_list):
    """Recursively split ``sample_set`` on ``var`` and collect the cut points.

    The best cut point is found with ``choose_best_split``; the sample is then
    partitioned around it and each side is split again while it is large
    enough. Cut points are appended to ``split_list`` in discovery order
    (parent first, then left side, then right side).

    :param sample_set: sample to be split.
    :param var: name of the variable to split on.
    :param min_sample: minimum number of rows required in each bin.
    :param split_list: list that receives the cut points (mutated in place).
    :return: None.
    """
    split, position = choose_best_split(sample_set, var, min_sample)
    # choose_best_split reports "no usable split" as a split point of 0.0.
    if split == 0.0:
        return
    split_list.append(split)

    # Partition the sample around the cut point and keep splitting.
    values = sample_set[var]
    sample_set_left = sample_set[values < split]
    sample_set_right = sample_set[values > split]

    # Split the left side when it holds at least twice the minimum size and
    # the cut is not the first candidate (otherwise the left side has a
    # single distinct value and cannot be split further).
    if len(sample_set_left) >= min_sample * 2 and position != 0.0:
        bining_data_split(sample_set_left, var, min_sample, split_list)

    # Split the right side when it holds at least twice the minimum size and
    # the cut is not the last candidate (same reasoning, mirrored).
    if len(sample_set_right) >= min_sample * 2 and position != 1.0:
        bining_data_split(sample_set_right, var, min_sample, split_list)

def get_bestsplit_list(sample_set, var, min_sample_ratio=0.05):
    """Return the list of cut points found by recursive binning on ``var``.

    :param sample_set: sample to be split.
    :param var: name of the variable to split on.
    :param min_sample_ratio: minimum bin size as a fraction of the number of
        rows in ``sample_set``; defaults to 0.05 (5%).
    :return: list of cut points in the order they were discovered by
        ``bining_data_split`` (not sorted).
    :raises ValueError: if ``min_sample_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= min_sample_ratio <= 1:
        raise ValueError("min_sample_ratio must be between 0 and 1")

    # Minimum bin size; this is the stopping condition of the recursion.
    min_df = sample_set.shape[0] * min_sample_ratio
    split_list = []
    bining_data_split(sample_set, var, min_df, split_list)
    return split_list

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
# CART-based optimal binning for continuous variables.
#
# `binning_continuous_var` is the entry point: it takes the data to bin, the
# target variable, the minimum leaf size, the maximum number of bins and a
# flag saying whether the bin edges should be returned. It loops over the
# numeric feature columns, bins each one with `bin_continuous_var`, and writes
# the result back into the data set.
#
# `bin_continuous_var` bins a single column: it fits a regression tree on
# that column, limited to `max_bins` leaves, uses the tree's split thresholds
# as bin edges, and replaces each value with the mean target of its bin.

import numpy as np
import pandas as pd


def binning_continuous_var(data, target, min_samples_leaf=50, max_bins=10, return_bins=False):
    """Bin every numeric feature column of ``data`` against ``target``.

    :param data: DataFrame of feature columns.
    :param target: named Series holding the target variable.
    :param min_samples_leaf: minimum number of rows per bin.
    :param max_bins: maximum number of bins per column.
    :param return_bins: when true, also return the bin edges.
    :return: the DataFrame (features plus target column) with each numeric
        feature replaced by its binned value; when ``return_bins`` is true, a
        ``(DataFrame, dict)`` pair where the dict maps column name to its
        list of bin edges.
    """
    data = pd.concat([data, target], axis=1)
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    # The target is numeric too; it must not be binned or overwritten.
    cont_cols = [col for col in numeric_cols if col != target.name]

    bins_by_col = {}
    for col in cont_cols:
        binned_col, bins = bin_continuous_var(data, col, target, min_samples_leaf, max_bins)
        data[col] = binned_col
        bins_by_col[col] = bins

    if return_bins:
        return data, bins_by_col
    return data


def bin_continuous_var(data, col, target, min_samples_leaf, max_bins):
    """Bin one column with a regression tree fitted against ``target``.

    :param data: DataFrame containing ``col`` and the target column
        (``target.name``).
    :param col: name of the column to bin.
    :param target: named Series holding the target variable.
    :param min_samples_leaf: minimum number of rows per bin.
    :param max_bins: maximum number of bins; passed to the tree as
        ``max_leaf_nodes``.
    :return: ``(binned_col, bins)``. ``binned_col`` holds, for every row, the
        mean target of the row's bin rounded to 4 decimals; ``bins`` is
        ``[min, threshold..., max]``. A constant column is returned unchanged
        with an empty ``bins`` list.
    """
    feature = data[col]
    lower, upper = feature.min(), feature.max()
    if upper - lower == 0:
        return feature, []

    # Imported here so that constant columns can be handled even when
    # scikit-learn is not installed.
    from sklearn.tree import DecisionTreeRegressor

    # The default criterion is squared error; the old 'mse' alias is no
    # longer accepted by recent scikit-learn releases.
    tree_model = DecisionTreeRegressor(
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_bins,
        random_state=42,
    )
    tree_model.fit(feature.to_frame(), target)

    # Only internal nodes carry a split threshold; leaves are marked with a
    # child index of -1.
    fitted_tree = tree_model.tree_
    thresholds = np.sort(fitted_tree.threshold[fitted_tree.children_left != -1])
    bins = [lower] + thresholds.tolist() + [upper]

    # The tree sends x <= threshold to the left, so bin i holds the values in
    # (thresholds[i - 1], thresholds[i]].
    bin_index = pd.Series(
        np.searchsorted(thresholds, feature.to_numpy(), side="left"),
        index=data.index,
    )
    bin_mean = data[target.name].groupby(bin_index).mean().round(4)
    binned_col = bin_index.map(bin_mean)
    return binned_col, bins
# Usage: pass the feature columns and the target series to
# `binning_continuous_var`, for example:
if __name__ == "__main__":
    # Generate test data (seeded so the example is reproducible).
    rng = np.random.default_rng(42)
    data = pd.DataFrame({
        'col1': rng.random(1000),
        'col2': rng.random(1000),
        'col3': rng.random(1000),
        'target': rng.integers(0, 2, 1000),
    })

    # Run the optimal binning.
    data_binned = binning_continuous_var(
        data.drop('target', axis=1),
        data['target'],
        min_samples_leaf=50,
        max_bins=10,
        return_bins=False,
    )

# That is the complete CART-based optimal binning example.
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值