对数据集分箱的方式有三种:等宽、等频、最优。下面介绍对数据集进行最优分箱,分箱的其他介绍可以查看其他的博文,具体在这就不细说了:
大体步骤:
加载数据;
遍历所有的feature, 分别处理离散和连续特征;
得到IV树;
递归遍历IV树,得到分割点构成的列表;
去掉不符合条件的分割点,得到最优分割点列表;
遍历最优分割点列表,将最优分割点信息注入到InfoValue对象中;
将每个特征构成的对象放到规则集中(是一个列表);
通过规则集对test进行WOE转换;
将规则集存一份到csv中,可以直观的查看;
dump一份形成model以供转换数据集使用。
woe.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import copy
import os

import numpy as np
import pandas as pd

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone joblib package, fall back for old installs.
try:
    import joblib
except ImportError:  # legacy scikit-learn (< 0.23)
    from sklearn.externals import joblib
from sklearn.model_selection import KFold

# Show wide/long DataFrames in full when printing rule tables for inspection.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

__all__ = ["WoeFeatureProcess"]
class DisInfoValue(object):
    """Container for the WOE conversion info of one discrete feature.

    NOTE(review): ``origin_value`` and ``woe_before`` look like parallel
    lists (raw category value -> its pre-binning WOE value) — confirm
    against the code that fills them.
    """

    def __init__(self):
        self.var_name = None  # feature (column) name
        # filled in later by the discrete-feature processing step
        self.origin_value, self.woe_before = [], []
class Node(object):
    """A node of the binary IV tree built while searching for split points."""

    def __init__(self, var_name=None, iv=0, split_point=None, right=None, left=None):
        # Parameter names, order and defaults are kept identical for callers.
        self.var_name = var_name        # feature this node belongs to
        self.iv = iv                    # information value at this node
        self.split_point = split_point  # list of candidate split points
        self.left = left                # left subtree
        self.right = right              # right subtree
class InfoValue(object):
    """Binning result ("rule") for one feature: split points, WOE/IV values
    and per-bin sample statistics."""

    def __init__(self):
        self.iv = 0           # total information value of the feature
        self.is_discrete = 0  # 1 for a discrete feature, 0 for continuous
        # Every remaining attribute starts as an independent empty list.
        for attr in ('var_name', 'split_list', 'woe_list', 'iv_list',
                     'sub_total_sample_num', 'positive_sample_num',
                     'negative_sample_num', 'sub_total_num_percentage',
                     'positive_rate_in_sub_total',
                     'negative_rate_in_sub_total'):
            setattr(self, attr, [])

    def init(self, civ):
        """Copy every field (by reference) from another InfoValue-like object.

        :param civ: source object carrying the same attribute set.
        """
        for attr in ('var_name', 'split_list', 'iv', 'woe_list', 'iv_list',
                     'is_discrete', 'sub_total_sample_num',
                     'positive_sample_num', 'negative_sample_num',
                     'sub_total_num_percentage', 'positive_rate_in_sub_total',
                     'negative_rate_in_sub_total'):
            setattr(self, attr, getattr(civ, attr))
class WoeFeatureProcess(object):
def __init__(self, continuous_fillna=-1, discrete_fillna='missing', alpha=0.05,
             train_start='2017-09-01', train_end='2017-12-01', test_start='2017-12-01', test_end='2017-12-31'):
    """Set up the processor with fill values, significance level and the
    date windows used to split the dataset into train/test partitions.
    """
    # --- user-supplied configuration ---
    self.__continuous_fillna = continuous_fillna  # NaN fill for continuous vars
    self.__discrete_fillna = discrete_fillna      # NaN fill for discrete vars
    self.__alpha = alpha
    self.__train_start, self.__train_end = train_start, train_end
    self.__test_start, self.__test_end = test_start, test_end
    # --- populated later by load_file() ---
    self.__conf = None
    self.__dataset_all = None
    self.__dataset_train = None
    self.__dataset_test = None
    self.__dataset_rest = None
    self.__variable_type = None
    # feature-name lists parsed from the config file
    self.__bin_var_list = []       # continuous features to be binned
    self.__discrete_var_list = []  # discrete features
    self.__identify_var_list = []  # user-identifier columns
    self.__model_var_list = []     # features entering the model
    self.__rule_list = []          # holds one InfoValue object per feature
def load_file(self, config_path=None, data_path=None):
    """Load the feature config and raw dataset, then split the dataset into
    train / test / rest partitions by ``create_time``.

    :param config_path: CSV describing each variable (columns used here:
        var_name, var_dtype, is_candidate, is_tobe_bin, is_modelfeature,
        is_user_identify).
    :param data_path: CSV of raw samples; must contain ``overdue_day`` and
        ``create_time`` columns.
    :raises ValueError: if either path does not point to an existing file.
    """
    # Guard clause replaces the original's no-op self-assignments.
    if not (os.path.isfile(config_path) and os.path.isfile(data_path)):
        raise ValueError("some file path does not exist, please check config_path, data_path")
    self.__conf = pd.read_csv(config_path)
    # var_name -> declared dtype, consumed by __change_var_dtype()
    self.__variable_type = dict(zip(self.__conf['var_name'], self.__conf['var_dtype']))
    # candidate continuous features (to be binned)
    self.__bin_var_list = self.__conf[(self.__conf['is_candidate'] == 1)
                                      & (self.__conf['is_tobe_bin'] == 1)]['var_name']
    # candidate discrete features
    self.__discrete_var_list = self.__conf[(self.__conf['is_candidate'] == 1)
                                           & (self.__conf['is_tobe_bin'] == 0)]['var_name']
    # features that enter the model
    self.__model_var_list = self.__conf[self.__conf['is_modelfeature'] == 1]['var_name']
    # user-identifier columns
    self.__identify_var_list = self.__conf[self.__conf['is_user_identify'] == 1][
        'var_name']
    # NOTE(review): only the first 50000 rows are loaded — confirm this cap.
    self.__dataset_all = pd.read_csv(data_path, nrows=50000).rename(columns={'overdue_day': 'target'})
    # Binary label: overdue for more than 7 days counts as bad (1).
    self.__dataset_all['target'] = self.__dataset_all['target'].apply(lambda x: 1 if x > 7 else 0)
    # Normalize create_time 'YYYYMMDD...' -> 'YYYY-MM-DD...' so that plain
    # string comparison against the window bounds works.
    self.__dataset_all['create_time'] = self.__dataset_all['create_time'].astype(str)
    self.__dataset_all['create_time'] = self.__dataset_all['create_time'].apply(
        lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:])
    self.__fillna()
    self.__change_var_dtype()
    # Time-based split: [train_start, train_end) and [test_start, test_end);
    # everything outside both windows goes to the "rest" partition.
    self.__dataset_train = self.__dataset_all[(self.__dataset_all['create_time'] >= self.__train_start)
                                              & (self.__dataset_all['create_time'] < self.__train_end)]
    self.__dataset_test = self.__dataset_all[(self.__dataset_all['create_time'] >= self.__test_start)
                                             & (self.__dataset_all['create_time'] < self.__test_end)]
    self.__dataset_rest = self.__dataset_all[(self.__dataset_all['create_time'] < self.__train_start)
                                             | (self.__dataset_all['create_time'] >= self.__test_end)]
    print('train: test = {}:{}'.format(self.__dataset_train.shape[0], self.__dataset_test.shape[0]))
    train_bad = self.__dataset_train[self.__dataset_train['target'] == 1].shape[0]
    test_bad = self.__dataset_test[self.__dataset_test['target'] == 1].shape[0]
    # Bad-user ratio in the train and test sets.
    # NOTE(review): raises ZeroDivisionError if either partition is empty.
    print('train_p: test_p = {}:{}'.format(train_bad * 1.0 / self.__dataset_train.shape[0],
                                           test_bad * 1.0 / self.__dataset_test.shape[0]))
def fit(self, woed_train_path=None, woed_test_path=None, feature_detail_path&#