python最优分箱计算iv值_对数据集进行最优分箱和WOE转换

对数据集分箱的方式有三种:等宽、等频和最优。下面介绍如何对数据集进行最优分箱;关于分箱的其他介绍可以查看其他博文,这里不再细说:

大体步骤:

加载数据;

遍历所有的feature, 分别处理离散和连续特征;

得到IV树;

递归遍历IV树,得到分割点构成的列表;

去掉不符合条件的分割点,得到最优分割点列表;

遍历最优分割点列表,将最优分割点信息注入到InfoValue对象中;

将每个特征构成的对象放到规则集中(是一个列表);

通过规则集对test进行WOE转换;

将规则集存一份到csv中,可以直观的查看;

dump一份形成model以供转换数据集使用。

woe.py

#!/usr/bin/python

# -*- coding:utf-8 -*-

import os

import numpy as np

import pandas as pd

import copy

from sklearn.externals import joblib

from sklearn.model_selection import KFold

pd.set_option('display.max_rows', 500)

pd.set_option('display.max_columns', 500)

pd.set_option('display.width', 1000)

__all__ = ["WoeFeatureProcess"]

class DisInfoValue(object):
    """Store the WOE-conversion information of a discrete feature."""

    def __init__(self):
        # Name of the feature; stays None until a caller fills it in.
        self.var_name = None
        # NOTE(review): presumably the original category values of the
        # feature and the WOE values associated with them -- confirm against
        # the code that populates these lists.
        self.origin_value = []
        self.woe_before = []

class Node(object):
    """Tree node class.

    :param var_name: name of the variable this node belongs to
    :param iv: value stored as the node's ``iv`` (defaults to 0)
    :param split_point: split points list
    :param right: right sub tree (another ``Node`` or None)
    :param left: left sub tree (another ``Node`` or None)
    """

    def __init__(self, var_name=None, iv=0, split_point=None, right=None, left=None):
        self.var_name = var_name
        self.iv = iv
        self.split_point = split_point  # split points list
        self.right = right  # right sub tree
        self.left = left  # left sub tree

class InfoValue(object):
    """Per-feature record of binning statistics.

    NOTE(review): the meaning of each field is inferred from its name only
    (split points, per-bin WOE/IV, per-bin sample counts and rates); confirm
    against the code that fills these attributes.
    """

    # Attributes copied from another instance by ``init``.
    _FIELDS = (
        'var_name',
        'split_list',
        'iv',
        'woe_list',
        'iv_list',
        'is_discrete',
        'sub_total_sample_num',
        'positive_sample_num',
        'negative_sample_num',
        'sub_total_num_percentage',
        'positive_rate_in_sub_total',
        'negative_rate_in_sub_total',
    )

    def __init__(self):
        self.var_name = []
        self.split_list = []
        self.iv = 0
        self.woe_list = []
        self.iv_list = []
        self.is_discrete = 0
        self.sub_total_sample_num = []
        self.positive_sample_num = []
        self.negative_sample_num = []
        self.sub_total_num_percentage = []
        self.positive_rate_in_sub_total = []
        self.negative_rate_in_sub_total = []

    def init(self, civ):
        """Take over every field of ``civ``.

        The values are assigned by reference, not copied, so list attributes
        end up shared between ``self`` and ``civ``.

        :param civ: object exposing all attributes listed in ``_FIELDS``
        :return: None
        """
        for field in self._FIELDS:
            setattr(self, field, getattr(civ, field))

class WoeFeatureProcess(object):

    def __init__(self, continuous_fillna=-1, discrete_fillna='missing', alpha=0.05,
                 train_start='2017-09-01', train_end='2017-12-01',
                 test_start='2017-12-01', test_end='2017-12-31'):
        """Store the configuration; no data is loaded here.

        :param continuous_fillna: stored fill value for continuous features
        :param discrete_fillna: stored fill value for discrete features
        :param alpha: stored as-is; its use is outside this section
        :param train_start: inclusive lower bound of the training window
        :param train_end: exclusive upper bound of the training window
        :param test_start: inclusive lower bound of the test window
        :param test_end: exclusive upper bound of the test window
        """
        self.__conf = None
        self.__dataset_all = None
        self.__dataset_train = None
        self.__dataset_test = None
        self.__dataset_rest = None
        self.__variable_type = None
        self.__bin_var_list = []
        self.__discrete_var_list = []
        self.__identify_var_list = []
        self.__model_var_list = []
        self.__rule_list = []  # holds InfoValue objects
        self.__continuous_fillna = continuous_fillna
        self.__discrete_fillna = discrete_fillna
        self.__train_start = train_start
        self.__train_end = train_end
        self.__test_start = test_start
        self.__test_end = test_end
        self.__alpha = alpha

    def load_file(self, config_path=None, data_path=None, nrows=50000):
        """
        load dataset and split dataframe into train , test subsets and rest set

        :param config_path: CSV with the columns var_name, var_dtype,
            is_candidate, is_tobe_bin, is_modelfeature and is_user_identify
        :param data_path: CSV with at least the columns overdue_day and
            create_time
        :param nrows: maximum number of data rows to read (passed to
            ``pd.read_csv``); None reads the whole file
        :return: None
        :raises ValueError: if either path is None or is not an existing file
        """
        # os.path.isfile(None) raises TypeError, so reject None explicitly to
        # keep the documented ValueError for the default arguments.
        if (config_path is None or data_path is None
                or not (os.path.isfile(config_path) and os.path.isfile(data_path))):
            raise ValueError("some file path does not exist, please check config_path, data_path")

        self.__conf = pd.read_csv(config_path)
        conf = self.__conf
        self.__variable_type = dict(zip(conf['var_name'], conf['var_dtype']))

        is_candidate = conf['is_candidate'] == 1
        # continuous features (candidates that are to be binned)
        self.__bin_var_list = conf[is_candidate & (conf['is_tobe_bin'] == 1)]['var_name']
        # discrete features (candidates that are not binned)
        self.__discrete_var_list = conf[is_candidate & (conf['is_tobe_bin'] == 0)]['var_name']
        # features that enter the model
        self.__model_var_list = conf[conf['is_modelfeature'] == 1]['var_name']
        # user identifier columns
        self.__identify_var_list = conf[conf['is_user_identify'] == 1]['var_name']

        self.__dataset_all = pd.read_csv(data_path, nrows=nrows).rename(
            columns={'overdue_day': 'target'})
        # Binarise the label: more than 7 overdue days counts as 1.
        self.__dataset_all['target'] = self.__dataset_all['target'].apply(
            lambda x: 1 if x > 7 else 0)
        # NOTE(review): assumes create_time is stored as YYYYMMDD -- confirm.
        # The slicing turns it into 'YYYY-MM-DD' so that it can be compared
        # as a string with the window bounds below.
        self.__dataset_all['create_time'] = self.__dataset_all['create_time'].astype(str)
        self.__dataset_all['create_time'] = self.__dataset_all['create_time'].apply(
            lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:])

        self.__fillna()
        self.__change_var_dtype()

        create_time = self.__dataset_all['create_time']
        self.__dataset_train = self.__dataset_all[(create_time >= self.__train_start)
                                                  & (create_time < self.__train_end)]
        self.__dataset_test = self.__dataset_all[(create_time >= self.__test_start)
                                                 & (create_time < self.__test_end)]
        self.__dataset_rest = self.__dataset_all[(create_time < self.__train_start)
                                                 | (create_time >= self.__test_end)]

        train_total = self.__dataset_train.shape[0]
        test_total = self.__dataset_test.shape[0]
        print('train: test = {}:{}'.format(train_total, test_total))

        train_bad = self.__dataset_train[self.__dataset_train['target'] == 1].shape[0]
        test_bad = self.__dataset_test[self.__dataset_test['target'] == 1].shape[0]
        # Share of bad users in the training set and in the test set; an
        # empty window yields nan instead of a ZeroDivisionError.
        train_rate = train_bad * 1.0 / train_total if train_total else float('nan')
        test_rate = test_bad * 1.0 / test_total if test_total else float('nan')
        print('train_p: test_p = {}:{}'.format(train_rate, test_rate))

def fit(self, woed_train_path=None, woed_test_path=None, feature_detail_path&#

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值