对数据集分箱的方式有三种:等宽、等频、最优。下面介绍对数据集进行最优分箱,分箱的其他介绍可以查看其他的博文,具体在这就不细说了:
大体步骤:
加载数据;
遍历所有的feature, 分别处理离散和连续特征;
得到IV树;
递归遍历IV树,得到分割点构成的列表;
去掉不符合条件的分割点,得到最优分割点列表;
遍历最优分割点列表,将最优分割点信息注入到InfoValue对象中;
将每个特征构成的对象放到规则集中(是一个列表);
通过规则集对test进行WOE转换;
将规则集存一份到csv中,可以直观的查看;
dump一份形成model以供转换数据集使用。
woe.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import copy
import os

import numpy as np
import pandas as pd

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone joblib package, fall back for old installs.
try:
    import joblib
except ImportError:  # legacy scikit-learn (< 0.23)
    from sklearn.externals import joblib
from sklearn.model_selection import KFold

# Show wide/long DataFrames in full when printing rule tables for inspection.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

__all__ = ["WoeFeatureProcess"]
class DisInfoValue(object):
    """Container for the WOE conversion info of one discrete feature.

    NOTE(review): ``origin_value`` and ``woe_before`` look like parallel
    lists (raw category value -> its pre-binning WOE value) — confirm
    against the code that fills them.
    """

    def __init__(self):
        self.var_name = None  # feature (column) name
        # filled in later by the discrete-feature processing step
        self.origin_value, self.woe_before = [], []
class Node(object):
    """A node of the binary IV tree built while searching for split points."""

    def __init__(self, var_name=None, iv=0, split_point=None, right=None, left=None):
        # Parameter names, order and defaults are kept identical for callers.
        self.var_name = var_name        # feature this node belongs to
        self.iv = iv                    # information value at this node
        self.split_point = split_point  # list of candidate split points
        self.left = left                # left subtree
        self.right = right              # right subtree
class InfoValue(object):
    """Binning result ("rule") for one feature: split points, WOE/IV values
    and per-bin sample statistics."""

    def __init__(self):
        self.iv = 0           # total information value of the feature
        self.is_discrete = 0  # 1 for a discrete feature, 0 for continuous
        # Every remaining attribute starts as an independent empty list.
        for attr in ('var_name', 'split_list', 'woe_list', 'iv_list',
                     'sub_total_sample_num', 'positive_sample_num',
                     'negative_sample_num', 'sub_total_num_percentage',
                     'positive_rate_in_sub_total',
                     'negative_rate_in_sub_total'):
            setattr(self, attr, [])

    def init(self, civ):
        """Copy every field (by reference) from another InfoValue-like object.

        :param civ: source object carrying the same attribute set.
        """
        for attr in ('var_name', 'split_list', 'iv', 'woe_list', 'iv_list',
                     'is_discrete', 'sub_total_sample_num',
                     'positive_sample_num', 'negative_sample_num',
                     'sub_total_num_percentage', 'positive_rate_in_sub_total',
                     'negative_rate_in_sub_total'):
            setattr(self, attr, getattr(civ, attr))
class WoeFeatureProcess(object):
def __init__(self, continuous_fillna=-1, discrete_fillna='missing', alpha=0.05,
             train_start='2017-09-01', train_end='2017-12-01', test_start='2017-12-01', test_end='2017-12-31'):
    """Set up the processor with fill values, significance level and the
    date windows used to split the dataset into train/test partitions.
    """
    # --- user-supplied configuration ---
    self.__continuous_fillna = continuous_fillna  # NaN fill for continuous vars
    self.__discrete_fillna = discrete_fillna      # NaN fill for discrete vars
    self.__alpha = alpha
    self.__train_start, self.__train_end = train_start, train_end
    self.__test_start, self.__test_end = test_start, test_end
    # --- populated later by load_file() ---
    self.__conf = None
    self.__dataset_all = None
    self.__dataset_train = None
    self.__dataset_test = None
    self.__dataset_rest = None
    self.__variable_type = None
    # feature-name lists parsed from the config file
    self.__bin_var_list = []       # continuous features to be binned
    self.__discrete_var_list = []  # discrete features
    self.__identify_var_list = []  # user-identifier columns
    self.__model_var_list = []     # features entering the model
    self.__rule_list = []          # holds one InfoValue object per feature
def load_file(self, config_path=None, data_path=None):
    """Load the feature config and raw dataset, then split the dataset into
    train / test / rest partitions by ``create_time``.

    :param config_path: CSV describing each variable (columns used here:
        var_name, var_dtype, is_candidate, is_tobe_bin, is_modelfeature,
        is_user_identify).
    :param data_path: CSV of raw samples; must contain ``overdue_day`` and
        ``create_time`` columns.
    :raises ValueError: if either path does not point to an existing file.
    """
    # Guard clause replaces the original's no-op self-assignments.
    if not (os.path.isfile(config_path) and os.path.isfile(data_path)):
        raise ValueError("some file path does not exist, please check config_path, data_path")
    self.__conf = pd.read_csv(config_path)
    # var_name -> declared dtype, consumed by __change_var_dtype()
    self.__variable_type = dict(zip(self.__conf['var_name'], self.__conf['var_dtype']))
    # candidate continuous features (to be binned)
    self.__bin_var_list = self.__conf[(self.__conf['is_candidate'] == 1)
                                      & (self.__conf['is_tobe_bin'] == 1)]['var_name']
    # candidate discrete features
    self.__discrete_var_list = self.__conf[(self.__conf['is_candidate'] == 1)
                                           & (self.__conf['is_tobe_bin'] == 0)]['var_name']
    # features that enter the model
    self.__model_var_list = self.__conf[self.__conf['is_modelfeature'] == 1]['var_name']
    # user-identifier columns
    self.__identify_var_list = self.__conf[self.__conf['is_user_identify'] == 1][
        'var_name']
    # NOTE(review): only the first 50000 rows are loaded — confirm this cap.
    self.__dataset_all = pd.read_csv(data_path, nrows=50000).rename(columns={'overdue_day': 'target'})
    # Binary label: overdue for more than 7 days counts as bad (1).
    self.__dataset_all['target'] = self.__dataset_all['target'].apply(lambda x: 1 if x > 7 else 0)
    # Normalize create_time 'YYYYMMDD...' -> 'YYYY-MM-DD...' so that plain
    # string comparison against the window bounds works.
    self.__dataset_all['create_time'] = self.__dataset_all['create_time'].astype(str)
    self.__dataset_all['create_time'] = self.__dataset_all['create_time'].apply(
        lambda x: x[:4] + '-' + x[4:6] + '-' + x[6:])
    self.__fillna()
    self.__change_var_dtype()
    # Time-based split: [train_start, train_end) and [test_start, test_end);
    # everything outside both windows goes to the "rest" partition.
    self.__dataset_train = self.__dataset_all[(self.__dataset_all['create_time'] >= self.__train_start)
                                              & (self.__dataset_all['create_time'] < self.__train_end)]
    self.__dataset_test = self.__dataset_all[(self.__dataset_all['create_time'] >= self.__test_start)
                                             & (self.__dataset_all['create_time'] < self.__test_end)]
    self.__dataset_rest = self.__dataset_all[(self.__dataset_all['create_time'] < self.__train_start)
                                             | (self.__dataset_all['create_time'] >= self.__test_end)]
    print('train: test = {}:{}'.format(self.__dataset_train.shape[0], self.__dataset_test.shape[0]))
    train_bad = self.__dataset_train[self.__dataset_train['target'] == 1].shape[0]
    test_bad = self.__dataset_test[self.__dataset_test['target'] == 1].shape[0]
    # Bad-user ratio in the train and test sets.
    # NOTE(review): raises ZeroDivisionError if either partition is empty.
    print('train_p: test_p = {}:{}'.format(train_bad * 1.0 / self.__dataset_train.shape[0],
                                           test_bad * 1.0 / self.__dataset_test.shape[0]))
def fit(self, woed_train_path=None, woed_test_path=None, feature_detail_path&#