1

版权声明:文笔尽力通俗,讲得全,听得懂,学得会,用得上。觉得不错,留下小红心哟❤!!!(转载请标明出处,谢谢!) https://blog.csdn.net/sunyaowu315/article/details/84061980

【博客地址】:https://blog.csdn.net/sunyaowu315

【博客大纲地址】:https://blog.csdn.net/sunyaowu315/article/details/82905347


数据集介绍:

本次案例分析所用的数据,是拍拍贷发起的一次与信贷申请审核工作相关的竞赛数据集。其中共有3份文件:

  • PPD_Training_Master_GBK_3_1_Training_Set.csv :信贷用户在拍拍贷上的申报信息和部分三方数据信息,以及需要预测的目标变量。
  • PPD_LogInfo_3_1_Training_Set : 信贷客户的登录信息
  • PPD_Userupdate_Info_3_1_Training_Set :部分客户的信息修改行为

建模工作就是从上述三个文件中对数据进行加工,提取特征并且建立合适的模型,对贷后表现做预测。

【Logistic原理】:https://blog.csdn.net/sunyaowu315/article/details/87866135


  对数据分析、机器学习、数据科学、金融风控等感兴趣的小伙伴,需要数据集、代码、行业报告等各类学习资料,可添加qq群(资料共享群):102755159,也可添加微信wu805686220,加入微信讨论群,相互学习,共同成长。

在这里插入图片描述

主程序

import pandas as pd
import datetime
import collections
import numpy as np
import numbers
import random
import sys
import sys
_path = r'C:\Users\A3\Desktop\LR_scorecard'
sys.path.append(_path)
import pickle
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from importlib import reload
from matplotlib import pyplot as plt
reload(sys)
#sys.setdefaultencoding( "utf-8")
import scorecard_functions as sf
#from scorecard_functions_V3 import *
from sklearn.linear_model import LogisticRegressionCV
# -*- coding: utf-8 -*-

################################
######## UDF: 自定义函数 ########
################################
### For each time window, compute the cumulative number of records ###
def TimeWindowSelection(df, daysCol, time_windows):
    """Count, for every time window, the rows whose day gap fits inside it.

    :param df: the dataset containing the day-gap variable
    :param daysCol: name of the column holding the number of days
    :param time_windows: iterable of window lengths (in days)
    :return: dict mapping each window length to the number of rows with
        ``df[daysCol] <= window``
    """
    freq_tw = {}
    for tw in time_windows:
        # Vectorised comparison; NaN compares False and is therefore not counted.
        freq_tw[tw] = int((df[daysCol] <= tw).sum())
    return freq_tw

def DeivdedByZero(nominator, denominator):
    """Divide safely.

    Returns 0 when the denominator is 0; otherwise returns the float
    quotient ``nominator / denominator``.
    """
    if denominator == 0:
        return 0
    return nominator * 1.0 / denominator

# Normalise field names that refer to the same thing under different spellings
def ChangeContent(x):
    """Return ``x`` upper-cased, with '_MOBILEPHONE' mapped to '_PHONE'."""
    y = x.upper()
    if y == '_MOBILEPHONE':
        y = '_PHONE'
    return y

def MissingCategorial(df, x):
    """Return the fraction of missing entries in column ``x`` of ``df``.

    A value counts as missing when it is not equal to itself (i.e. NaN).
    An empty frame yields 0.0 instead of raising ZeroDivisionError.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        return 0.0
    # NaN is the only value for which ``v != v`` holds.
    missing_vals = df[x].map(lambda v: int(v != v))
    return sum(missing_vals) * 1.0 / n_rows

def MissingContinuous(df, x):
    """Return the fraction of NaN entries in numeric column ``x`` of ``df``.

    Uses ``np.isnan`` per value, so the column must hold numeric data.
    An empty frame yields 0.0 instead of raising ZeroDivisionError.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        return 0.0
    missing_vals = df[x].map(lambda v: int(np.isnan(v)))
    return sum(missing_vals) * 1.0 / n_rows

def MakeupRandom(x, sampledList):
    """Return ``x`` unless it is NaN, else a random element of ``sampledList``.

    :param x: the value to check; NaN is detected via ``x != x``
    :param sampledList: non-empty sequence to draw the replacement from
    :return: ``x`` itself, or a uniformly drawn element of ``sampledList``
    """
    if x == x:
        return x
    randIndex = random.randint(0, len(sampledList) - 1)
    return sampledList[randIndex]

############################################################
#Step 0: 数据分析的初始工作, 包括读取数据文件、检查用户Id的一致性等#
############################################################

# Folder holding the three competition CSV files (machine-specific path).
folderOfData = 'C:/Users/A3/Desktop/scorecard/'
# data1: login records; data2: master table (GBK-encoded); data3: user-update records.
data1 = pd.read_csv(folderOfData+'PPD_LogInfo_3_1_Training_Set.csv', header = 0)
data2 = pd.read_csv(folderOfData+'PPD_Training_Master_GBK_3_1_Training_Set.csv', header = 0,encoding = 'gbk')
data3 = pd.read_csv(folderOfData+'PPD_Userupdate_Info_3_1_Training_Set.csv', header = 0)

#############################################################################################
# Step 1: 从PPD_LogInfo_3_1_Training_Set & PPD_Userupdate_Info_3_1_Training_Set数据中衍生特征#
#############################################################################################
# Compare whether the four city variables match: 1 when all four are equal, else 0.
# NOTE(review): the comparison operators were lost in the extracted source; the
# chained equality below is reconstructed from the comment above - confirm.
data2['city_match'] = data2.apply(lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20),axis = 1)
# The raw city columns are dropped once the match flag exists.
del data2['UserInfo_2']
del data2['UserInfo_4']
del data2['UserInfo_8']
del data2['UserInfo_20']

### Parse the login and listing dates, compute the gap in days and inspect its distribution
data1['logInfo'] = data1['LogInfo3'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
data1['Listinginfo'] = data1['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d'))
# Vectorised date difference; avoids positional x[0]/x[1] access on a
# label-indexed row, which recent pandas versions no longer support.
data1['ListingGap'] = (data1['Listinginfo'] - data1['logInfo']).dt.days
plt.hist(data1['ListingGap'],bins=200)
plt.title('Days between login date and listing date')
# Cap the gap at 365 days for a second histogram.
ListingGap2 = data1['ListingGap'].map(lambda x: min(x,365))
plt.hist(ListingGap2,bins=200)

# Cumulative record counts for windows of 30, 60, ..., 360 days.
timeWindows = TimeWindowSelection(data1, 'ListingGap', range(30,361,30))

# Use 180 days as the largest time window for the derived login features.
# Candidate windows: 7, 30, 60, 90, 120, 150 and 180 days.
# Within each window compute the total number of log-ins, the number of
# distinct log-in types, and the average number of log-ins per type.

time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1','LogInfo2']
data1GroupbyIdx = pd.DataFrame({'Idx':data1['Idx'].drop_duplicates()})

for tw in time_window:
    # Earliest date still inside the window that ends on the listing date.
    data1['TruncatedLogInfo'] = data1['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data1.loc[data1['logInfo'] >= data1['TruncatedLogInfo']]
    for var in var_list:
        count_col = str(var)+'_'+str(tw)+'_count'
        unique_col = str(var) + '_' + str(tw) + '_unique'
        avg_col = str(var) + '_' + str(tw) + '_avg_count'

        # count the frequencies of LogInfo1 and LogInfo2
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[count_col] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x,0))

        # count the distinct values of LogInfo1 and LogInfo2
        Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[unique_col] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x,0))

        # calculate the average count of each value in LogInfo1 and LogInfo2;
        # iterate the two columns in lockstep instead of indexing rows by position
        data1GroupbyIdx[avg_col] = [
            DeivdedByZero(cnt, uniq)
            for cnt, uniq in zip(data1GroupbyIdx[count_col], data1GroupbyIdx[unique_col])
        ]

# Parse the listing and update dates of the user-update records.
data3['ListingInfo'] = data3['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
data3['UserupdateInfo'] = data3['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x,'%Y/%m/%d'))
# Days between the update and the listing (vectorised; no positional row indexing).
data3['ListingGap'] = (data3['ListingInfo'] - data3['UserupdateInfo']).dt.days
# Result is discarded when run as a script; only useful for interactive inspection.
collections.Counter(data3['ListingGap'])
hist_ListingGap = np.histogram(data3['ListingGap'])
# Pair every bin count with the bin's right edge.
hist_ListingGap = pd.DataFrame({'Freq':hist_ListingGap[0],'gap':hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
# The last cumulative frequency is the total; compute it once rather than per row.
total_freq = hist_ListingGap.iloc[-1]['CumFreq']
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(lambda x: x*1.0/total_freq)

# Unify QQ/qQ, Idnumber/idNumber and MOBILEPHONE/PHONE.
# Within each time slice compute:
# (1) the update frequency
# (2) the number of distinct updated item types
# (3) whether key items (IDNUMBER, HASBUYCAR, MARRIAGESTATUSID, PHONE) were updated

data3['UserupdateInfo1'] = data3['UserupdateInfo1'].map(ChangeContent)
data3GroupbyIdx = pd.DataFrame({'Idx':data3['Idx'].drop_duplicates()})

time_window = [7, 30, 60, 90, 120, 150, 180]
for tw in time_window:
    # Earliest date still inside the window that ends on the listing date.
    data3['TruncatedLogInfo'] = data3['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data3.loc[data3['UserupdateInfo'] >= data3['TruncatedLogInfo']]

    freq_col = 'UserupdateInfo_'+str(tw)+'_freq'
    unique_col = 'UserupdateInfo_' + str(tw) + '_unique'
    avg_col = 'UserupdateInfo_' + str(tw) + '_avg_count'

    # frequency of updating
    freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx[freq_col] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x,0))

    # number of updated types
    Idx_UserupdateInfo1 = temp[['Idx','UserupdateInfo1']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    # Users without any update in this window get 0. The original default of
    # ``x`` wrote the user's Idx into the feature column.
    data3GroupbyIdx[unique_col] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

    # average count of each type; DeivdedByZero covers users with 0 distinct types
    data3GroupbyIdx[avg_col] = [
        DeivdedByZero(freq, uniq)
        for freq, uniq in zip(data3GroupbyIdx[freq_col], data3GroupbyIdx[unique_col])
    ]

    # whether the applicant changed items like IDNUMBER,HASBUYCAR, MARRIAGESTATUSID, PHONE
    # Collect each user's distinct updated items into one list per Idx.
    Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].apply(list)
    for item in ['_IDNUMBER','_HASBUYCAR','_MARRIAGESTATUSID','_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
        # 0 (not the Idx itself) for users with no update in this window.
        data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))

# Combine the above features with raw features in PPD_Training_Master_GBK_3_1_Training_Set.
# All three frames are aligned on Idx and joined column-wise.
allData = pd.concat([data2.set_index('Idx'), data3GroupbyIdx.set_index('Idx'), data1GroupbyIdx.set_index('Idx')],axis= 1)
allData.to_csv(folderOfData+'allData_0.csv',encoding = 'gbk')

#######################################
# Step 2: 对类别型变量和数值型变量进行补缺#
######################################
allData = pd.read_csv(folderOfData+'allData_0.csv',header = 0,encoding = 'gbk')
# Candidate features: every column except the target, the id and the listing date.
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
# NOTE(review): indentation was lost in the extracted source; this removal is
# taken to be unconditional (outside the ``if``) - confirm.
allFeatures.remove('ListingInfo')

# Drop constant variables and decide whether each remaining variable is
# categorical or numerical.
numerical_var = []
# Iterate over a snapshot: allFeatures is modified inside the loop, and
# removing from the list being iterated would skip the following element.
for col in list(allFeatures):
    if len(set(allData[col])) == 1:
        print('delete {} from the dataset because it is a constant'.format(col))
        del allData[col]
        allFeatures.remove(col)
    else:
        # Keep only non-missing values (NaN != NaN).
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        # Numerical: at least 10 distinct valid values and they are real numbers.
        if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
            numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]

# For every variable, find its most frequent value and the share of records
# that take that value.
records_count = allData.shape[0]
col_most_values,col_large_value = {},{}
for col in allFeatures:
    value_count = allData[col].groupby(allData[col]).count()
    top_count = max(value_count)
    col_most_values[col] = top_count/records_count
    # First value (in group order) that reaches the top count.
    large_value = value_count[value_count == top_count].index[0]
    col_large_value[col] = large_value
col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient = 'index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by = 'max percent', ascending = False)
# Plot the (at most) 500 variables with the largest dominant-value share.
pcnt = list(col_most_values_df[:500]['max percent'])
# NOTE: ``vars`` shadows the builtin; the name is kept because code outside
# this section may refer to it.
vars = list(col_most_values_df[:500].index)
plt.bar(range(len(pcnt)), height = pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')

# For variables whose dominant value covers at least 90% of records, check
# whether the minority values have a markedly higher bad rate than the
# dominant value.
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent']>=0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    # Work on an explicit copy so the assignment below never touches allData.
    temp = allData[[col,'target']].copy()
    # 1 = dominant value, 0 = any other value.
    temp[col] = (temp[col] == large_value).astype(int)
    bad_rate = temp.groupby(col).mean()
    # Row 0 is the minority group; a zero bad rate there would make log() undefined.
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    # Log ratio of minority bad rate to dominant-value bad rate.
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target']/bad_rate.iloc[1]['target'])
bad_rate_diff_sorted = sorted(bad_rate_diff.items(),key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x = range(len(bad_rate_diff_sorted_values)), height = bad_rate_diff_sorted_values)

# None of the minority values shows a markedly higher bad rate than the
# dominant value, so these variables are dropped outright.
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

# Categorical variables: drop the variable when more than 80% is missing;
# otherwise treat "missing" as a category of its own.

missing_pcnt_threshould_1 = 0.8
# Iterate over a snapshot: categorical_var is modified inside the loop, and
# removing from the list being iterated would skip the following element.
for col in list(categorical_var):
    missingRate = MissingCategorial(allData,col)
    print('{0} has missing rate as {1}'.format(col,missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    if 0 < missingRate < missing_pcnt_threshould_1:
        uniq_valid_vals = [i for i in allData[col] if i == i]
        uniq_valid_vals = list(set(uniq_valid_vals))
        if isinstance(uniq_valid_vals[0], numbers.Real):
            # Numeric codes: mark missing entries with the sentinel -1.
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            not_missing_sample = [-1]*len(missing_position)
            allData.loc[missing_position, col] = not_missing_sample
        else:
            # In this way we convert NaN to NAN, which is a string instead of np.nan
            allData[col] = allData[col].map(lambda x: str(x).upper())

allData_bk = allData.copy()
# Numerical variables: drop the variable when more than 80% is missing;
# otherwise fill missing entries with values drawn at random from the
# observed ones.

missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('we delete variable {} because of its high missing rate'.format(col))
    else:
        if missingRate > 0:
            not_missing = list(allData.loc[allData[col] == allData[col]][col])
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            n_missing = len(missing_position)
            if n_missing <= len(not_missing):
                # Enough observed values: draw without replacement.
                not_missing_sample = random.sample(not_missing, n_missing)
            else:
                # More gaps than observed values (missing rate above 50%):
                # random.sample would raise ValueError, so draw with replacement.
                not_missing_sample = random.choices(not_missing, k=n_missing)
            allData.loc[missing_position,col] = not_missing_sample
            missingRate2 = MissingContinuous(allData, col)
            print('missing rate after making up is:{}'.format(str(missingRate2)))

# Deletion is deferred until after the loop so numerical_var is not modified
# while it is being iterated.
if deleted_var != []:
    for col in deleted_var:
        numerical_var.remove(col)
        del allData[col]

# Persist the imputed data, then read it back from disk.
allData.to_csv(folderOfData+'allData_1.csv', header=True,encoding='gbk', columns = allData.columns, index=False)

allData = pd.read_csv(folderOfData+'allData_1.csv', header=0,encoding='gbk')

###################################
# Step 3: 基于卡方分箱法对变量进行分箱#
###################################
# Binning depends on the variable type:
# (1) numerical variables can be binned directly;
# (2) categorical variables with many distinct values are first encoded by
#     bad rate into a numerical variable and then binned;
# (3) categorical variables with few distinct values need no binning, but each
#     category must contain both good and bad samples; a category with only
#     good or only bad samples has to be merged.

#for each categorical variable, if it has distinct values more than 5, we use the ChiMerge to merge it

trainData = pd.read_csv(folderOfData+'allData_1.csv',header = 0, encoding='gbk')
#trainData = pd.read_csv(folderOfData+'allData_1.csv',header = 0, encoding='gbk',dtype=object)
allFeatures = list(trainData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
#allFeatures.remove('Idx')

# Split the features into numerical and categorical ones.
numerical_var = []
for var in allFeatures:
    # Drop NaN via self-inequality; ``np.nan in list`` relies on object
    # identity and can miss NaN values that are distinct float objects.
    uniq_vals = [v for v in set(trainData[var]) if v == v]
    # Numerical: at least 10 distinct valid values and they are real numbers.
    if len(uniq_vals) >= 10 and isinstance(uniq_vals[0],numbers.Real):
        numerical_var.append(var)

categorical_var = [i for i in allFeatures if i not in numerical_var]

for col in categorical_var:
    #for Chinese character, upper() is not valid
    # These three columns are left untouched; every other categorical column
    # is converted to an upper-cased string.
    if col not in ['UserInfo_7','UserInfo_9','UserInfo_19']:
        trainData[col] = trainData[col].map(lambda x: str(x).upper())

# Categorical variables are processed as follows:
# 1. if the variable has more than 5 distinct values, it is encoded with the
#    bad rate of each value;
# 2. otherwise, if some value contains only bad samples or only good samples,
#    values are merged.

deleted_features = []   # processed variables to drop, so they do not interfere with modelling
encoded_features = {}   # bad-rate encodings; needed again in test and production
merged_features = {}    # merge schemes of the categorical variables
var_IV = {}             # IV of every binned feature
var_WOE = {}            # WOE of every binned feature

# Iterate over a snapshot: categorical_var is modified inside the loop, and
# removing from a list while iterating it would silently skip the next column.
for col in list(categorical_var):
    print('we are processing {}'.format(col))
    if len(set(trainData[col])) > 5:
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'

        # (1) compute the bad rate of each value and encode the column with it
        encoding_result = sf.BadRateEncoding(trainData, col, 'target')
        trainData[col0], br_encoding = encoding_result['encoding'], encoding_result['bad_rate']

        # (2) the encoded column is numerical; queue it for chi-square binning
        numerical_var.append(col0)

        # (3) keep the encoding so that it can be re-applied later
        encoded_features[col] = [col0, br_encoding]

        # (4) mark the raw column for deletion
        deleted_features.append(col)
    else:
        bad_bin = trainData.groupby([col])['target'].sum()
        # A variable with few categories still needs treatment when one of its
        # categories has no bad sample at all.
        if min(bad_bin) == 0:
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'
            # (1) find a merge scheme in which every bin holds good and bad samples
            mergeBin = sf.MergeBad0(trainData, col, 'target')
            # (2) apply the merge scheme to the values
            trainData[col1] = trainData[col].map(mergeBin)
            maxPcnt = sf.MaximumBinPcnt(trainData, col1)
            # Drop the variable when, after merging, one bin holds more than 90%
            if maxPcnt > 0.9:
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del trainData[col]
                continue
            # (3) the merged variable is acceptable: keep it
            merged_features[col] = [col1, mergeBin]
            WOE_IV = sf.CalcWOE(trainData, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            deleted_features.append(col)
        else:
            WOE_IV = sf.CalcWOE(trainData, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']

# Continuous variables are processed as follows:
# 1. split the variable into bins with the chi-square (ChiMerge) method;
# 2. check that the bad rate is monotone across the bins; if it is not, merge
#    bins until it is.

var_cutoff = {}
# Iterate over a snapshot: numerical_var is modified inside the loop, and
# removing from a list while iterating it would silently skip the next column.
for col in list(numerical_var):
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    # (1) bin with ChiMerge and keep the cut points. For example, cut points
    # [10,20,30] split the variable into x<10, 10<x<20, 20<x<30 and x>30.
    # The missing-value marker -1 does not take part in the binning.
    if -1 in set(trainData[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = sf.ChiMerge(trainData, col, 'target', special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints, special_attribute=special_attribute))

    # (2) check whether the bad rate is monotone
    BRM = sf.BadRateMonotone(trainData, col1, 'target', special_attribute=special_attribute)
    if not BRM:
        if special_attribute == []:
            bin_merged = sf.Monotone_Merge(trainData, 'target', col1)
            removed_index = []
            for merged_bin in bin_merged:
                if len(merged_bin) > 1:
                    # Bins merged together lose every cut point but the last one
                    indices = [int(b.replace('Bin ', '')) for b in merged_bin]
                    removed_index = removed_index + indices[0:-1]
            removed_point = [cutOffPoints[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints.remove(p)
            var_cutoff[col] = cutOffPoints
            trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        else:
            # Monotonicity is enforced on the regular values only; the special
            # value is set aside here and appended to the cut points afterwards.
            cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
            temp = trainData.loc[~trainData[col].isin(special_attribute)]
            bin_merged = sf.Monotone_Merge(temp, 'target', col1)
            removed_index = []
            for merged_bin in bin_merged:
                if len(merged_bin) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in merged_bin]
                    removed_index = removed_index + indices[0:-1]
            removed_point = [cutOffPoints2[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints2.remove(p)
            cutOffPoints2 = cutOffPoints2 + special_attribute
            var_cutoff[col] = cutOffPoints2
            trainData[col1] = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints2, special_attribute=special_attribute))

    # (3) after binning, check again whether a single bin holds more than 90%
    # of the samples; if so, the variable is dropped.
    maxPcnt = sf.MaximumBinPcnt(trainData, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = sf.CalcWOE(trainData, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']

# Persist the binned data set together with the artefacts of the binning step
# (WOE values, IV values, cut points and merge schemes) for the later steps.
trainData.to_csv(folderOfData+'allData_2.csv', header=True, encoding='gbk', columns=trainData.columns, index=False)

with open(folderOfData+'var_WOE.pkl', "wb") as f:
    pickle.dump(var_WOE, f)

with open(folderOfData+'var_IV.pkl', "wb") as f:
    pickle.dump(var_IV, f)

with open(folderOfData+'var_cutoff.pkl', "wb") as f:
    pickle.dump(var_cutoff, f)

with open(folderOfData+'merged_features.pkl', "wb") as f:
    pickle.dump(merged_features, f)

########################################
# Step 4: univariate and multivariate analysis after WOE encoding
########################################
trainData = pd.read_csv(folderOfData+'allData_2.csv', header=0, encoding='gbk')

# NOTE: pickle must only be used on trusted files; these are the files written
# by the binning step of this same script.
with open(folderOfData+'var_WOE.pkl', "rb") as f:
    var_WOE = pickle.load(f)

with open(folderOfData+'var_IV.pkl', "rb") as f:
    var_IV = pickle.load(f)

with open(folderOfData+'var_cutoff.pkl', "rb") as f:
    var_cutoff = pickle.load(f)

with open(folderOfData+'merged_features.pkl', "rb") as f:
    merged_features = pickle.load(f)

# Some fields look numerical but are really categorical: convert them to strings
num2str = ['SocialNetwork_13', 'SocialNetwork_12', 'UserInfo_6', 'UserInfo_5', 'UserInfo_10', 'UserInfo_17']
for col in num2str:
    trainData[col] = trainData[col].map(lambda x: str(x))

# Add one "<variable>_WOE" column per variable that has WOE values.
for col in var_WOE.keys():
    print(col)
    col2 = str(col)+"_WOE"
    if col in var_cutoff.keys():
        # Binned (numerical) variable: assign the bin first, then look up its WOE
        cutOffPoints = var_cutoff[col]
        special_attribute = []
        if -1 in cutOffPoints:
            special_attribute = [-1]
        binValue = trainData[col].map(lambda x: sf.AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        trainData[col2] = binValue.map(lambda x: var_WOE[col][x])
    else:
        # Unbinned (categorical) variable: the raw value is the WOE key
        print('********************************************************************************************')
        print(col)
        if -1 in set(trainData[col]):
            # NOTE(review): the WOE keys of these columns are presumably the
            # string form of a float (e.g. '-1.0') — confirm against CalcWOE.
            trainData[col2] = trainData[col].map(lambda x: var_WOE[col][str(x*1.0)])
        else:
            trainData[col2] = trainData[col].map(lambda x: var_WOE[col][x])

trainData.to_csv(folderOfData+'allData_3.csv', header=True, encoding='gbk', columns=trainData.columns, index=False)

### (i) keep the variables whose IV is above the threshold
trainData = pd.read_csv(folderOfData+'allData_3.csv', header=0, encoding='gbk')
all_IV = list(var_IV.values())
all_IV = sorted(all_IV, reverse=True)
plt.bar(x=range(len(all_IV)), height=all_IV)
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]

### (ii) check the pairwise linear correlation of the WOE-encoded variables

var_IV_selected = {k: var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d: d[1], reverse=True)
var_IV_sorted = [i[0] for i in var_IV_sorted]

# For every strongly correlated pair, drop the variable with the lower IV.
removed_var = []
roh_thresould = 0.6
for i in range(len(var_IV_sorted)-1):
    if var_IV_sorted[i] not in removed_var:
        x1 = var_IV_sorted[i]+"_WOE"
        for j in range(i+1, len(var_IV_sorted)):
            if var_IV_sorted[j] not in removed_var:
                x2 = var_IV_sorted[j] + "_WOE"
                roh = np.corrcoef([trainData[x1], trainData[x2]])[0, 1]
                if abs(roh) >= roh_thresould:
                    print('the correlation coeffient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
                    if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
                        removed_var.append(var_IV_sorted[j])
                    else:
                        removed_var.append(var_IV_sorted[i])
                        # Variable i is gone: stop comparing others against it,
                        # otherwise it could still knock out further variables.
                        break

var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]

### (iii) check whether any variable has a VIF > 10 against all the others
# The VIF needs at least one other variable to regress on.
if len(var_IV_sortet_2) > 1:
    for i, current in enumerate(var_IV_sortet_2):
        x0 = np.array(trainData[current+'_WOE'])
        X_Col = [k+'_WOE' for k in var_IV_sortet_2 if k != current]
        # A plain ndarray is passed: scikit-learn no longer accepts np.matrix
        X = trainData[X_Col].values
        regr = LinearRegression()
        clr = regr.fit(X, x0)
        x_pred = clr.predict(X)
        # R^2 of regressing this variable on all the others; VIF = 1 / (1 - R^2)
        R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
        vif = 1/(1-R2)
        if vif > 10:
            print("Warning: the vif for {0} is {1}".format(current, vif))

#########################
# Step 5: fit the logistic regression model
#########################
multi_analysis = [i+'_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[multi_analysis].copy()
X['intercept'] = [1]*X.shape[0]

LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables are not significant in the joint model, so the significance of
# each of them is checked on its own. The intercept is left out: it is not a
# column of trainData and looking it up there would raise a KeyError.
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1 and k != 'intercept'}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k, v))

# Some variables have a positive coefficient, so the sign of each of them is
# checked on its own (again without the intercept).
varPositive = [k for k, v in params.items() if v >= 0 and k != 'intercept']
coef_list = {}
for var in varPositive:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k, v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k, v))

# Forward selection: a candidate is kept only if, once added, every p-value is
# below 0.1 and every variable coefficient (intercept excluded) is negative.
selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var+[var]
    X_temp = trainData[try_vars].copy()
    X_temp['intercept'] = [1] * X_temp.shape[0]
    LR = sm.Logit(y, X_temp).fit()
    pvals = LR.pvalues
    params = LR.params.drop('intercept')
    if max(pvals) < 0.1 and max(params) < 0:
        selected_var.append(var)

# Refit on the accepted variables only. The model left over from the loop is
# the last trial fit, which may still contain a candidate that was rejected.
X_temp = trainData[selected_var].copy()
X_temp['intercept'] = [1] * X_temp.shape[0]
LR = sm.Logit(y, X_temp).fit()

LR.summary2()

y_pred = LR.predict(X_temp)
y_result = pd.DataFrame({'y_pred': y_pred, 'y_real': list(trainData['target'])})
sf.KS(y_result, 'y_pred', 'y_real')

roc_auc_score(trainData['target'], y_pred)

################
# Step 6: score scaling
################
# Turn the predicted probabilities into scores with the project helper.
# NOTE(review): 200 and 100 are presumably the base score and the
# points-to-double-the-odds (PDO) — confirm against sf.Prob2Score.
scores = sf.Prob2Score(y_pred,200,100)
# Histogram of the resulting scores, using 100 bins
plt.hist(scores,bins=100)

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
  • 232
  • 233
  • 234
  • 235
  • 236
  • 237
  • 238
  • 239
  • 240
  • 241
  • 242
  • 243
  • 244
  • 245
  • 246
  • 247
  • 248
  • 249
  • 250
  • 251
  • 252
  • 253
  • 254
  • 255
  • 256
  • 257
  • 258
  • 259
  • 260
  • 261
  • 262
  • 263
  • 264
  • 265
  • 266
  • 267
  • 268
  • 269
  • 270
  • 271
  • 272
  • 273
  • 274
  • 275
  • 276
  • 277
  • 278
  • 279
  • 280
  • 281
  • 282
  • 283
  • 284
  • 285
  • 286
  • 287
  • 288
  • 289
  • 290
  • 291
  • 292
  • 293
  • 294
  • 295
  • 296
  • 297
  • 298
  • 299
  • 300
  • 301
  • 302
  • 303
  • 304
  • 305
  • 306
  • 307
  • 308
  • 309
  • 310
  • 311
  • 312
  • 313
  • 314
  • 315
  • 316
  • 317
  • 318
  • 319
  • 320
  • 321
  • 322
  • 323
  • 324
  • 325
  • 326
  • 327
  • 328
  • 329
  • 330
  • 331
  • 332
  • 333
  • 334
  • 335
  • 336
  • 337
  • 338
  • 339
  • 340
  • 341
  • 342
  • 343
  • 344
  • 345
  • 346
  • 347
  • 348
  • 349
  • 350
  • 351
  • 352
  • 353
  • 354
  • 355
  • 356
  • 357
  • 358
  • 359
  • 360
  • 361
  • 362
  • 363
  • 364
  • 365
  • 366
  • 367
  • 368
  • 369
  • 370
  • 371
  • 372
  • 373
  • 374
  • 375
  • 376
  • 377
  • 378
  • 379
  • 380
  • 381
  • 382
  • 383
  • 384
  • 385
  • 386
  • 387
  • 388
  • 389
  • 390
  • 391
  • 392
  • 393
  • 394
  • 395
  • 396
  • 397
  • 398
  • 399
  • 400
  • 401
  • 402
  • 403
  • 404
  • 405
  • 406
  • 407
  • 408
  • 409
  • 410
  • 411
  • 412
  • 413
  • 414
  • 415
  • 416
  • 417
  • 418
  • 419
  • 420
  • 421
  • 422
  • 423
  • 424
  • 425
  • 426
  • 427
  • 428
  • 429
  • 430
  • 431
  • 432
  • 433
  • 434
  • 435
  • 436
  • 437
  • 438
  • 439
  • 440
  • 441
  • 442
  • 443
  • 444
  • 445
  • 446
  • 447
  • 448
  • 449
  • 450
  • 451
  • 452
  • 453
  • 454
  • 455
  • 456
  • 457
  • 458
  • 459
  • 460
  • 461
  • 462
  • 463
  • 464
  • 465
  • 466
  • 467
  • 468
  • 469
  • 470
  • 471
  • 472
  • 473
  • 474
  • 475
  • 476
  • 477
  • 478
  • 479
  • 480
  • 481
  • 482
  • 483
  • 484
  • 485
  • 486
  • 487
  • 488
  • 489
  • 490
  • 491
  • 492
  • 493
  • 494
  • 495
  • 496
  • 497
  • 498
  • 499
  • 500
  • 501
  • 502
  • 503
  • 504
  • 505
  • 506
  • 507
  • 508
  • 509
  • 510
  • 511
  • 512
  • 513
  • 514
  • 515
  • 516
  • 517
  • 518
  • 519
  • 520
  • 521
  • 522
  • 523
  • 524
  • 525
  • 526
  • 527
  • 528
  • 529
  • 530
  • 531
  • 532
  • 533
  • 534
  • 535
  • 536
  • 537
  • 538
  • 539
  • 540
  • 541
  • 542
  • 543
  • 544
  • 545
  • 546
  • 547
  • 548
  • 549
  • 550
  • 551
  • 552
  • 553
  • 554
  • 555
  • 556
  • 557
  • 558
  • 559
  • 560
  • 561
  • 562
  • 563
  • 564
  • 565
  • 566
  • 567
  • 568
  • 569
  • 570
  • 571
  • 572
  • 573
  • 574
  • 575
  • 576
  • 577
  • 578
  • 579
  • 580
  • 581
  • 582
  • 583
  • 584
  • 585
  • 586
  • 587
  • 588
  • 589
  • 590
  • 591
  • 592
  • 593
  • 594
  • 595
  • 596
  • 597
  • 598
  • 599
  • 600
  • 601
  • 602
  • 603
  • 604
  • 605
  • 606
  • 607
  • 608
  • 609
  • 610
  • 611
  • 612
  • 613
  • 614
  • 615
  • 616
  • 617
  • 618
  • 619
  • 620
  • 621
  • 622
  • 623
  • 624
  • 625
  • 626
  • 627
  • 628
  • 629
  • 630
  • 631
  • 632
  • 633
  • 634
  • 635
  • 636
  • 637
  • 638
  • 639
  • 640
  • 641
  • 642
  • 643
  • 644
  • 645
  • 646
  • 647
  • 648
  • 649
  • 650
  • 651
  • 652
  • 653
  • 654
  • 655
  • 656
  • 657
  • 658
  • 659
  • 660
  • 661
  • 662
  • 663

功能函数

import numpy as np
import pandas as pd

def SplitData(df, col, numOfSplit, special_attribute=None):
    """
    Compute equal-frequency split points for the values of ``col``.

    The values are sorted and every ``int(N / numOfSplit)``-th value is
    picked as a split point, where N is the number of rows left after the
    special values have been removed. Duplicated split points are collapsed.

    :param df: data set containing the column ``col``
    :param col: name of the variable to be binned
    :param numOfSplit: number of groups the values should be cut into
    :param special_attribute: values to exclude before the split points are
        computed; ``None`` or an empty sequence excludes nothing
    :return: sorted list of distinct split-point values (at most
        ``numOfSplit - 1`` entries); an empty list if no rows remain
    """
    values = df[col]
    if special_attribute is not None and len(special_attribute) > 0:
        values = values[~values.isin(special_attribute)]

    rawValues = sorted(values)
    if not rawValues:
        # Nothing to split; indexing below would raise IndexError.
        return []

    step = len(rawValues) // numOfSplit
    # A set removes duplicated split points (heavily tied data, or step == 0).
    return sorted({rawValues[i * step] for i in range(1, numOfSplit)})

def MaximumBinPcnt(df, col):
    """
    Return the share of rows taken by the most frequent value of ``col``.

    The denominator is the full row count of ``df``. pandas ``groupby``
    drops missing keys by default, so rows where ``col`` is NaN never form
    a bin of their own but still count toward the denominator.

    :param df: data set containing the column ``col``
    :param col: name of the variable whose value distribution is inspected
    :return: the largest per-value share, a number in (0, 1]
    """
    N = df.shape[0]
    counts = df.groupby([col])[col].count()
    shares = counts * 1.0 / N
    return max(shares)

def Chi2(df, total_col, bad_col):
    """
    Compute the chi-square statistic of the bins listed in ``df``.

    Each row of ``df`` is one bin. The expected number of bad (good)
    samples of a bin is its total count multiplied by the overall bad
    (good) rate across all rows.

    :param df: data frame holding, per bin, the total and the bad count
    :param total_col: name of the column with the total sample count
    :param bad_col: name of the column with the bad sample count
    :return: the chi-square value; 0 when the rows contain only good or
        only bad samples, or no samples at all
    """
    totals = df[total_col]
    bads = df[bad_col]
    goods = totals - bads

    grandTotal = totals.sum()
    if grandTotal == 0:
        return 0

    # Overall bad and good rates across all bins.
    badRate = bads.sum() * 1.0 / grandTotal
    # With only good or only bad samples the bins cannot differ.
    if badRate in (0, 1):
        return 0
    goodRate = goods.sum() * 1.0 / grandTotal

    # Empty bins have an expected count of 0 and would divide by zero;
    # they carry no information, so leave them out.
    nonEmpty = totals > 0
    totals, bads, goods = totals[nonEmpty], bads[nonEmpty], goods[nonEmpty]

    # Expected bad (good) count = total count * overall bad (good) rate.
    badExpected = totals * badRate
    goodExpected = totals * goodRate

    badChi = ((badExpected - bads) ** 2 / badExpected).sum()
    goodChi = ((goodExpected - goods) ** 2 / goodExpected).sum()
    return float(badChi + goodChi)

def BinBadRate(df, col, target, grantRateIndicator=0):
    """
    Compute the bad rate of every distinct value (bin) of ``col``.

    :param df: data set for which the bad rates are computed
    :param col: name of the feature whose values define the bins
    :param target: name of the good/bad label column; it is summed to get
        the bad count, so bad samples are expected to be labelled 1
    :param grantRateIndicator: 0 to return only the per-bin results, any
        other value to additionally return the overall bad rate
    :return: ``(dicts, regroup)``, or ``(dicts, regroup, overallRate)`` when
        ``grantRateIndicator`` is not 0. ``dicts`` maps each bin value to
        its bad rate; ``regroup`` is a data frame with the columns
        ``col``, ``'total'``, ``'bad'`` and ``'bad_rate'``
    """
    # One pass over the groups yields both the row count and the bad count.
    regroup = (
        df.groupby([col])[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
        .reset_index()
    )
    regroup['bad_rate'] = regroup['bad'] * 1.0 / regroup['total']
    dicts = dict(zip(regroup[col], regroup['bad_rate']))
    if grantRateIndicator == 0:
        return (dicts, regroup)

    N = sum(regroup['total'])
    B = sum(regroup['bad'])
    overallRate = B * 1.0 / N
    return (dicts, regroup, overallRate)

def AssignGroup(x, bin):
    """
    Map the number ``x`` onto the upper bound of the interval it falls in.

    Example: for x=2 and bin=[0, 3, 5], 0 < x <= 3, so x is mapped to 3.

    :param x: the value to map
    :param bin: split points; consecutive entries are used as the
        intervals ``(bin[i], bin[i+1]]``
    :return: ``min(bin)`` if x does not exceed it, ``10e10`` if x is larger
        than ``max(bin)``, otherwise the upper bound of the matching
        interval. ``None`` if no interval matches (e.g. x is NaN).
    """
    lowest, highest = min(bin), max(bin)
    if x <= lowest:
        return lowest
    if x > highest:
        # Sentinel for everything above the last split point.
        return 10e10
    for i in range(len(bin) - 1):
        if bin[i] < x <= bin[i + 1]:
            return bin[i + 1]
    return None

def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0):
‘’’
:param df: 包含目标变量与分箱属性的数据框
:param col: 需要分箱的属性
:param target: 目标变量,取值0或1
:param max_interval: 最大分箱数。如果原始属性的取值个数低于该参数,不执行这段函数
:param special_attribute: 不参与分箱的属性取值
:param minBinPcnt:最小箱的占比,默认为0
:return: 分箱结果
‘’’

colLevels = sorted(list(set(df[col])))
N_distinct = len(colLevels)
if N_distinct <= max_interval: #如果原始属性的取值个数低于max_interval,不执行这段函数
print(“The number of original levels for {} is less than or equal to max intervals”.format(col))
return colLevels[:-1]
else:
if len(special_attribute)>=1:
df1 = df.loc[df[col].isin(special_attribute)]
df2 = df.loc[~df[col].isin(special_attribute)]
else:
df2 = df.copy()
N_distinct = len(list(set(df2[col])))

    <span class="token comment"># 步骤一: 通过col对数据集进行分组,求出每组的总样本数与坏样本数</span>
    <span class="token keyword">if</span> N_distinct <span class="token operator">&gt;</span> <span class="token number">100</span><span class="token punctuation">:</span>
        split_x <span class="token operator">=</span> SplitData<span class="token punctuation">(</span>df2<span class="token punctuation">,</span> col<span class="token punctuation">,</span> <span class="token number">100</span><span class="token punctuation">)</span>
        df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span> <span class="token operator">=</span> df2<span class="token punctuation">[</span>col<span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">map</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> AssignGroup<span class="token punctuation">(</span>x<span class="token punctuation">,</span> split_x<span class="token punctuation">)</span><span class="token punctuation">)</span>
    <span class="token keyword">else</span><span class="token punctuation">:</span>
        df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span> <span class="token operator">=</span> df2<span class="token punctuation">[</span>col<span class="token punctuation">]</span>
    <span class="token comment"># 总体bad rate将被用来计算expected bad count</span>
    <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> regroup<span class="token punctuation">,</span> overallRate<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df2<span class="token punctuation">,</span> <span class="token string">'temp'</span><span class="token punctuation">,</span> target<span class="token punctuation">,</span> grantRateIndicator<span class="token operator">=</span><span class="token number">1</span><span class="token punctuation">)</span>

    <span class="token comment"># 首先,每个单独的属性值将被分为单独的一组</span>
    <span class="token comment"># 对属性值进行排序,然后两两组别进行合并</span>
    colLevels <span class="token operator">=</span> <span class="token builtin">sorted</span><span class="token punctuation">(</span><span class="token builtin">list</span><span class="token punctuation">(</span><span class="token builtin">set</span><span class="token punctuation">(</span>df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">)</span>
    groupIntervals <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">[</span>i<span class="token punctuation">]</span> <span class="token keyword">for</span> i <span class="token keyword">in</span> colLevels<span class="token punctuation">]</span>

    <span class="token comment"># 步骤二:建立循环,不断合并最优的相邻两个组别,直到:</span>
    <span class="token comment"># 1,最终分裂出来的分箱数&lt;=预设的最大分箱数</span>
    <span class="token comment"># 2,每箱的占比不低于预设值(可选)</span>
    <span class="token comment"># 3,每箱同时包含好坏样本</span>
    <span class="token comment"># 如果有特殊属性,那么最终分裂出来的分箱数=预设的最大分箱数-特殊属性的个数</span>
    split_intervals <span class="token operator">=</span> max_interval <span class="token operator">-</span> <span class="token builtin">len</span><span class="token punctuation">(</span>special_attribute<span class="token punctuation">)</span>
    <span class="token keyword">while</span> <span class="token punctuation">(</span><span class="token builtin">len</span><span class="token punctuation">(</span>groupIntervals<span class="token punctuation">)</span> <span class="token operator">&gt;</span> split_intervals<span class="token punctuation">)</span><span class="token punctuation">:</span>  <span class="token comment"># 终止条件: 当前分箱数=预设的分箱数</span>
        <span class="token comment"># 每次循环时, 计算合并相邻组别后的卡方值。具有最小卡方值的合并方案,是最优方案</span>
        chisqList <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span>
        <span class="token keyword">for</span> k <span class="token keyword">in</span> <span class="token builtin">range</span><span class="token punctuation">(</span><span class="token builtin">len</span><span class="token punctuation">(</span>groupIntervals<span class="token punctuation">)</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">:</span>
            temp_group <span class="token operator">=</span> groupIntervals<span class="token punctuation">[</span>k<span class="token punctuation">]</span> <span class="token operator">+</span> groupIntervals<span class="token punctuation">[</span>k<span class="token operator">+</span><span class="token number">1</span><span class="token punctuation">]</span>
            df2b <span class="token operator">=</span> regroup<span class="token punctuation">.</span>loc<span class="token punctuation">[</span>regroup<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span>temp_group<span class="token punctuation">)</span><span class="token punctuation">]</span>
            chisq <span class="token operator">=</span> Chi2<span class="token punctuation">(</span>df2b<span class="token punctuation">,</span> <span class="token string">'total'</span><span class="token punctuation">,</span> <span class="token string">'bad'</span><span class="token punctuation">)</span>
            chisqList<span class="token punctuation">.</span>append<span class="token punctuation">(</span>chisq<span class="token punctuation">)</span>
        best_comnbined <span class="token operator">=</span> chisqList<span class="token punctuation">.</span>index<span class="token punctuation">(</span><span class="token builtin">min</span><span class="token punctuation">(</span>chisqList<span class="token punctuation">)</span><span class="token punctuation">)</span>
        groupIntervals<span class="token punctuation">[</span>best_comnbined<span class="token punctuation">]</span> <span class="token operator">=</span> groupIntervals<span class="token punctuation">[</span>best_comnbined<span class="token punctuation">]</span> <span class="token operator">+</span> groupIntervals<span class="token punctuation">[</span>best_comnbined<span class="token operator">+</span><span class="token number">1</span><span class="token punctuation">]</span>
        <span class="token comment"># 当将最优的相邻的两个变量合并在一起后,需要从原来的列表中将其移除。例如,将[3,4,5] 与[6,7]合并成[3,4,5,6,7]后,需要将[3,4,5] 与[6,7]移除,保留[3,4,5,6,7]</span>
        groupIntervals<span class="token punctuation">.</span>remove<span class="token punctuation">(</span>groupIntervals<span class="token punctuation">[</span>best_comnbined<span class="token operator">+</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
    groupIntervals <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token builtin">sorted</span><span class="token punctuation">(</span>i<span class="token punctuation">)</span> <span class="token keyword">for</span> i <span class="token keyword">in</span> groupIntervals<span class="token punctuation">]</span>
    cutOffPoints <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token builtin">max</span><span class="token punctuation">(</span>i<span class="token punctuation">)</span> <span class="token keyword">for</span> i <span class="token keyword">in</span> groupIntervals<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">]</span>

    <span class="token comment"># 检查是否有箱没有好或者坏样本。如果有,需要跟相邻的箱进行合并,直到每箱同时包含好坏样本</span>
    groupedvalues <span class="token operator">=</span> df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> AssignBin<span class="token punctuation">(</span>x<span class="token punctuation">,</span> cutOffPoints<span class="token punctuation">)</span><span class="token punctuation">)</span>
    df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span> <span class="token operator">=</span> groupedvalues
    <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span>regroup<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df2<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
    <span class="token punctuation">[</span>minBadRate<span class="token punctuation">,</span> maxBadRate<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token builtin">min</span><span class="token punctuation">(</span>binBadRate<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">,</span><span class="token builtin">max</span><span class="token punctuation">(</span>binBadRate<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
    <span class="token keyword">while</span> minBadRate <span class="token operator">==</span><span class="token number">0</span> <span class="token operator">or</span> maxBadRate <span class="token operator">==</span> <span class="token number">1</span><span class="token punctuation">:</span>
        <span class="token comment"># 找出全部为好/坏样本的箱</span>
        indexForBad01 <span class="token operator">=</span> regroup<span class="token punctuation">[</span>regroup<span class="token punctuation">[</span><span class="token string">'bad_rate'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">,</span><span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">]</span><span class="token punctuation">.</span>temp_Bin<span class="token punctuation">.</span>tolist<span class="token punctuation">(</span><span class="token punctuation">)</span>
        <span class="token builtin">bin</span><span class="token operator">=</span>indexForBad01<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
        <span class="token comment"># 如果是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除</span>
        <span class="token keyword">if</span> <span class="token builtin">bin</span> <span class="token operator">==</span> <span class="token builtin">max</span><span class="token punctuation">(</span>regroup<span class="token punctuation">.</span>temp_Bin<span class="token punctuation">)</span><span class="token punctuation">:</span>
            cutOffPoints <span class="token operator">=</span> cutOffPoints<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span>
        <span class="token comment"># 如果是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除</span>
        <span class="token keyword">elif</span> <span class="token builtin">bin</span> <span class="token operator">==</span> <span class="token builtin">min</span><span class="token punctuation">(</span>regroup<span class="token punctuation">.</span>temp_Bin<span class="token punctuation">)</span><span class="token punctuation">:</span>
            cutOffPoints <span class="token operator">=</span> cutOffPoints<span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">:</span><span class="token punctuation">]</span>
        <span class="token comment"># 如果是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值</span>
        <span class="token keyword">else</span><span class="token punctuation">:</span>
            <span class="token comment"># 和前一箱进行合并,并且计算卡方值</span>
            currentIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>regroup<span class="token punctuation">.</span>temp_Bin<span class="token punctuation">)</span><span class="token punctuation">.</span>index<span class="token punctuation">(</span><span class="token builtin">bin</span><span class="token punctuation">)</span>
            prevIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>regroup<span class="token punctuation">.</span>temp_Bin<span class="token punctuation">)</span><span class="token punctuation">[</span>currentIndex <span class="token operator">-</span> <span class="token number">1</span><span class="token punctuation">]</span>
            df3 <span class="token operator">=</span> df2<span class="token punctuation">.</span>loc<span class="token punctuation">[</span>df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span><span class="token punctuation">[</span>prevIndex<span class="token punctuation">,</span> <span class="token builtin">bin</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
            <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> df2b<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df3<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
            chisq1 <span class="token operator">=</span> Chi2<span class="token punctuation">(</span>df2b<span class="token punctuation">,</span> <span class="token string">'total'</span><span class="token punctuation">,</span> <span class="token string">'bad'</span><span class="token punctuation">)</span>
            <span class="token comment"># 和后一箱进行合并,并且计算卡方值</span>
            laterIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>regroup<span class="token punctuation">.</span>temp_Bin<span class="token punctuation">)</span><span class="token punctuation">[</span>currentIndex <span class="token operator">+</span> <span class="token number">1</span><span class="token punctuation">]</span>
            df3b <span class="token operator">=</span> df2<span class="token punctuation">.</span>loc<span class="token punctuation">[</span>df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span><span class="token punctuation">[</span>laterIndex<span class="token punctuation">,</span> <span class="token builtin">bin</span><span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
            <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> df2b<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df3b<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
            chisq2 <span class="token operator">=</span> Chi2<span class="token punctuation">(</span>df2b<span class="token punctuation">,</span> <span class="token string">'total'</span><span class="token punctuation">,</span> <span class="token string">'bad'</span><span class="token punctuation">)</span>
            <span class="token keyword">if</span> chisq1 <span class="token operator">&lt;</span> chisq2<span class="token punctuation">:</span>
                cutOffPoints<span class="token punctuation">.</span>remove<span class="token punctuation">(</span>cutOffPoints<span class="token punctuation">[</span>currentIndex <span class="token operator">-</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
            <span class="token keyword">else</span><span class="token punctuation">:</span>
                cutOffPoints<span class="token punctuation">.</span>remove<span class="token punctuation">(</span>cutOffPoints<span class="token punctuation">[</span>currentIndex<span class="token punctuation">]</span><span class="token punctuation">)</span>
        <span class="token comment"># 完成合并之后,需要再次计算新的分箱准则下,每箱是否同时包含好坏样本</span>
        groupedvalues <span class="token operator">=</span> df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> AssignBin<span class="token punctuation">(</span>x<span class="token punctuation">,</span> cutOffPoints<span class="token punctuation">)</span><span class="token punctuation">)</span>
        df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span> <span class="token operator">=</span> groupedvalues
        <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> regroup<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df2<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
        <span class="token punctuation">[</span>minBadRate<span class="token punctuation">,</span> maxBadRate<span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token builtin">min</span><span class="token punctuation">(</span>binBadRate<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">,</span> <span class="token builtin">max</span><span class="token punctuation">(</span>binBadRate<span class="token punctuation">.</span>values<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
    <span class="token comment"># 需要检查分箱后的最小占比</span>
    <span class="token keyword">if</span> minBinPcnt <span class="token operator">&gt;</span> <span class="token number">0</span><span class="token punctuation">:</span>
        groupedvalues <span class="token operator">=</span> df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> AssignBin<span class="token punctuation">(</span>x<span class="token punctuation">,</span> cutOffPoints<span class="token punctuation">)</span><span class="token punctuation">)</span>
        df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span> <span class="token operator">=</span> groupedvalues
        valueCounts <span class="token operator">=</span> groupedvalues<span class="token punctuation">.</span>value_counts<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>to_frame<span class="token punctuation">(</span><span class="token punctuation">)</span>
        N <span class="token operator">=</span> <span class="token builtin">sum</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
        valueCounts<span class="token punctuation">[</span><span class="token string">'pcnt'</span><span class="token punctuation">]</span> <span class="token operator">=</span> valueCounts<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> x <span class="token operator">*</span> <span class="token number">1.0</span> <span class="token operator">/</span> N<span class="token punctuation">)</span>
        valueCounts <span class="token operator">=</span> valueCounts<span class="token punctuation">.</span>sort_index<span class="token punctuation">(</span><span class="token punctuation">)</span>
        minPcnt <span class="token operator">=</span> <span class="token builtin">min</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">[</span><span class="token string">'pcnt'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
        <span class="token keyword">while</span> minPcnt <span class="token operator">&lt;</span> minBinPcnt <span class="token operator">and</span> <span class="token builtin">len</span><span class="token punctuation">(</span>cutOffPoints<span class="token punctuation">)</span> <span class="token operator">&gt;</span> <span class="token number">2</span><span class="token punctuation">:</span>
            <span class="token comment"># 找出占比最小的箱</span>
            indexForMinPcnt <span class="token operator">=</span> valueCounts<span class="token punctuation">[</span>valueCounts<span class="token punctuation">[</span><span class="token string">'pcnt'</span><span class="token punctuation">]</span> <span class="token operator">==</span> minPcnt<span class="token punctuation">]</span><span class="token punctuation">.</span>index<span class="token punctuation">.</span>tolist<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span>
            <span class="token comment"># 如果占比最小的箱是最后一箱,则需要和上一个箱进行合并,也就意味着分裂点cutOffPoints中的最后一个需要移除</span>
            <span class="token keyword">if</span> indexForMinPcnt <span class="token operator">==</span> <span class="token builtin">max</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">.</span>index<span class="token punctuation">)</span><span class="token punctuation">:</span>
                cutOffPoints <span class="token operator">=</span> cutOffPoints<span class="token punctuation">[</span><span class="token punctuation">:</span><span class="token operator">-</span><span class="token number">1</span><span class="token punctuation">]</span>
            <span class="token comment"># 如果占比最小的箱是第一箱,则需要和下一个箱进行合并,也就意味着分裂点cutOffPoints中的第一个需要移除</span>
            <span class="token keyword">elif</span> indexForMinPcnt <span class="token operator">==</span> <span class="token builtin">min</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">.</span>index<span class="token punctuation">)</span><span class="token punctuation">:</span>
                cutOffPoints <span class="token operator">=</span> cutOffPoints<span class="token punctuation">[</span><span class="token number">1</span><span class="token punctuation">:</span><span class="token punctuation">]</span>
            <span class="token comment"># 如果占比最小的箱是中间的某一箱,则需要和前后中的一个箱进行合并,依据是较小的卡方值</span>
            <span class="token keyword">else</span><span class="token punctuation">:</span>
                <span class="token comment"># 和前一箱进行合并,并且计算卡方值</span>
                currentIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">.</span>index<span class="token punctuation">)</span><span class="token punctuation">.</span>index<span class="token punctuation">(</span>indexForMinPcnt<span class="token punctuation">)</span>
                prevIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">.</span>index<span class="token punctuation">)</span><span class="token punctuation">[</span>currentIndex <span class="token operator">-</span> <span class="token number">1</span><span class="token punctuation">]</span>
                df3 <span class="token operator">=</span> df2<span class="token punctuation">.</span>loc<span class="token punctuation">[</span>df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span><span class="token punctuation">[</span>prevIndex<span class="token punctuation">,</span> indexForMinPcnt<span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
                <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> df2b<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df3<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
                chisq1 <span class="token operator">=</span> Chi2<span class="token punctuation">(</span>df2b<span class="token punctuation">,</span> <span class="token string">'total'</span><span class="token punctuation">,</span> <span class="token string">'bad'</span><span class="token punctuation">)</span>
                <span class="token comment"># 和后一箱进行合并,并且计算卡方值</span>
                laterIndex <span class="token operator">=</span> <span class="token builtin">list</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">.</span>index<span class="token punctuation">)</span><span class="token punctuation">[</span>currentIndex <span class="token operator">+</span> <span class="token number">1</span><span class="token punctuation">]</span>
                df3b <span class="token operator">=</span> df2<span class="token punctuation">.</span>loc<span class="token punctuation">[</span>df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span><span class="token punctuation">.</span>isin<span class="token punctuation">(</span><span class="token punctuation">[</span>laterIndex<span class="token punctuation">,</span> indexForMinPcnt<span class="token punctuation">]</span><span class="token punctuation">)</span><span class="token punctuation">]</span>
                <span class="token punctuation">(</span>binBadRate<span class="token punctuation">,</span> df2b<span class="token punctuation">)</span> <span class="token operator">=</span> BinBadRate<span class="token punctuation">(</span>df3b<span class="token punctuation">,</span> <span class="token string">'temp_Bin'</span><span class="token punctuation">,</span> target<span class="token punctuation">)</span>
                chisq2 <span class="token operator">=</span> Chi2<span class="token punctuation">(</span>df2b<span class="token punctuation">,</span> <span class="token string">'total'</span><span class="token punctuation">,</span> <span class="token string">'bad'</span><span class="token punctuation">)</span>
                <span class="token keyword">if</span> chisq1 <span class="token operator">&lt;</span> chisq2<span class="token punctuation">:</span>
                    cutOffPoints<span class="token punctuation">.</span>remove<span class="token punctuation">(</span>cutOffPoints<span class="token punctuation">[</span>currentIndex <span class="token operator">-</span> <span class="token number">1</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
                <span class="token keyword">else</span><span class="token punctuation">:</span>
                    cutOffPoints<span class="token punctuation">.</span>remove<span class="token punctuation">(</span>cutOffPoints<span class="token punctuation">[</span>currentIndex<span class="token punctuation">]</span><span class="token punctuation">)</span>
            groupedvalues <span class="token operator">=</span> df2<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> AssignBin<span class="token punctuation">(</span>x<span class="token punctuation">,</span> cutOffPoints<span class="token punctuation">)</span><span class="token punctuation">)</span>
            df2<span class="token punctuation">[</span><span class="token string">'temp_Bin'</span><span class="token punctuation">]</span> <span class="token operator">=</span> groupedvalues
            valueCounts <span class="token operator">=</span> groupedvalues<span class="token punctuation">.</span>value_counts<span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span>to_frame<span class="token punctuation">(</span><span class="token punctuation">)</span>
            valueCounts<span class="token punctuation">[</span><span class="token string">'pcnt'</span><span class="token punctuation">]</span> <span class="token operator">=</span> valueCounts<span class="token punctuation">[</span><span class="token string">'temp'</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">apply</span><span class="token punctuation">(</span><span class="token keyword">lambda</span> x<span class="token punctuation">:</span> x <span class="token operator">*</span> <span class="token number">1.0</span> <span class="token operator">/</span> N<span class="token punctuation">)</span>
            valueCounts <span class="token operator">=</span> valueCounts<span class="token punctuation">.</span>sort_index<span class="token punctuation">(</span><span class="token punctuation">)</span>
            minPcnt <span class="token operator">=</span> <span class="token builtin">min</span><span class="token punctuation">(</span>valueCounts<span class="token punctuation">[</span><span class="token string">'pcnt'</span><span class="token punctuation">]</span><span class="token punctuation">)</span>
    cutOffPoints <span class="token operator">=</span> special_attribute <span class="token operator">+</span> cutOffPoints
    <span class="token keyword">return</span> cutOffPoints

def BadRateEncoding(df, col, target):
    '''
    Encode the values of column `col` in `df` with a per-value bad rate.

    :param df: the data set holding both `col` and `target`
    :param col: name of the column to encode
    :param target: name of the bad-sample label column
    :return: a dict with two entries:
             'encoding' - the result of mapping every value of df[col] to its bad rate,
             'bad_rate' - the {value: bad rate} lookup table used for that mapping
    '''
    # Only the second element returned by BinBadRate is used; it is read as a
    # table that has a `col` column and a 'bad_rate' column.
    # NOTE(review): BinBadRate is defined elsewhere in this file - confirm its contract there.
    regroup = BinBadRate(df, col, target, grantRateIndicator=0)[1]
    per_value = regroup[[col, 'bad_rate']].set_index([col]).to_dict(orient='index')
    # to_dict(orient='index') yields {value: {'bad_rate': rate}}; flatten to {value: rate}.
    br_dict = {k: v['bad_rate'] for k, v in per_value.items()}
    # Explicit dict lookup: a value of df[col] missing from the table raises KeyError.
    badRateEnconding = df[col].map(lambda x: br_dict[x])
    return {'encoding': badRateEnconding, 'bad_rate': br_dict}

def AssignBin(x, cutOffPoints, special_attribute=None):
    '''
    Return the label of the bin that a single value falls into.

    :param x: one value of the variable being binned
    :param cutOffPoints: the binning of that variable, expressed as cut-off points
    :param special_attribute: special values that do not take part in binning
                              (None, the default, means there are none)
    :return: the label 'Bin k' of the bin, with k starting at 0.
             For example, with cutOffPoints = [10, 20, 30]: x = 7 gives 'Bin 0',
             x = 23 gives 'Bin 2' and x = 35 gives 'Bin 3'.
             A special value gives a negative index: the i-th entry of
             special_attribute (counting from 1) is labelled 'Bin -i'.
             When no cut-off point remains, every non-special value is 'Bin 0'.
    :raises ValueError: if x cannot be ordered against the cut-off points (e.g. NaN)
    '''
    # None replaces a mutable default list in the signature.
    if special_attribute is None:
        special_attribute = []

    # Cut-off points that coincide with special values do not delimit regular bins.
    cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
    numBin = len(cutOffPoints2)

    if x in special_attribute:
        i = special_attribute.index(x) + 1
        return 'Bin {}'.format(0 - i)

    # No cut-off point left: there is a single bin holding everything.
    if numBin == 0:
        return 'Bin 0'

    if x <= cutOffPoints2[0]:
        return 'Bin 0'
    if x > cutOffPoints2[-1]:
        return 'Bin {}'.format(numBin)

    # Intervals are left-open, right-closed: (c[i], c[i+1]] is bin i+1.
    # Only numBin - 1 consecutive pairs exist, so stop one short of numBin.
    for i in range(numBin - 1):
        if cutOffPoints2[i] < x <= cutOffPoints2[i + 1]:
            return 'Bin {}'.format(i + 1)

    # Reached only when every comparison above was False, as happens for NaN.
    raise ValueError('cannot assign {!r} to a bin'.format(x))

def Ca

已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页