import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
from sklearn. ensemble import RandomForestRegressor
import seaborn as sns
from scipy import stats
import copy
from sklearn. model_selection import train_test_split
# Load the scorecard dataset and drop the first column (the exported row id),
# then summarize the remaining 11 columns.
train_data = pd.read_csv('data/ScorecardsData.csv').iloc[:, 1:]
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SeriousDlqin2yrs 150000 non-null int64
1 RevolvingUtilizationOfUnsecuredLines 150000 non-null float64
2 age 150000 non-null int64
3 NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64
4 DebtRatio 150000 non-null float64
5 MonthlyIncome 120268 non-null float64
6 NumberOfOpenCreditLinesAndLoans 150000 non-null int64
7 NumberOfTimes90DaysLate 150000 non-null int64
8 NumberRealEstateLoansOrLines 150000 non-null int64
9 NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64
10 NumberOfDependents 146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.6 MB
# Boxplot of the three past-due count columns (positions 3, 7, 9:
# 30-59, 90+, and 60-89 days past due) to eyeball extreme values.
train_box = train_data.iloc[:, [3, 7, 9]]
train_box.boxplot()
<AxesSubplot:>
# Hold out 20% before any cleaning so test-set rows never influence
# the training-side preprocessing.
train, test = train_test_split(train_data, test_size=0.2, random_state=100)

# Work on an explicit copy: assigning into the slice returned by
# train_test_split triggers SettingWithCopyWarning.
train = train.copy()
train = train.dropna()
train = train.drop_duplicates()

# Flip the label so 1 = good customer, 0 = serious delinquency.
train.loc[:, 'SeriousDlqin2yrs'] = 1 - train['SeriousDlqin2yrs']

# Drop rows with sentinel-style past-due counts (>= 90) and
# non-positive ages.
train = train[train['NumberOfTime30-59DaysPastDueNotWorse'] < 90]
train = train[train.age > 0]

# Quick look at the age distribution.
# NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
# sns.histplot(age, kde=True) is the modern replacement.
age = train['age']
sns.distplot(age)
<AxesSubplot:xlabel='age'>
# Column 0 is the (flipped) label; everything else is a feature.
train_y = train.iloc[:, 0]
train_X = train.iloc[:, 1:]
# Clean the hold-out set; copy first to avoid SettingWithCopyWarning
# on the train_test_split slice.
# NOTE(review): unlike the training set, the past-due < 90 and age > 0
# filters are NOT applied here — confirm this is intended.
test = test.copy()
test = test.dropna()
test = test.drop_duplicates()
test.info()

# Flip the label to match the training convention: 1 = good, 0 = default.
test.loc[:, 'SeriousDlqin2yrs'] = 1 - test['SeriousDlqin2yrs']
test_y = test.iloc[:, 0]
test_X = test.iloc[:, 1:]

# Accumulators filled as a side effect by feature_woe_iv below:
# one IV per examined feature; `cut` is declared but never filled here.
informationValue = []
cut = []
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24034 entries, 149311 to 6008
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SeriousDlqin2yrs 24034 non-null int64
1 RevolvingUtilizationOfUnsecuredLines 24034 non-null float64
2 age 24034 non-null int64
3 NumberOfTime30-59DaysPastDueNotWorse 24034 non-null int64
4 DebtRatio 24034 non-null float64
5 MonthlyIncome 24034 non-null float64
6 NumberOfOpenCreditLinesAndLoans 24034 non-null int64
7 NumberOfTimes90DaysLate 24034 non-null int64
8 NumberRealEstateLoansOrLines 24034 non-null int64
9 NumberOfTime60-89DaysPastDueNotWorse 24034 non-null int64
10 NumberOfDependents 24034 non-null float64
dtypes: float64(4), int64(7)
memory usage: 2.2 MB
from sklearn. tree import DecisionTreeClassifier
def optimal_binning_boundary(x, y, max_leaf_nodes=6, min_samples_leaf=0.05,
                             fill_value=-1):
    """Return optimal bin edges for feature ``x`` against binary target ``y``.

    Fits a small entropy-criterion decision tree on the single feature and
    uses the thresholds of its internal nodes as bin boundaries, extended
    with the feature's min and max (+0.1) so every observed value falls
    inside some bin.

    Parameters
    ----------
    x : pd.Series
        Feature values; NaNs are replaced with ``fill_value``.
    y : pd.Series
        Binary target aligned with ``x``.
    max_leaf_nodes : int, default 6
        Upper bound on the number of bins (tree leaves).
    min_samples_leaf : float, default 0.05
        Minimum fraction of samples per bin.
    fill_value : scalar, default -1
        Placeholder substituted for missing values before fitting.

    Returns
    -------
    list
        Ascending edges ``[min(x), t1, ..., tk, max(x) + 0.1]``.
    """
    x = x.fillna(fill_value).values
    y = y.values

    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(x.reshape(-1, 1), y)

    tree = clf.tree_
    # Internal nodes are exactly those whose left/right children differ;
    # their thresholds are the learned split points.
    is_internal = tree.children_left != tree.children_right
    boundary = sorted(tree.threshold[is_internal].tolist())

    # Close the outer bins; +0.1 keeps the maximum strictly inside the
    # final right-open interval used by pd.cut(..., right=False).
    return [x.min()] + boundary + [x.max() + 0.1]
def feature_woe_iv(x, y) -> pd.DataFrame:
    """Compute per-bin WOE and IV for feature ``x`` against binary ``y``.

    Bins come from optimal_binning_boundary(). As a side effect the
    feature's total IV is appended to the module-level list
    ``informationValue`` (which is printed *before* the append, so the
    printed list excludes the current feature).

    Returns (result_df, woe): the per-bin statistics frame and the WOE
    values rounded to 3 decimals.
    """
    boundary = optimal_binning_boundary(x, y)

    df = pd.concat([x, y], axis=1)
    df.columns = ['x', 'y']
    # Left-closed, right-open bins matching the boundary list.
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)

    grouped = df.groupby('bins')['y']
    result_df = pd.DataFrame({
        'good': grouped.agg(lambda s: (s == 1).sum()),
        'bad': grouped.agg(lambda s: (s == 0).sum()),
        'total': grouped.agg('count'),
    })

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()
    result_df['bad_rate'] = result_df['bad'] / result_df['total']
    # WOE = ln(good% / bad%); IV sums (good% - bad%) * WOE over bins.
    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']

    woe = list(result_df['woe'].round(3))
    print(f"该变量IV = {result_df['iv'].sum()}")

    iv = result_df['iv'].sum()
    print(informationValue)
    informationValue.append(iv)

    return result_df, woe
# WOE/IV binning for x1: RevolvingUtilizationOfUnsecuredLines.
result_df, x1_woe = feature_woe_iv(train_X['RevolvingUtilizationOfUnsecuredLines'], train_y)
result_df
该变量IV = 1.009962126764656
[]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 2.52e-05) 6034 217 6251 0.067479 0.032779 0.065087 0.034714 0.722008 0.025053 [2.52e-05, 0.142) 37270 704 37974 0.416792 0.106344 0.395394 0.018539 1.365905 0.424043 [0.142, 0.314) 13946 537 14483 0.155959 0.081118 0.150800 0.037078 0.653690 0.048923 [0.314, 0.501) 9424 587 10011 0.105389 0.088671 0.104237 0.058636 0.172730 0.002888 [0.501, 0.86) 11838 1497 13335 0.132385 0.226133 0.138847 0.112261 -0.535409 0.050193 [0.86, 29110.1) 10909 3078 13987 0.121996 0.464955 0.145636 0.220061 -1.337952 0.458862
# WOE/IV binning for x2: age.
result_df, x2_woe = feature_woe_iv(train_X['age'], train_y)
result_df
该变量IV = 0.19925987486906607
[1.009962126764656]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [21.0, 33.5) 10302 1250 11552 0.115208 0.188822 0.120282 0.108206 -0.494066 0.036370 [33.5, 42.5) 15065 1540 16605 0.168473 0.232628 0.172895 0.092743 -0.322668 0.020701 [42.5, 56.5) 30800 2526 33326 0.344438 0.381571 0.346998 0.075797 -0.102383 0.003802 [56.5, 63.5) 14276 745 15021 0.159649 0.112538 0.156402 0.049597 0.349691 0.016474 [63.5, 67.5) 6080 224 6304 0.067993 0.033837 0.065639 0.035533 0.697854 0.023836 [67.5, 103.1) 12898 335 13233 0.144239 0.050604 0.137785 0.025315 1.047437 0.098077
# WOE/IV binning for x3: NumberOfTime30-59DaysPastDueNotWorse.
result_df, x3_woe = feature_woe_iv(train_X['NumberOfTime30-59DaysPastDueNotWorse'], train_y)
result_df
该变量IV = 0.6885535264867229
[1.009962126764656, 0.19925987486906607]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.5) 76523 3344 79867 0.855761 0.505136 0.831593 0.041870 0.527163 0.184837 [0.5, 1.5) 9271 1613 10884 0.103678 0.243656 0.113327 0.148199 -0.854465 0.119606 [1.5, 13.1) 3627 1663 5290 0.040561 0.251208 0.055081 0.314367 -1.823478 0.384111
# WOE/IV binning for x4: DebtRatio.
result_df, x4_woe = feature_woe_iv(train_X['DebtRatio'], train_y)
result_df
该变量IV = 0.0893992422284528
[1.009962126764656, 0.19925987486906607, 0.6885535264867229]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.0163) 8128 353 8481 0.090896 0.053323 0.088306 0.041622 0.533342 0.020039 [0.0163, 0.201) 22511 1572 24083 0.251742 0.237462 0.250757 0.065274 0.058395 0.000834 [0.201, 0.386) 27183 1569 28752 0.303989 0.237009 0.299372 0.054570 0.248893 0.016671 [0.386, 0.505) 11724 880 12604 0.131110 0.132931 0.131236 0.069819 -0.013789 0.000025 [0.505, 0.654) 8032 808 8840 0.089822 0.122054 0.092044 0.091403 -0.306633 0.009883 [0.654, 61106.6) 11843 1438 13281 0.132441 0.217221 0.138285 0.108275 -0.494776 0.041947
# WOE/IV binning for x5: MonthlyIncome.
result_df, x5_woe = feature_woe_iv(train_X['MonthlyIncome'], train_y)
result_df
该变量IV = 0.09696580500519011
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 1302.5) 4518 310 4828 0.050525 0.046828 0.050270 0.064209 0.075992 0.000281 [1302.5, 3332.5) 16217 1795 18012 0.181356 0.271148 0.187545 0.099656 -0.402205 0.036115 [3332.5, 4838.5) 16956 1530 18486 0.189620 0.231118 0.192480 0.082765 -0.197906 0.008213 [4838.5, 6620.5) 17446 1273 18719 0.195100 0.192296 0.194906 0.068006 0.014474 0.000041 [6620.5, 10237.0) 20825 1122 21947 0.232887 0.169486 0.228517 0.051123 0.317781 0.020148 [10237.0, 3008750.1) 13459 590 14049 0.150513 0.089124 0.146281 0.041996 0.524021 0.032169
# WOE/IV binning for x6: NumberOfOpenCreditLinesAndLoans.
result_df, x6_woe = feature_woe_iv(train_X['NumberOfOpenCreditLinesAndLoans'], train_y)
result_df
该变量IV = 0.05605247509666227
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 2.5) 5952 849 6801 0.066562 0.128248 0.070814 0.124835 -0.655837 0.040456 [2.5, 3.5) 4819 443 5262 0.053891 0.066918 0.054789 0.084189 -0.216508 0.002821 [3.5, 4.5) 6448 496 6944 0.072108 0.074924 0.072302 0.071429 -0.038311 0.000108 [4.5, 5.5) 7563 552 8115 0.084577 0.083384 0.084495 0.068022 0.014215 0.000017 [5.5, 13.5) 50517 3204 53721 0.564934 0.483988 0.559355 0.059641 0.154650 0.012518 [13.5, 58.1) 14122 1076 15198 0.157927 0.162538 0.158245 0.070799 -0.028777 0.000133
# WOE/IV binning for x7: NumberOfTimes90DaysLate.
result_df, x7_woe = feature_woe_iv(train_X['NumberOfTimes90DaysLate'], train_y)
result_df
该变量IV = 0.7636872299659871
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011, 0.05605247509666227]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.5) 86365 4461 90826 0.965825 0.673867 0.9457 0.049116 0.359949 0.105090 [0.5, 17.1) 3056 2159 5215 0.034175 0.326133 0.0543 0.413998 -2.255798 0.658597
# WOE/IV binning for x8: NumberRealEstateLoansOrLines.
result_df, x8_woe = feature_woe_iv(train_X['NumberRealEstateLoansOrLines'], train_y)
result_df
该变量IV = 0.041023510733383756
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011, 0.05605247509666227, 0.7636872299659871]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.5) 31646 2893 34539 0.353899 0.437009 0.359628 0.083760 -0.210942 0.017531 [0.5, 1.5) 31464 1915 33379 0.351864 0.289275 0.347549 0.057371 0.195866 0.012259 [1.5, 2.5) 20136 1237 21373 0.225182 0.186858 0.222540 0.057877 0.186560 0.007150 [2.5, 54.1) 6175 575 6750 0.069055 0.086858 0.070282 0.085185 -0.229366 0.004083
# WOE/IV binning for x9: NumberOfTime60-89DaysPastDueNotWorse.
result_df, x9_woe = feature_woe_iv(train_X['NumberOfTime60-89DaysPastDueNotWorse'], train_y)
result_df
该变量IV = 0.5139598716969411
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011, 0.05605247509666227, 0.7636872299659871, 0.041023510733383756]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.5) 86234 4882 91116 0.96436 0.737462 0.94872 0.053580 0.268249 0.060865 [0.5, 11.1) 3187 1738 4925 0.03564 0.262538 0.05128 0.352893 -1.996915 0.453095
# WOE/IV binning for x10: NumberOfDependents.
result_df, x10_woe = feature_woe_iv(train_X['NumberOfDependents'], train_y)
result_df
该变量IV = 0.033540678987747465
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011, 0.05605247509666227, 0.7636872299659871, 0.041023510733383756, 0.5139598716969411]
good bad total good_pct bad_pct total_pct bad_rate woe iv bins [0.0, 0.5) 49009 3068 52077 0.548070 0.463444 0.542237 0.058913 0.167718 0.014193 [0.5, 1.5) 18063 1460 19523 0.202000 0.220544 0.203278 0.074784 -0.087831 0.001629 [1.5, 2.5) 13369 1160 14529 0.149506 0.175227 0.151279 0.079840 -0.158742 0.004083 [2.5, 20.1) 8980 932 9912 0.100424 0.140785 0.103206 0.094027 -0.337838 0.013636
# Correlation heatmap across all 11 raw columns (label + 10 features);
# x axis is relabeled x0..x10 for compactness.
corr = train_data.corr()
xticks = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
yticks = list(corr.index)

fig, ax1 = plt.subplots()
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1,
            annot_kws={'size': 5, 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()
[Figure output_18_0.png: correlation heatmap of the 11 variables — original image link is broken (hotlink-protected source).]
# Bar chart of the ten IV values accumulated by feature_woe_iv (x1..x10).
print(informationValue)
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
ax = plt.bar(range(len(index)), informationValue, tick_label=index)
plt.show()
[1.009962126764656, 0.19925987486906607, 0.6885535264867229, 0.0893992422284528, 0.09696580500519011, 0.05605247509666227, 0.7636872299659871, 0.041023510733383756, 0.5139598716969411, 0.033540678987747465]
[Figure output_19_1.png: bar chart of each feature's information value — original image link is broken (hotlink-protected source).]
Based on the IV values, features x2, x3, x7 and x9 are selected. (Note: the code below actually WOE-transforms x2, x4, x5, x6, x7 and x8, which does not match this selection — verify which set was intended.)
def trans_woe(var, var_name, x_woe, x_cut):
    """Map raw values of ``var[var_name]`` to their per-bin WOE values.

    Writes a new column ``<var_name>_woe`` onto ``var`` in place (and also
    returns ``var``). The first WOE covers everything <= x_cut[1]; the
    last covers everything above the cut at position len(x_woe) - 1; rows
    matching no branch (e.g. NaN) are left as NaN.
    """
    woe_col = var_name + '_woe'
    col = var[var_name]
    n = len(x_woe)
    for idx, w in enumerate(x_woe):
        if idx == 0:
            mask = col <= x_cut[1]
        elif idx < n - 1:
            mask = (col > x_cut[idx]) & (col <= x_cut[idx + 1])
        else:
            # Last bin is open-ended above x_cut[n - 1].
            mask = col > x_cut[n - 1]
        var.loc[mask, woe_col] = w
    return var
# Column names for features x1..x9.
x1_name = 'RevolvingUtilizationOfUnsecuredLines'
x2_name = 'age'
x3_name = 'NumberOfTime30-59DaysPastDueNotWorse'
x4_name = 'DebtRatio'
x5_name = 'MonthlyIncome'
x6_name = 'NumberOfOpenCreditLinesAndLoans'
x7_name = 'NumberOfTimes90DaysLate'
x8_name = 'NumberRealEstateLoansOrLines'
x9_name = 'NumberOfTime60-89DaysPastDueNotWorse'

# Hand-picked bin edges consumed by trans_woe; both ends open-ended.
# NOTE(review): x5_cut holds count-like edges while x9_cut holds
# income-like edges — these look swapped relative to the MonthlyIncome
# and NumberOfTime60-89... binning printed earlier; confirm intended.
x1_cut = [-np.inf, 0.137, 0.301, 0.501, 0.699, 0.86, 29110.1, np.inf]
x2_cut = [-np.inf, 21, 36, 46, 56, 63, 67, 107, np.inf]
x3_cut = [-np.inf, 0.5, 1.5, 13.1, np.inf]
x4_cut = [-np.inf, 0.0163, 0.423, 0.505, 0.654, 3.973, 329664.1, np.inf]
x5_cut = [-np.inf, 0, 0.5, 1.5, 13.1, np.inf]
x6_cut = [-np.inf, 2.5, 4.5, 5.5, 8.5, 14.5, 58.1, np.inf]
x7_cut = [-np.inf, 0.5, 17.1, np.inf]
x8_cut = [-np.inf, 0.5, 1.5, 2.5, 54.1, np.inf]
x9_cut = [-np.inf, 1265.5, 3600.5, 3614.5, 5300.5, 6449.5, 3008750.1, np.inf]
# WOE-encode the six features used by the model, then keep only the
# newly appended *_woe columns (they are the last six).
for _name, _woe, _cut in [(x2_name, x2_woe, x2_cut),
                          (x4_name, x4_woe, x4_cut),
                          (x5_name, x5_woe, x5_cut),
                          (x6_name, x6_woe, x6_cut),
                          (x7_name, x7_woe, x7_cut),
                          (x8_name, x8_woe, x8_cut)]:
    train_X = trans_woe(train_X, _name, _woe, _cut)
train_X = train_X.iloc[:, -6:]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # NOTE: re-import; already imported at top of file

# Peek at the six WOE-encoded training features.
train_X.head()
age_woe DebtRatio_woe MonthlyIncome_woe NumberOfOpenCreditLinesAndLoans_woe NumberOfTimes90DaysLate_woe NumberRealEstateLoansOrLines_woe 3302 0.350 0.058 0.318 0.155 0.36 0.187 112869 -0.102 0.058 0.318 0.155 0.36 0.196 124934 1.047 0.533 0.318 -0.656 0.36 -0.211 14047 0.350 0.058 0.318 -0.217 0.36 -0.211 101221 -0.102 0.058 0.318 0.155 0.36 0.187
import statsmodels.api as sm

# Fit a logistic regression on the WOE features with an explicit
# intercept; statsmodels prints the full coefficient table.
X1 = sm.add_constant(train_X)
logit = sm.Logit(train_y, X1)
result = logit.fit()
print(result.summary())
Optimization terminated successfully.
Current function value: 0.215753
Iterations 8
Logit Regression Results
==============================================================================
Dep. Variable: SeriousDlqin2yrs No. Observations: 96041
Model: Logit Df Residuals: 96034
Method: MLE Df Model: 6
Date: Thu, 08 Jul 2021 Pseudo R-squ.: 0.1399
Time: 09:02:25 Log-Likelihood: -20721.
converged: True LL-Null: -24093.
Covariance Type: nonrobust LLR p-value: 0.000
=======================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------
const 3.8840 0.145 26.733 0.000 3.599 4.169
age_woe 0.6528 0.030 21.499 0.000 0.593 0.712
DebtRatio_woe 1.6798 0.074 22.814 0.000 1.535 1.824
MonthlyIncome_woe -4.7477 0.459 -10.349 0.000 -5.647 -3.849
NumberOfOpenCreditLinesAndLoans_woe 0.2531 0.065 3.891 0.000 0.126 0.381
NumberOfTimes90DaysLate_woe 0.9489 0.013 73.846 0.000 0.924 0.974
NumberRealEstateLoansOrLines_woe 0.7415 0.072 10.329 0.000 0.601 0.882
=======================================================================================================
# Apply the identical WOE encoding to the hold-out features.
for _name, _woe, _cut in [(x2_name, x2_woe, x2_cut),
                          (x4_name, x4_woe, x4_cut),
                          (x5_name, x5_woe, x5_cut),
                          (x6_name, x6_woe, x6_cut),
                          (x7_name, x7_woe, x7_cut),
                          (x8_name, x8_woe, x8_cut)]:
    test_X = trans_woe(test_X, _name, _woe, _cut)
test_X = test_X.iloc[:, -6:]

from sklearn import metrics

# Score the hold-out set and plot the ROC curve with its AUC.
X3 = sm.add_constant(test_X)
resu = result.predict(X3)
fpr, tpr, threshold = metrics.roc_curve(test_y, resu)
rocauc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()
最终AUC如下图所示:
# Scorecard scaling: PDO points double the odds, and BASE_SCORE is
# anchored at odds of BASE_ODDS to 1.
PDO = 20
BASE_SCORE = 600
BASE_ODDS = 20
p = PDO / np.log(2)
q = BASE_SCORE - PDO * np.log(BASE_ODDS) / np.log(2)
def get_score(coe, woe, factor):
    """Convert one feature's WOE values into scorecard points.

    Each entry is round(coe * woe_i * factor, 0): the feature's logistic
    coefficient times the bin WOE, scaled by the PDO factor.
    """
    return [round(coe * w * factor, 0) for w in woe]
# Fitted Logit coefficients, in summary order:
# [const, age, DebtRatio, MonthlyIncome, OpenCreditLines, 90DaysLate, RealEstate].
x_coe = [3.8840, 0.6528, 1.6798, -4.7477, 0.2531, 0.9489, 0.7415]
# Base score = offset q plus the intercept's contribution.
baseScore = round(q + p * x_coe[0], 0)
print(baseScore)
基础分:626.0
# Per-bin scorecard points for each modeled feature (coefficient index
# matches the x_coe ordering above).
x2_score = get_score(x_coe[1], x2_woe, p)
x4_score = get_score(x_coe[2], x4_woe, p)
x5_score = get_score(x_coe[3], x5_woe, p)
x6_score = get_score(x_coe[4], x6_woe, p)
x7_score = get_score(x_coe[5], x7_woe, p)
x8_score = get_score(x_coe[6], x8_woe, p)
print(x2_score)
[-9.0, -6.0, -2.0, 7.0, 13.0, 20.0]
# Points per DebtRatio bin.
print(x4_score)
[26.0, 3.0, 12.0, -1.0, -15.0, -24.0]
# Points per MonthlyIncome bin.
print(x5_score)
[-10.0, 55.0, 27.0, -2.0, -44.0, -72.0]
# Points per NumberOfOpenCreditLinesAndLoans bin.
print(x6_score)
[-5.0, -2.0, -0.0, 0.0, 1.0, -0.0]
# Points per NumberOfTimes90DaysLate bin.
print(x7_score)
[10.0, -62.0]
# Points per NumberRealEstateLoansOrLines bin.
print(x8_score)
[-5.0, 4.0, 4.0, -5.0]