本帖是在2019年5月初入门python之时,选取的较为系统的练手案例,主要内容是信用风险计量体系之主体评级模型的开发过程(可用“四张卡”来表示,分别是A卡、B卡、C卡和F卡)。 如今再回顾,结合前几月股票市场被割韭菜的切身体会,应该能应用这个模型来做股票市场的风险评级模型分析/收益评级模型分析。已经跃跃欲试了。 ——2020.11.6补充说明
—— —— —— —— —— 【本帖技术层面的说明】
此帖是评分卡技术实现过程,含完整代码。另有文字报告版,见本人知乎专栏: https://zhuanlan.zhihu.com/p/67031799
本次项目主要参考:https://www.jianshu.com/p/f931a4df202c?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation 原帖子代码不全,本帖自行补上。并且在分析过程中,做了一些延伸探讨,或者战略性舍弃。 本帖大篇幅的定义说明文字,系参考原帖,在此感谢。
另有一处自定义函数(self_bin)代码系参考此帖,在此感谢 https://blog.csdn.net/sunyaowu315/article/details/82981216 —— —— —— —— ——
【正文开篇】
信用风险计量体系包括主体评级模型和债项评级两部分。 主体评级和债项评级均由一系列评级模型组成:其中主体评级模型可用“四张卡”来表示,分别是A卡、B卡、C卡和F卡;债项评级模型通常按照主体的融资用途,分为企业融资模型、现金流融资模型和项目融资模型等。 我们主要讨论主体评级模型的开发过程。
一、项目流程
二、数据获取
三、数据预处理
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import warnings
warnings. filterwarnings( "ignore" )
from sklearn. ensemble import RandomForestClassifier
from sklearn. ensemble import RandomForestRegressor
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
from numpy.core.umath_tests import inner1d
# Load the training set (cs-training.csv) from a hard-coded local path.
data = pd. read_csv( r'D:\2018_BigData\Python\Kaggle_learning\GiveMeSomeCredit\cs-training.csv' )
# Save the summary statistics to CSV, then display the same table in the notebook.
data. describe( ) . to_csv( r'D:\2018_BigData\Python\Kaggle_learning\GiveMeSomeCredit\DataDescribe.csv' )
data. describe( )
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 1.202690e+05 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000 mean 75000.500000 0.066840 6.048438 52.295207 0.421033 353.005076 6.670221e+03 8.452760 0.265973 1.018240 0.240387 0.757222 std 43301.414527 0.249746 249.755371 14.771866 4.192781 2037.818523 1.438467e+04 5.145951 4.169304 1.129771 4.155179 1.115086 min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 25% 37500.750000 0.000000 0.029867 41.000000 0.000000 0.175074 3.400000e+03 5.000000 0.000000 0.000000 0.000000 0.000000 50% 75000.500000 0.000000 0.154181 52.000000 0.000000 0.366508 5.400000e+03 8.000000 0.000000 1.000000 0.000000 0.000000 75% 112500.250000 0.000000 0.559046 63.000000 0.000000 0.868254 8.249000e+03 11.000000 0.000000 2.000000 0.000000 1.000000 max 150000.000000 1.000000 50708.000000 109.000000 98.000000 329664.000000 3.008750e+06 58.000000 98.000000 54.000000 98.000000 20.000000
# Keep only complete rows: any row with a missing value in any column is dropped.
data= data. dropna( )
# Remove exact duplicate rows.
# NOTE(review): the 'Unnamed: 0' row-id column is still present at this point, so
# rows that differ only by that id are not considered duplicates here — confirm
# whether the de-duplication is meant to happen after that column is dropped.
data = data. drop_duplicates( )
# Persist the remaining rows (the file is named MissingData.csv).
data. to_csv( r'D:\2018_BigData\Python\Kaggle_learning\GiveMeSomeCredit\MissingData.csv' , index= False )
data. shape
(120269, 12)
# Discard rows whose age is not positive.
data = data[ data[ 'age' ] > 0 ]
# One row of three axes, one per past-due counter column.
fig, axes = plt. subplots( 1 , 3 )
# Colours for the individual box-plot elements.
color = dict ( boxes= 'DarkGreen' , whiskers= 'DarkOrange' ,
medians= 'DarkBlue' , caps= 'Red' )
# The three past-due counters that are inspected for outliers.
datatemp1= data[ [ "NumberOfTime30-59DaysPastDueNotWorse" , "NumberOfTimes90DaysLate" , "NumberOfTime60-89DaysPastDueNotWorse" ] ]
# Draw each column as its own box plot on the pre-created axes.
datatemp1. plot( kind= 'box' , ax= axes, subplots= True ,
title= '3 Different boxplots' , color= color, sym= 'r+' )
axes[ 0 ] . set_ylabel( 'NumberOfTime30-59DaysPastDueNotWorse' )
axes[ 1 ] . set_ylabel( 'NumberOfTimes90DaysLate' )
axes[ 2 ] . set_ylabel( 'NumberOfTime60-89DaysPastDueNotWorse' )
# Widen the gaps between the subplots.
fig. subplots_adjust( wspace= 3 , hspace= 1 )
# Print the distinct values of each counter so unusual codes become visible.
print ( np. unique( datatemp1[ "NumberOfTime30-59DaysPastDueNotWorse" ] ) )
print ( np. unique( datatemp1[ "NumberOfTimes90DaysLate" ] ) )
print ( np. unique( datatemp1[ "NumberOfTime60-89DaysPastDueNotWorse" ] ) )
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 96 98]
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 96 98]
[ 0 1 2 3 4 5 6 7 8 9 11 96 98]
data. head( 2 )
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
# Keep only rows whose 30-59-day past-due count is below 90.
# NOTE(review): only this one column is filtered; presumably the same rows carry
# the large codes in the other two past-due columns — confirm.
data = data[ data[ 'NumberOfTime30-59DaysPastDueNotWorse' ] < 90 ]
# Invert the 0/1 target in place (new value = 1 - old value). The binning
# functions later in the file sum this column to obtain the "good" count.
data[ 'SeriousDlqin2yrs' ] = 1 - data[ 'SeriousDlqin2yrs' ]
data. head( 2 )
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 0 1 0 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 1 2 1 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
# Drop the 'Unnamed: 0' column so that the target becomes the first column.
data= data. drop( [ "Unnamed: 0" ] , axis= 1 )
data. head( 2 )
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 0 0 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 1 1 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
# Target is the SeriousDlqin2yrs column; features are every column after the first.
Y = data['SeriousDlqin2yrs']
# DataFrame.ix was removed in pandas 1.0; .iloc performs the same positional slice.
X = data.iloc[:, 1:]
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Re-attach the target as the first column of each partition.
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)
test.head()
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 110247 1 1.061975 24 0 0.129502 4802.0 4 0 0 0 0.0 62343 1 0.144533 74 0 0.358967 3601.0 12 0 2 0 0.0 89351 1 0.012432 49 0 0.226289 10300.0 4 0 1 0 0.0 119386 1 0.302073 45 0 0.472505 5000.0 5 0 1 0 0.0 19054 1 0.000000 34 0 0.184868 1400.0 5 0 0 0 1.0
test[ 'SeriousDlqin2yrs' ] . head( )
110247 1
62343 1
89351 1
119386 1
19054 1
Name: SeriousDlqin2yrs, dtype: int64
test[ 'SeriousDlqin2yrs' ] . value_counts( )
1 33589
0 2447
Name: SeriousDlqin2yrs, dtype: int64
# Number of test rows per target class.
clasTest = test. groupby( [ 'SeriousDlqin2yrs' ] ) [ 'SeriousDlqin2yrs' ] . count( )
# Persist both partitions to hard-coded local paths.
train. to_csv( r'D:\2018_BigData\Python\Kaggle_learning\GiveMeSomeCredit\TrainData.csv' , index= False )
test. to_csv( r'D:\2018_BigData\Python\Kaggle_learning\GiveMeSomeCredit\TestData.csv' , index= False )
clasTest. head( )
SeriousDlqin2yrs
0 2447
1 33589
Name: SeriousDlqin2yrs, dtype: int64
四、探索性分析
# Two stacked histograms: age on top, MonthlyIncome below.
plt. figure( figsize= ( 12 , 8 ) )
# Number of histogram bins shared by the plots in this cell.
bins= 30
plt. subplot( 211 )
ax1= plt. hist( data. age, bins, color= "green" , alpha= 0.8 , rwidth= 0.9 )
plt. title( "Age distribution" )
plt. ylabel( '# of age' , fontsize= 12 )
plt. xlabel( 'age' , fontsize= 12 )
plt. subplot( 212 )
ax2= plt. hist( data. MonthlyIncome, bins, color= "green" , alpha= 0.8 , rwidth= 0.9 )
plt. title( "MonthlyIncome distribution" )
plt. ylabel( '# of MonthlyIncome' , fontsize= 12 )
plt. xlabel( 'MonthlyIncome' , fontsize= 12 )
plt. show( )
# Box plot of MonthlyIncome to expose extreme values.
datatemp2= data[ "MonthlyIncome" ]
datatemp2. plot( kind= 'box' , title= 'MonthlyIncome Distribution' , sym= 'r+' ) ;
# Count the rows with income above 50000, then keep only rows strictly below 50000.
# NOTE(review): rows with income exactly 50000 are neither counted nor kept.
print ( data[ data[ 'MonthlyIncome' ] > 50000 ] . count( ) )
data = data[ data[ 'MonthlyIncome' ] < 50000 ]
# Re-plot the income histogram on the filtered data.
plt. figure( figsize= ( 15 , 5 ) )
plt. hist( data. MonthlyIncome, bins, color= "green" , alpha= 0.8 , rwidth= 0.9 )
plt. title( "MonthlyIncome distribution" )
plt. ylabel( '# of MonthlyIncome' , fontsize= 12 )
plt. xlabel( 'MonthlyIncome' , fontsize= 12 )
plt. show( )
SeriousDlqin2yrs 301
RevolvingUtilizationOfUnsecuredLines 301
age 301
NumberOfTime30-59DaysPastDueNotWorse 301
DebtRatio 301
MonthlyIncome 301
NumberOfOpenCreditLinesAndLoans 301
NumberOfTimes90DaysLate 301
NumberRealEstateLoansOrLines 301
NumberOfTime60-89DaysPastDueNotWorse 301
NumberOfDependents 301
dtype: int64
# Histogram of every column.
data.hist(bins=50, figsize=(20, 15))
plt.show()
# Box plot of every column. DataFrame.plot opens its own figure, so the size is
# passed through figsize; a preceding plt.figure(figsize=...) would stay empty
# and leave the box plot at the default size.
data.plot(kind='box', title='Various Var Distribution', sym='r+', figsize=(16, 8));
<Figure size 1152x576 with 0 Axes>
五、变量选择
from scipy import stats
def mono_bin(Y, X, n=20):
    """Quantile-bin ``X`` and coarsen the bins until the good rate is monotonic.

    Starting with ``n`` quantile buckets (duplicate edges dropped), the bucket
    count is reduced by one per iteration until the Spearman correlation
    between the per-bucket mean of ``X`` and the per-bucket mean of ``Y`` is
    +/-1 (or NaN). The resulting table is printed and returned.

    Args:
        Y: target Series, expected to hold 0/1 values; its sum is taken as the
            number of "good" observations.
        X: numeric Series aligned with ``Y``.
        n: initial number of quantile buckets.

    Returns:
        DataFrame with one row per bucket, ordered by ``min`` and re-indexed
        from 0, with columns ``min``, ``max``, ``sum`` (goods in the bucket),
        ``total``, ``rate`` (good rate) and ``woe``
        (``log((rate / (1 - rate)) / (good / bad))``).
    """
    good = Y.sum()
    bad = Y.count() - good
    r = 0
    # Compare against 1 with a small tolerance: a perfectly monotonic binning
    # can yield |r| marginally below 1 through floating-point rounding, which
    # would otherwise throw away bins unnecessarily. A NaN correlation still
    # ends the loop because the comparison is False.
    while np.abs(r) < 1 - 1e-9:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n, duplicates="drop")})
        d2 = d1.groupby('Bucket', as_index=True)
        bucket_means = d2.mean()
        r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n = n - 1
    d3 = pd.DataFrame({
        'min': d2.X.min(),
        'max': d2.X.max(),
        'sum': d2.Y.sum(),
        'total': d2.Y.count(),
        'rate': d2.Y.mean(),
    })
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    # sort_index(by=...) no longer exists in pandas; sort_values is the replacement.
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    print("=" * 60)
    print(d4)
    return d4
# Remove duplicate rows across all columns, keeping the first occurrence of each.
data= data. drop_duplicates( subset= None , keep= 'first' , inplace= False )
data. shape
(119703, 11)
# Monotonic binning table for RevolvingUtilizationOfUnsecuredLines (printed by mono_bin).
mono_bin( data. SeriousDlqin2yrs, data. RevolvingUtilizationOfUnsecuredLines)
============================================================
min max sum total rate woe
0 0.000000 0.035034 29333 29926 0.980184 1.298275
1 0.035037 0.176771 29205 29926 0.975907 1.098457
2 0.176777 0.577036 28305 29925 0.945865 0.257613
3 0.577040 50708.000000 24607 29926 0.822262 -1.071254
min max sum total rate woe 0 0.000000 0.035034 29333 29926 0.980184 1.298275 1 0.035037 0.176771 29205 29926 0.975907 1.098457 2 0.176777 0.577036 28305 29925 0.945865 0.257613 3 0.577040 50708.000000 24607 29926 0.822262 -1.071254
# Monotonic binning table for age (printed by mono_bin).
mono_bin( data. SeriousDlqin2yrs, data. age)
============================================================
min max sum total rate woe
0 21 30 7913 8885 0.890602 -0.506093
1 31 34 6640 7383 0.899363 -0.412828
2 35 38 7594 8386 0.905557 -0.342447
3 39 41 7131 7849 0.908523 -0.307262
4 42 43 4890 5362 0.911973 -0.265031
5 44 46 8163 8868 0.920501 -0.153830
6 47 48 5776 6274 0.920625 -0.152133
7 49 51 8545 9280 0.920797 -0.149768
8 52 53 5454 5901 0.924250 -0.101453
9 54 56 7922 8463 0.936075 0.080980
10 57 59 7517 7946 0.946011 0.260466
11 60 61 4942 5200 0.950385 0.349567
12 62 64 7464 7776 0.959877 0.571844
13 65 68 6968 7212 0.966167 0.748916
14 69 75 7911 8141 0.971748 0.934931
15 76 103 6620 6777 0.976833 1.138606
min max sum total rate woe 0 21 30 7913 8885 0.890602 -0.506093 1 31 34 6640 7383 0.899363 -0.412828 2 35 38 7594 8386 0.905557 -0.342447 3 39 41 7131 7849 0.908523 -0.307262 4 42 43 4890 5362 0.911973 -0.265031 5 44 46 8163 8868 0.920501 -0.153830 6 47 48 5776 6274 0.920625 -0.152133 7 49 51 8545 9280 0.920797 -0.149768 8 52 53 5454 5901 0.924250 -0.101453 9 54 56 7922 8463 0.936075 0.080980 10 57 59 7517 7946 0.946011 0.260466 11 60 61 4942 5200 0.950385 0.349567 12 62 64 7464 7776 0.959877 0.571844 13 65 68 6968 7212 0.966167 0.748916 14 69 75 7911 8141 0.971748 0.934931 15 76 103 6620 6777 0.976833 1.138606
# Monotonic binning table for MonthlyIncome (printed by mono_bin).
mono_bin( data. SeriousDlqin2yrs, data. MonthlyIncome)
============================================================
min max sum total rate woe
0 0.0 3400.0 27355 30073 0.909620 -0.293996
1 3401.0 5400.0 27655 30008 0.921588 -0.138884
2 5401.0 8200.0 27925 29725 0.939445 0.138736
3 8201.0 49750.0 28515 29897 0.953775 0.423899
min max sum total rate woe 0 0.0 3400.0 27355 30073 0.909620 -0.293996 1 3401.0 5400.0 27655 30008 0.921588 -0.138884 2 5401.0 8200.0 27925 29725 0.939445 0.138736 3 8201.0 49750.0 28515 29897 0.953775 0.423899
# Open-ended sentinels so the outermost bins cover every possible value.
pinf = float ( 'inf' )
ninf = float ( '-inf' )
# Hand-picked bin edges for the variables that are binned manually
# (the numeric suffix is the variable's x-index used throughout this file).
cutx3 = [ ninf, 0 , 1 , 3 , 5 , pinf]
cutx6 = [ ninf, 1 , 2 , 3 , 5 , pinf]
cutx7 = [ ninf, 0 , 1 , 3 , 5 , pinf]
cutx8 = [ ninf, 0 , 1 , 2 , 3 , pinf]
cutx9 = [ ninf, 0 , 1 , 3 , pinf]
cutx10 = [ ninf, 0 , 1 , 2 , 3 , 5 , pinf]
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr = data. corr( )
# Short labels x0..x10 for the x-axis; the y-axis keeps the full column names.
xticks = [ 'x0' , 'x1' , 'x2' , 'x3' , 'x4' , 'x5' , 'x6' , 'x7' , 'x8' , 'x9' , 'x10' ]
yticks = list ( corr. index)
fig = plt. figure( )
ax1 = fig. add_subplot( 1 , 1 , 1 )
sns. heatmap( corr, annot= True , cmap= 'rainbow' , ax= ax1, annot_kws= { 'size' : 9 , 'weight' : 'bold' , 'color' : 'blue' } )
ax1. set_xticklabels( xticks, rotation= 0 , fontsize= 10 )
ax1. set_yticklabels( yticks, rotation= 0 , fontsize= 10 )
plt. show( )
def mono_bin(Y, X, n=20):
    """Monotonic quantile binning of ``X`` with WOE, IV and cut points.

    Starting with ``n`` quantile buckets, the bucket count is reduced by one
    per iteration until the Spearman correlation between the per-bucket mean
    of ``X`` and the per-bucket mean of ``Y`` is +/-1 (or NaN). The resulting
    table is printed.

    Args:
        Y: target Series, expected to hold 0/1 values; its sum is taken as the
            number of "good" observations.
        X: numeric Series aligned with ``Y``.
        n: initial number of quantile buckets.

    Returns:
        Tuple ``(d4, iv, cut, woe)``:
        ``d4`` is the per-bucket table ordered by ``min`` (columns ``min``,
        ``max``, ``sum``, ``total``, ``rate``, ``woe``, ``goodattribute``,
        ``badattribute``); ``iv`` is the sum over buckets of
        ``(goodattribute - badattribute) * woe``; ``cut`` is the list of bin
        edges ``[-inf, q_1, ..., q_k, inf]`` with the quantiles rounded to four
        decimals; ``woe`` is the list of per-bucket WOE values rounded to
        three decimals.

    Raises:
        ValueError: propagated from ``pd.qcut`` when quantile edges are not
            unique (no ``duplicates`` handling is requested here).
    """
    good = Y.sum()
    bad = Y.count() - good
    r = 0
    # Compare against 1 with a small tolerance: a perfectly monotonic binning
    # can yield |r| marginally below 1 through floating-point rounding, which
    # would otherwise collapse the variable into fewer bins than necessary.
    # A NaN correlation still ends the loop because the comparison is False.
    while np.abs(r) < 1 - 1e-9:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True)
        bucket_means = d2.mean()
        r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n = n - 1
    d3 = pd.DataFrame({
        'min': d2.X.min(),
        'max': d2.X.max(),
        'sum': d2.Y.sum(),
        'total': d2.Y.count(),
        'rate': d2.Y.mean(),
    })
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # sort_index(by=...) no longer exists in pandas; sort_values is the replacement.
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    print("=" * 60)
    print(d4)
    # n was decremented once after the last binning, so the final table has
    # n + 1 buckets and therefore n interior quantile cut points.
    cut = [float('-inf')]
    cut.extend(round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
def self_bin(Y, X, cat):
    """Bin ``X`` on caller-supplied edges and compute WOE and IV per bin.

    Args:
        Y: target Series, expected to hold 0/1 values; its sum is taken as the
            number of "good" observations.
        X: numeric Series aligned with ``Y``.
        cat: bin edges passed to ``pd.cut`` (right-closed intervals by default).

    Returns:
        Tuple ``(d4, iv, woe)``: ``d4`` is the per-bin table indexed by the
        bin interval and ordered by ``min`` (columns ``min``, ``max``, ``sum``,
        ``total``, ``rate``, ``woe``, ``goodattribute``, ``badattribute``);
        ``iv`` is the sum over bins of ``(goodattribute - badattribute) * woe``;
        ``woe`` is the list of per-bin WOE values rounded to three decimals.
        The table is also printed.
    """
    good = Y.sum()
    bad = Y.count() - good
    d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})
    d2 = d1.groupby('Bucket', as_index=True)
    d3 = pd.DataFrame({
        'min': d2.X.min(),
        'max': d2.X.max(),
        'sum': d2.Y.sum(),
        'total': d2.Y.count(),
        'rate': d2.Y.mean(),
    })
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # sort_index(by=...) no longer exists in pandas; sort_values is the replacement.
    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe
# Automatic monotonic binning for x1, x2, x4 and x5; each call returns the bin
# table, the information value, the cut points and the rounded WOE list.
dfx1, ivx1, cutx1, woex1 = mono_bin( data. SeriousDlqin2yrs, data. RevolvingUtilizationOfUnsecuredLines, n= 10 )
dfx2, ivx2, cutx2, woex2 = mono_bin( data. SeriousDlqin2yrs, data. age, n= 10 )
dfx4, ivx4, cutx4, woex4 = mono_bin( data. SeriousDlqin2yrs, data. DebtRatio, n= 20 )
dfx5, ivx5, cutx5, woex5 = mono_bin( data. SeriousDlqin2yrs, data. MonthlyIncome, n= 10 )
# Hand-picked bin edges for the remaining variables (ninf / pinf are the
# -inf / +inf sentinels defined earlier in the file).
cutx3 = [ ninf, 0 , 1 , 3 , 5 , pinf]
cutx6 = [ ninf, 1 , 2 , 3 , 5 , pinf]
cutx7 = [ ninf, 0 , 1 , 3 , 5 , pinf]
cutx8 = [ ninf, 0 , 1 , 2 , 3 , pinf]
cutx9 = [ ninf, 0 , 1 , 3 , pinf]
cutx10 = [ ninf, 0 , 1 , 2 , 3 , 5 , pinf]
# Manual binning for x3 and x6..x10; each call returns the bin table, the
# information value and the rounded WOE list.
dfx3, ivx3, woex3 = self_bin( data. SeriousDlqin2yrs, data[ 'NumberOfTime30-59DaysPastDueNotWorse' ] , cutx3)
dfx6, ivx6, woex6= self_bin( data. SeriousDlqin2yrs, data[ 'NumberOfOpenCreditLinesAndLoans' ] , cutx6)
dfx7, ivx7, woex7 = self_bin( data. SeriousDlqin2yrs, data[ 'NumberOfTimes90DaysLate' ] , cutx7)
dfx8, ivx8, woex8 = self_bin( data. SeriousDlqin2yrs, data[ 'NumberRealEstateLoansOrLines' ] , cutx8)
dfx9, ivx9, woex9 = self_bin( data. SeriousDlqin2yrs, data[ 'NumberOfTime60-89DaysPastDueNotWorse' ] , cutx9)
dfx10, ivx10, woex10 = self_bin( data. SeriousDlqin2yrs, data[ 'NumberOfDependents' ] , cutx10)
============================================================
min max sum total rate woe goodattribute \
0 0.000000 0.035034 29333 29926 0.980184 1.298275 0.263194
1 0.035037 0.176771 29205 29926 0.975907 1.098457 0.262046
2 0.176777 0.577036 28305 29925 0.945865 0.257613 0.253970
3 0.577040 50708.000000 24607 29926 0.822262 -1.071254 0.220790
badattribute
0 0.071853
1 0.087362
2 0.196292
3 0.644493
============================================================
min max sum total rate woe goodattribute badattribute
0 21 33 12867 14407 0.893108 -0.480116 0.115451 0.186599
1 34 39 11600 12806 0.905825 -0.339303 0.104083 0.146129
2 40 44 12301 13454 0.914301 -0.235686 0.110372 0.139707
3 45 49 14269 15543 0.918034 -0.187071 0.128031 0.154368
4 50 53 11069 11978 0.924111 -0.103441 0.099318 0.110142
5 54 58 13016 13846 0.940055 0.149510 0.116788 0.100569
6 59 63 12629 13270 0.951696 0.377722 0.113315 0.077669
7 64 70 11804 12185 0.968732 0.830395 0.105913 0.046165
8 71 103 11895 12214 0.973882 1.015683 0.106729 0.038653
============================================================
min max sum total rate woe goodattribute badattribute
0 0.0 61106.5 111450 119703 0.931054 0.0 1.0 1.0
============================================================
min max sum total rate woe goodattribute \
0 0.0 3400.0 27355 30073 0.909620 -0.293996 0.245446
1 3401.0 5400.0 27655 30008 0.921588 -0.138884 0.248138
2 5401.0 8200.0 27925 29725 0.939445 0.138736 0.250561
3 8201.0 49750.0 28515 29897 0.953775 0.423899 0.255855
badattribute
0 0.329335
1 0.285108
2 0.218103
3 0.167454
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 0.0] 0 0 95431 99609 0.958056 0.525572 0.856267
(0.0, 1.0] 1 1 11501 13499 0.851989 -0.852712 0.103194
(1.0, 3.0] 2 3 3900 5466 0.713502 -1.690547 0.034993
(3.0, 5.0] 4 5 510 917 0.556161 -2.377402 0.004576
(5.0, inf] 6 13 108 212 0.509434 -2.565259 0.000969
badattribute
Bucket
(-inf, 0.0] 0.506240
(0.0, 1.0] 0.242094
(1.0, 3.0] 0.189749
(3.0, 5.0] 0.049315
(5.0, inf] 0.012601
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 1.0] 0 1 3255 3880 0.838918 -0.952803 0.029206
(1.0, 2.0] 2 2 4159 4613 0.901582 -0.388067 0.037317
(2.0, 3.0] 3 3 6020 6570 0.916286 -0.210075 0.054015
(3.0, 5.0] 4 5 17435 18729 0.930909 -0.002258 0.156438
(5.0, inf] 6 58 80581 85911 0.937959 0.112912 0.723024
badattribute
Bucket
(-inf, 1.0] 0.075730
(1.0, 2.0] 0.055010
(2.0, 3.0] 0.066642
(3.0, 5.0] 0.156791
(5.0, inf] 0.645826
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 0.0] 0 0 107586 113143 0.950885 0.360233 0.965330
(0.0, 1.0] 1 1 2857 4301 0.664264 -1.920644 0.025635
(1.0, 3.0] 2 3 844 1771 0.476567 -2.696800 0.007573
(3.0, 5.0] 4 5 111 344 0.322674 -3.344508 0.000996
(5.0, inf] 6 17 52 144 0.361111 -3.173544 0.000467
badattribute
Bucket
(-inf, 0.0] 0.673331
(0.0, 1.0] 0.174967
(1.0, 3.0] 0.112323
(3.0, 5.0] 0.028232
(5.0, inf] 0.011147
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 0.0] 0 0 39502 43118 0.916137 -0.212016 0.354437
(0.0, 1.0] 1 1 39268 41621 0.943466 0.211720 0.352337
(1.0, 2.0] 2 2 25027 26570 0.941927 0.183227 0.224558
(2.0, 3.0] 3 3 4974 5345 0.930589 -0.007222 0.044630
(3.0, inf] 4 54 2679 3049 0.878649 -0.623303 0.024038
badattribute
Bucket
(-inf, 0.0] 0.438144
(0.0, 1.0] 0.285108
(1.0, 2.0] 0.186962
(2.0, 3.0] 0.044953
(3.0, inf] 0.044832
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 0.0] 0 0 107494 113592 0.946317 0.266475 0.964504
(0.0, 1.0] 1 1 3331 4792 0.695117 -1.778848 0.029888
(1.0, 3.0] 2 3 571 1179 0.484309 -2.665785 0.005123
(3.0, inf] 4 11 54 140 0.385714 -3.068363 0.000485
badattribute
Bucket
(-inf, 0.0] 0.738883
(0.0, 1.0] 0.177027
(1.0, 3.0] 0.073670
(3.0, inf] 0.010420
============================================================
min max sum total rate woe goodattribute \
Bucket
(-inf, 0.0] 0.0 0.0 61292 65127 0.941115 0.168481 0.549951
(0.0, 1.0] 1.0 1.0 22482 24283 0.925833 -0.078626 0.201723
(1.0, 2.0] 2.0 2.0 16567 18033 0.918705 -0.178124 0.148650
(2.0, 3.0] 3.0 3.0 7922 8707 0.909843 -0.291284 0.071081
(3.0, 5.0] 4.0 5.0 2983 3322 0.897953 -0.428315 0.026765
(5.0, inf] 6.0 20.0 204 231 0.883117 -0.580716 0.001830
badattribute
Bucket
(-inf, 0.0] 0.464680
(0.0, 1.0] 0.218224
(1.0, 2.0] 0.177632
(2.0, 3.0] 0.095117
(3.0, 5.0] 0.041076
(5.0, inf] 0.003272
# Bar chart of the information value of each of the ten candidate variables.
ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(1, 1, 1)
# Bar positions 1..10, one per variable.
x = np.arange(len(index)) + 1
ax1.bar(x, ivlist, width=0.4)
ax1.set_xticks(x)
ax1.set_xticklabels(index, rotation=0, fontsize=12)
ax1.set_ylabel('IV(Information Value)', fontsize=14)
# Write each IV value just above its bar (the loop body must be indented).
for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
plt.show()
data. columns
Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfDependents'],
dtype='object')
小结