kaggle提交结果: 我的private score 0.86699, public score 0.86101, 榜单第一名private score 0.86955, public score 0.86390.
Importing libraries
import pandas as pd
import numpy as np
import os
from sklearn. ensemble import RandomForestClassifier
from sklearn. ensemble import GradientBoostingClassifier
import imblearn
from imblearn. over_sampling import SMOTE
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LogisticRegression
from sklearn. model_selection import GridSearchCV
from sklearn. model_selection import cross_val_score
import matplotlib. pyplot as plt
import seaborn as sns
import scikitplot as skplt
import scipy
from sklearn. model_selection import GridSearchCV, RandomizedSearchCV
from sklearn. metrics import roc_curve, roc_auc_score, confusion_matrix as cm, auc, roc_curve
Reading the data
# Load the training set, the unlabeled test set and the sample submission.
# In every file the first CSV column is used as the row index.
data, test_data, sample_data = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('cs-training.csv', 'cs-test.csv', 'sampleEntry.csv')
)
EDA
# Preview the first rows of the training frame (target column plus the ten features).
data. head( )
SeriousDlqin2yrs
RevolvingUtilizationOfUnsecuredLines
age
NumberOfTime30-59DaysPastDueNotWorse
DebtRatio
MonthlyIncome
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberRealEstateLoansOrLines
NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents
1
1
0.766127
45
2
0.802982
9120.0
13
0
6
0
2.0
2
0
0.957151
40
0
0.121876
2600.0
4
0
0
0
1.0
3
0
0.658180
38
1
0.085113
3042.0
2
1
0
0
0.0
4
0
0.233810
30
0
0.036050
3300.0
5
0
0
0
0.0
5
0
0.907239
49
1
0.024926
63588.0
7
0
1
0
0.0
# Preview the first rows of the test frame; its SeriousDlqin2yrs column is empty (NaN).
test_data. head( )
SeriousDlqin2yrs
RevolvingUtilizationOfUnsecuredLines
age
NumberOfTime30-59DaysPastDueNotWorse
DebtRatio
MonthlyIncome
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberRealEstateLoansOrLines
NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents
1
NaN
0.885519
43
0
0.177513
5700.0
4
0
0
0
0.0
2
NaN
0.463295
57
0
0.527237
9141.0
15
0
4
0
2.0
3
NaN
0.043275
59
0
0.687648
5083.0
12
0
1
0
2.0
4
NaN
0.280308
38
1
0.925961
3200.0
7
0
2
0
0.0
5
NaN
1.000000
27
0
0.019917
3865.0
4
0
0
0
1.0
# Summary statistics per column with extra tail percentiles (1%, 10%, 90%, 99%)
# to make outliers visible; transposed so that each feature is one row.
data.describe(percentiles=[0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99]).transpose()
count
mean
std
min
1%
10%
25%
50%
75%
90%
99%
max
SeriousDlqin2yrs
150000.0
0.066840
0.249746
0.0
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
1.000000
1.0
RevolvingUtilizationOfUnsecuredLines
150000.0
6.048438
249.755371
0.0
0.0
0.002969
0.029867
0.154181
0.559046
0.981278
1.092956
50708.0
age
150000.0
52.295207
14.771866
0.0
24.0
33.000000
41.000000
52.000000
63.000000
72.000000
87.000000
109.0
NumberOfTime30-59DaysPastDueNotWorse
150000.0
0.421033
4.192781
0.0
0.0
0.000000
0.000000
0.000000
0.000000
1.000000
4.000000
98.0
DebtRatio
150000.0
353.005076
2037.818523
0.0
0.0
0.030874
0.175074
0.366508
0.868254
1267.000000
4979.040000
329664.0
MonthlyIncome
120269.0
6670.221237
14384.674215
0.0
0.0
2005.000000
3400.000000
5400.000000
8249.000000
11666.000000
25000.000000
3008750.0
NumberOfOpenCreditLinesAndLoans
150000.0
8.452760
5.145951
0.0
0.0
3.000000
5.000000
8.000000
11.000000
15.000000
24.000000
58.0
NumberOfTimes90DaysLate
150000.0
0.265973
4.169304
0.0
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
3.000000
98.0
NumberRealEstateLoansOrLines
150000.0
1.018240
1.129771
0.0
0.0
0.000000
0.000000
1.000000
2.000000
2.000000
4.000000
54.0
NumberOfTime60-89DaysPastDueNotWorse
150000.0
0.240387
4.155179
0.0
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
2.000000
98.0
NumberOfDependents
146076.0
0.757222
1.115086
0.0
0.0
0.000000
0.000000
0.000000
1.000000
2.000000
4.000000
20.0
# Same percentile summary for the test frame, one feature per row, so its
# distributions can be compared with the training frame.
test_data.describe(percentiles=[0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99]).transpose()
count
mean
std
min
1%
10%
25%
50%
75%
90%
99%
max
SeriousDlqin2yrs
0.0
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
RevolvingUtilizationOfUnsecuredLines
101503.0
5.310000
196.156039
0.0
0.0
0.003008
0.030131
0.152586
0.564225
0.983342
1.08869
21821.0
age
101503.0
52.405436
14.779756
21.0
24.0
33.000000
41.000000
52.000000
63.000000
72.000000
87.00000
104.0
NumberOfTime30-59DaysPastDueNotWorse
101503.0
0.453770
4.538487
0.0
0.0
0.000000
0.000000
0.000000
0.000000
1.000000
4.00000
98.0
DebtRatio
101503.0
344.475020
1632.595231
0.0
0.0
0.030058
0.173423
0.364260
0.851619
1238.800000
4963.00000
268326.0
MonthlyIncome
81400.0
6855.035590
36508.600375
0.0
0.0
2083.000000
3408.000000
5400.000000
8200.000000
11500.000000
25916.01000
7727000.0
NumberOfOpenCreditLinesAndLoans
101503.0
8.453514
5.144100
0.0
0.0
3.000000
5.000000
8.000000
11.000000
15.000000
25.00000
85.0
NumberOfTimes90DaysLate
101503.0
0.296691
4.515859
0.0
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
3.00000
98.0
NumberRealEstateLoansOrLines
101503.0
1.013074
1.110253
0.0
0.0
0.000000
0.000000
1.000000
2.000000
2.000000
4.00000
37.0
NumberOfTime60-89DaysPastDueNotWorse
101503.0
0.270317
4.503578
0.0
0.0
0.000000
0.000000
0.000000
0.000000
0.000000
2.00000
98.0
NumberOfDependents
98877.0
0.769046
1.136778
0.0
0.0
0.000000
0.000000
0.000000
1.000000
2.000000
4.00000
43.0
# (rows, columns) of the training frame.
data. shape
(150000, 11)
# (rows, columns) of the test frame.
test_data. shape
(101503, 11)
# Column dtypes and non-null counts for the training frame (reveals which columns have missing values).
data. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs 150000 non-null int64
RevolvingUtilizationOfUnsecuredLines 150000 non-null float64
age 150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64
DebtRatio 150000 non-null float64
MonthlyIncome 120269 non-null float64
NumberOfOpenCreditLinesAndLoans 150000 non-null int64
NumberOfTimes90DaysLate 150000 non-null int64
NumberRealEstateLoansOrLines 150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64
NumberOfDependents 146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
# Column dtypes and non-null counts for the test frame.
test_data. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 101503 entries, 1 to 101503
Data columns (total 11 columns):
SeriousDlqin2yrs 0 non-null float64
RevolvingUtilizationOfUnsecuredLines 101503 non-null float64
age 101503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 101503 non-null int64
DebtRatio 101503 non-null float64
MonthlyIncome 81400 non-null float64
NumberOfOpenCreditLinesAndLoans 101503 non-null int64
NumberOfTimes90DaysLate 101503 non-null int64
NumberRealEstateLoansOrLines 101503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 101503 non-null int64
NumberOfDependents 98877 non-null float64
dtypes: float64(5), int64(6)
memory usage: 9.3 MB
drop_duplicates
# Remove fully duplicated rows from the training frame in place.
data.drop_duplicates(inplace=True)
# Report the frame while it still carries the original (now gappy) index.
data.info()
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
data.reset_index(drop=True, inplace=True)
# Report again to confirm the contiguous index.
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 149391 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs 149391 non-null int64
RevolvingUtilizationOfUnsecuredLines 149391 non-null float64
age 149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 149391 non-null int64
DebtRatio 149391 non-null float64
MonthlyIncome 120170 non-null float64
NumberOfOpenCreditLinesAndLoans 149391 non-null int64
NumberOfTimes90DaysLate 149391 non-null int64
NumberRealEstateLoansOrLines 149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 149391 non-null int64
NumberOfDependents 145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149391 entries, 0 to 149390
Data columns (total 11 columns):
SeriousDlqin2yrs 149391 non-null int64
RevolvingUtilizationOfUnsecuredLines 149391 non-null float64
age 149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 149391 non-null int64
DebtRatio 149391 non-null float64
MonthlyIncome 120170 non-null float64
NumberOfOpenCreditLinesAndLoans 149391 non-null int64
NumberOfTimes90DaysLate 149391 non-null int64
NumberRealEstateLoansOrLines 149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 149391 non-null int64
NumberOfDependents 145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.5 MB
Checking Null values
# Fraction of missing values in each training column.
data.isna().mean()
SeriousDlqin2yrs 0.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198207
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.026160
dtype: float64
# Number of distinct values per training column.
data. nunique( )
SeriousDlqin2yrs 2
RevolvingUtilizationOfUnsecuredLines 125728
age 86
NumberOfTime30-59DaysPastDueNotWorse 16
DebtRatio 114194
MonthlyIncome 13594
NumberOfOpenCreditLinesAndLoans 58
NumberOfTimes90DaysLate 19
NumberRealEstateLoansOrLines 28
NumberOfTime60-89DaysPastDueNotWorse 13
NumberOfDependents 13
dtype: int64
# Fraction of missing values in each test column.
test_data.isna().mean()
SeriousDlqin2yrs 1.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198053
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.025871
dtype: float64
# Impute missing NumberOfDependents in the training frame with the column's
# most frequent value (cast to int).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection: the chained in-place form acts on
# a temporary object and does not update the frame under pandas Copy-on-Write.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
    int(data['NumberOfDependents'].mode()[0])
)
# Re-check the missing-value ratios after imputation.
data.isnull().mean()
SeriousDlqin2yrs 0.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198207
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.000000
dtype: float64
# Impute missing NumberOfDependents in the test frame with that frame's own
# most frequent value (cast to int).
# NOTE(review): the statistic is taken from the test frame itself rather than
# from the training frame; kept as-is to preserve the original results.
# Assigned back to the column because chained fillna(..., inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(
    int(test_data['NumberOfDependents'].mode()[0])
)
# Re-check the missing-value ratios after imputation.
test_data.isnull().mean()
SeriousDlqin2yrs 1.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198053
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.000000
dtype: float64
# Impute missing MonthlyIncome in the training frame with the column mean,
# truncated to an integer.
# Assigned back to the column because chained fillna(..., inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(
    int(data['MonthlyIncome'].mean())
)
# Re-check the missing-value ratios after imputation.
data.isnull().mean()
SeriousDlqin2yrs 0.0
RevolvingUtilizationOfUnsecuredLines 0.0
age 0.0
NumberOfTime30-59DaysPastDueNotWorse 0.0
DebtRatio 0.0
MonthlyIncome 0.0
NumberOfOpenCreditLinesAndLoans 0.0
NumberOfTimes90DaysLate 0.0
NumberRealEstateLoansOrLines 0.0
NumberOfTime60-89DaysPastDueNotWorse 0.0
NumberOfDependents 0.0
dtype: float64
# Impute missing MonthlyIncome in the test frame with that frame's own column
# mean, truncated to an integer.
# NOTE(review): the mean is taken from the test frame itself rather than from
# the training frame; kept as-is to preserve the original results.
# Assigned back to the column because chained fillna(..., inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].fillna(
    int(test_data['MonthlyIncome'].mean())
)
# Re-check the missing-value ratios after imputation.
test_data.isnull().mean()
SeriousDlqin2yrs 1.0
RevolvingUtilizationOfUnsecuredLines 0.0
age 0.0
NumberOfTime30-59DaysPastDueNotWorse 0.0
DebtRatio 0.0
MonthlyIncome 0.0
NumberOfOpenCreditLinesAndLoans 0.0
NumberOfTimes90DaysLate 0.0
NumberRealEstateLoansOrLines 0.0
NumberOfTime60-89DaysPastDueNotWorse 0.0
NumberOfDependents 0.0
dtype: float64
Visualization
# Class balance of the target: one bar per SeriousDlqin2yrs value.
sns.countplot(data=data, x='SeriousDlqin2yrs')
plt.show()

# Pairwise correlation of all columns, drawn as an annotated heatmap on a
# 10x10 inch figure with values shown to three decimals.
f = plt.figure(figsize=(10, 10))
ax = f.add_subplot(1, 1, 1)
sns.heatmap(
    data.corr(),
    ax=ax,
    annot=True,
    fmt='.3f',
    linewidths=.5,
)
plt.show()
Unusual values
# Treat age == 0 as an invalid entry and replace it with the truncated mean
# age of the same frame.
data.loc[data['age'] == 0, 'age'] = int(data['age'].mean())
test_data.loc[test_data['age'] == 0, 'age'] = int(test_data['age'].mean())

# Features are every column except the target; the target is SeriousDlqin2yrs.
X = data.loc[:, data.columns != 'SeriousDlqin2yrs']
y = data.loc[:, 'SeriousDlqin2yrs']

# Hold out 20% of the labeled data for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=116214)

# Oversample the training split only, so the held-out split keeps the original
# class distribution. fit_resample is the supported imbalanced-learn API; the
# older fit_sample alias was deprecated and later removed.
smote = SMOTE(random_state=0)
s_X, s_y = smote.fit_resample(X_train, y_train)
LogisticRegression model
# Baseline: logistic regression fitted on the original (non-resampled) training split.
LR = LogisticRegression()
LR.fit(X_train, y_train)

# Mean accuracy on the training split and on the held-out split.
print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR.score(X_train, y_train), LR.score(X_test, y_test)))

# The training-side AUC is scored on (X_train, y_train), the data this model
# was actually fitted on. It was previously scored on the SMOTE-resampled
# (s_X, s_y), which LR never saw, so the "training" figure did not describe
# this model's training data. Any output recorded before this change reflects
# the resampled set.

# AUC computed from hard 0/1 predictions.
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(y_train, LR.predict(X_train)),
                                                roc_auc_score(y_test, LR.predict(X_test))))
# AUC computed from the predicted probability of the positive class (column 1).
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(y_train, LR.predict_proba(X_train)[:, 1]),
                                                      roc_auc_score(y_test, LR.predict_proba(X_test)[:, 1])))
训练集精确度:0.933075,测试集精确度:0.9334666666666667
predict 训练集auc:0.5064304086881967,测试集auc:0.5078828573129228
predict_proba 训练集auc:0.6839255109393088,测试集auc:0.6871543091031289
# Logistic regression fitted on the SMOTE-resampled training data.
LR_s = LogisticRegression()
LR_s.fit(s_X, s_y)

# Mean accuracy on the resampled training data and on the held-out split.
_acc_train = LR_s.score(s_X, s_y)
_acc_test = LR_s.score(X_test, y_test)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(_acc_train, _acc_test))

# AUC computed from hard 0/1 predictions.
_auc_label_train = roc_auc_score(s_y, LR_s.predict(s_X))
_auc_label_test = roc_auc_score(y_test, LR_s.predict(X_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(_auc_label_train, _auc_label_test))

# AUC computed from the predicted probability of the positive class (column 1).
_auc_proba_train = roc_auc_score(s_y, LR_s.predict_proba(s_X)[:, 1])
_auc_proba_test = roc_auc_score(y_test, LR_s.predict_proba(X_test)[:, 1])
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(_auc_proba_train, _auc_proba_test))

# ROC curves on the held-out split, without the micro/macro averaged curves.
skplt.metrics.plot_roc(
    y_test,
    pd.DataFrame(LR_s.predict_proba(X_test)),
    plot_micro=False,
    plot_macro=False,
    figsize=(6, 6),
)
训练集精确度:0.6768451700485854,测试集精确度:0.8703333333333333
predict 训练集auc:0.6768451700485852,测试集auc:0.7351483859562692
predict_proba 训练集auc:0.7682496553604893,测试集auc:0.8034115289523501
LR_r = LogisticRegression(