kaggle提交结果: 我的private score 0.86699, public score 0.86101, 榜单第一名private score 0.86955, public score 0.86390.
Importing libraries
# Standard data-handling, resampling and modelling imports.
# (Deduplicated: GridSearchCV and roc_curve were each imported twice.)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import imblearn
import scikitplot as skplt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix as cm, roc_auc_score, roc_curve
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
Reading the data
# Load the Kaggle "Give Me Some Credit" files; the first CSV column is the row Id
# and becomes the index (index_col=0).
data = pd.read_csv('cs-training.csv', index_col=0)
test_data = pd.read_csv('cs-test.csv', index_col=0)
sample_data = pd.read_csv('sampleEntry.csv', index_col=0)
EDA
data.head()  # preview the first five training rows
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
test_data.head()  # preview the first five test rows (target is all NaN)
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents 1 NaN 0.885519 43 0 0.177513 5700.0 4 0 0 0 0.0 2 NaN 0.463295 57 0 0.527237 9141.0 15 0 4 0 2.0 3 NaN 0.043275 59 0 0.687648 5083.0 12 0 1 0 2.0 4 NaN 0.280308 38 1 0.925961 3200.0 7 0 2 0 0.0 5 NaN 1.000000 27 0 0.019917 3865.0 4 0 0 0 1.0
# Percentile summary of the training set, transposed to one row per feature.
data.describe([0.01, 0.1, 0.25, .5, .75, .9, .99]).T
count mean std min 1% 10% 25% 50% 75% 90% 99% max SeriousDlqin2yrs 150000.0 0.066840 0.249746 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.0 RevolvingUtilizationOfUnsecuredLines 150000.0 6.048438 249.755371 0.0 0.0 0.002969 0.029867 0.154181 0.559046 0.981278 1.092956 50708.0 age 150000.0 52.295207 14.771866 0.0 24.0 33.000000 41.000000 52.000000 63.000000 72.000000 87.000000 109.0 NumberOfTime30-59DaysPastDueNotWorse 150000.0 0.421033 4.192781 0.0 0.0 0.000000 0.000000 0.000000 0.000000 1.000000 4.000000 98.0 DebtRatio 150000.0 353.005076 2037.818523 0.0 0.0 0.030874 0.175074 0.366508 0.868254 1267.000000 4979.040000 329664.0 MonthlyIncome 120269.0 6670.221237 14384.674215 0.0 0.0 2005.000000 3400.000000 5400.000000 8249.000000 11666.000000 25000.000000 3008750.0 NumberOfOpenCreditLinesAndLoans 150000.0 8.452760 5.145951 0.0 0.0 3.000000 5.000000 8.000000 11.000000 15.000000 24.000000 58.0 NumberOfTimes90DaysLate 150000.0 0.265973 4.169304 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000 98.0 NumberRealEstateLoansOrLines 150000.0 1.018240 1.129771 0.0 0.0 0.000000 0.000000 1.000000 2.000000 2.000000 4.000000 54.0 NumberOfTime60-89DaysPastDueNotWorse 150000.0 0.240387 4.155179 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 98.0 NumberOfDependents 146076.0 0.757222 1.115086 0.0 0.0 0.000000 0.000000 0.000000 1.000000 2.000000 4.000000 20.0
# Percentile summary of the test set, transposed to one row per feature.
test_data.describe([0.01, 0.1, 0.25, .5, .75, .9, .99]).T
count mean std min 1% 10% 25% 50% 75% 90% 99% max SeriousDlqin2yrs 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN RevolvingUtilizationOfUnsecuredLines 101503.0 5.310000 196.156039 0.0 0.0 0.003008 0.030131 0.152586 0.564225 0.983342 1.08869 21821.0 age 101503.0 52.405436 14.779756 21.0 24.0 33.000000 41.000000 52.000000 63.000000 72.000000 87.00000 104.0 NumberOfTime30-59DaysPastDueNotWorse 101503.0 0.453770 4.538487 0.0 0.0 0.000000 0.000000 0.000000 0.000000 1.000000 4.00000 98.0 DebtRatio 101503.0 344.475020 1632.595231 0.0 0.0 0.030058 0.173423 0.364260 0.851619 1238.800000 4963.00000 268326.0 MonthlyIncome 81400.0 6855.035590 36508.600375 0.0 0.0 2083.000000 3408.000000 5400.000000 8200.000000 11500.000000 25916.01000 7727000.0 NumberOfOpenCreditLinesAndLoans 101503.0 8.453514 5.144100 0.0 0.0 3.000000 5.000000 8.000000 11.000000 15.000000 25.00000 85.0 NumberOfTimes90DaysLate 101503.0 0.296691 4.515859 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 3.00000 98.0 NumberRealEstateLoansOrLines 101503.0 1.013074 1.110253 0.0 0.0 0.000000 0.000000 1.000000 2.000000 2.000000 4.00000 37.0 NumberOfTime60-89DaysPastDueNotWorse 101503.0 0.270317 4.503578 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 2.00000 98.0 NumberOfDependents 98877.0 0.769046 1.136778 0.0 0.0 0.000000 0.000000 0.000000 1.000000 2.000000 4.00000 43.0
data.shape  # (rows, columns) of the training set
(150000, 11)
test_data.shape  # (rows, columns) of the test set
(101503, 11)
data.info()  # dtypes and non-null counts for the training set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs 150000 non-null int64
RevolvingUtilizationOfUnsecuredLines 150000 non-null float64
age 150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64
DebtRatio 150000 non-null float64
MonthlyIncome 120269 non-null float64
NumberOfOpenCreditLinesAndLoans 150000 non-null int64
NumberOfTimes90DaysLate 150000 non-null int64
NumberRealEstateLoansOrLines 150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64
NumberOfDependents 146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
test_data.info()  # dtypes and non-null counts for the test set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 101503 entries, 1 to 101503
Data columns (total 11 columns):
SeriousDlqin2yrs 0 non-null float64
RevolvingUtilizationOfUnsecuredLines 101503 non-null float64
age 101503 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 101503 non-null int64
DebtRatio 101503 non-null float64
MonthlyIncome 81400 non-null float64
NumberOfOpenCreditLinesAndLoans 101503 non-null int64
NumberOfTimes90DaysLate 101503 non-null int64
NumberRealEstateLoansOrLines 101503 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 101503 non-null int64
NumberOfDependents 98877 non-null float64
dtypes: float64(5), int64(6)
memory usage: 9.3 MB
drop_duplicates
# Drop exact duplicate rows, then rebuild a clean 0..n-1 index; info() before
# and after the reindex shows the index change (Int64Index -> RangeIndex).
data.drop_duplicates(inplace=True)
data.info()
data.index = range(data.shape[0])
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 149391 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs 149391 non-null int64
RevolvingUtilizationOfUnsecuredLines 149391 non-null float64
age 149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 149391 non-null int64
DebtRatio 149391 non-null float64
MonthlyIncome 120170 non-null float64
NumberOfOpenCreditLinesAndLoans 149391 non-null int64
NumberOfTimes90DaysLate 149391 non-null int64
NumberRealEstateLoansOrLines 149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 149391 non-null int64
NumberOfDependents 145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149391 entries, 0 to 149390
Data columns (total 11 columns):
SeriousDlqin2yrs 149391 non-null int64
RevolvingUtilizationOfUnsecuredLines 149391 non-null float64
age 149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse 149391 non-null int64
DebtRatio 149391 non-null float64
MonthlyIncome 120170 non-null float64
NumberOfOpenCreditLinesAndLoans 149391 non-null int64
NumberOfTimes90DaysLate 149391 non-null int64
NumberRealEstateLoansOrLines 149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse 149391 non-null int64
NumberOfDependents 145563 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.5 MB
Checking Null values
data.isnull().mean()  # fraction of missing values per column
SeriousDlqin2yrs 0.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198207
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.026160
dtype: float64
data.nunique()  # number of distinct values per column
SeriousDlqin2yrs 2
RevolvingUtilizationOfUnsecuredLines 125728
age 86
NumberOfTime30-59DaysPastDueNotWorse 16
DebtRatio 114194
MonthlyIncome 13594
NumberOfOpenCreditLinesAndLoans 58
NumberOfTimes90DaysLate 19
NumberRealEstateLoansOrLines 28
NumberOfTime60-89DaysPastDueNotWorse 13
NumberOfDependents 13
dtype: int64
test_data.isnull().mean()  # fraction of missing values per column
SeriousDlqin2yrs 1.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198053
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.025871
dtype: float64
# Impute missing NumberOfDependents with the column mode (most common value).
# Assign the result back instead of chained fillna(..., inplace=True): the
# chained form is deprecated and silently does nothing under pandas
# copy-on-write semantics.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
    int(data['NumberOfDependents'].mode()[0]))
data.isnull().mean()
SeriousDlqin2yrs 0.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198207
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.000000
dtype: float64
# Impute missing NumberOfDependents in the test set with its mode; assignment
# instead of chained inplace fillna (deprecated, no-op under copy-on-write).
test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(
    int(test_data['NumberOfDependents'].mode()[0]))
test_data.isnull().mean()
SeriousDlqin2yrs 1.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.198053
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.000000
dtype: float64
# Impute missing MonthlyIncome with the (truncated) column mean; assignment
# instead of chained inplace fillna (deprecated, no-op under copy-on-write).
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(
    int(data['MonthlyIncome'].mean()))
data.isnull().mean()
SeriousDlqin2yrs 0.0
RevolvingUtilizationOfUnsecuredLines 0.0
age 0.0
NumberOfTime30-59DaysPastDueNotWorse 0.0
DebtRatio 0.0
MonthlyIncome 0.0
NumberOfOpenCreditLinesAndLoans 0.0
NumberOfTimes90DaysLate 0.0
NumberRealEstateLoansOrLines 0.0
NumberOfTime60-89DaysPastDueNotWorse 0.0
NumberOfDependents 0.0
dtype: float64
# Impute missing MonthlyIncome in the test set with its (truncated) mean;
# assignment instead of chained inplace fillna (deprecated, no-op under CoW).
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].fillna(
    int(test_data['MonthlyIncome'].mean()))
test_data.isnull().mean()
SeriousDlqin2yrs 1.0
RevolvingUtilizationOfUnsecuredLines 0.0
age 0.0
NumberOfTime30-59DaysPastDueNotWorse 0.0
DebtRatio 0.0
MonthlyIncome 0.0
NumberOfOpenCreditLinesAndLoans 0.0
NumberOfTimes90DaysLate 0.0
NumberRealEstateLoansOrLines 0.0
NumberOfTime60-89DaysPastDueNotWorse 0.0
NumberOfDependents 0.0
dtype: float64
Visualization
# Class balance of the target — the positive (delinquent) class is rare.
sns.countplot(x='SeriousDlqin2yrs', data=data)
plt.show()
# Pairwise correlation heat map over all columns.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data.corr(), annot=True, linewidths=.5, fmt='.3f', ax=ax)
plt.show()
Unusual values
# age == 0 is not a plausible value; replace it with the (truncated) mean age.
data.loc[data['age'] == 0, 'age'] = int(data['age'].mean())
test_data.loc[test_data['age'] == 0, 'age'] = int(test_data['age'].mean())

# Split features/target and hold out 20% for evaluation, then oversample the
# minority class with SMOTE on the training part only (the held-out split
# must never be resampled).
X = data.loc[:, data.columns != 'SeriousDlqin2yrs']
y = data.loc[:, 'SeriousDlqin2yrs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=116214)
smote = SMOTE(random_state=0)
# fit_sample() was removed in imbalanced-learn 0.6; fit_resample is the API.
s_X, s_y = smote.fit_resample(X_train, y_train)
LogisticRegression model
# Baseline logistic regression trained on the raw (imbalanced) training split.
LR = LogisticRegression()
LR.fit(X_train, y_train)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR.score(X_train, y_train), LR.score(X_test, y_test)))
# Hard-label AUC vs probability AUC — the latter is what the leaderboard uses.
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, LR.predict(s_X)),
    roc_auc_score(y_test, LR.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, LR.predict_proba(s_X)[:, 1]),
    roc_auc_score(y_test, LR.predict_proba(X_test)[:, 1])))
训练集精确度:0.933075,测试集精确度:0.9334666666666667
predict 训练集auc:0.5064304086881967,测试集auc:0.5078828573129228
predict_proba 训练集auc:0.6839255109393088,测试集auc:0.6871543091031289
# Logistic regression trained on the SMOTE-balanced data.
LR_s = LogisticRegression()
LR_s.fit(s_X, s_y)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR_s.score(s_X, s_y), LR_s.score(X_test, y_test)))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, LR_s.predict(s_X)),
    roc_auc_score(y_test, LR_s.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, LR_s.predict_proba(s_X)[:, 1]),
    roc_auc_score(y_test, LR_s.predict_proba(X_test)[:, 1])))
# ROC curves for the SMOTE-trained model on the held-out split.
skplt.metrics.plot_roc(y_test, pd.DataFrame(LR_s.predict_proba(X_test)), plot_micro=False, figsize=(6, 6), plot_macro=False)
训练集精确度:0.6768451700485854,测试集精确度:0.8703333333333333
predict 训练集auc:0.6768451700485852,测试集auc:0.7351483859562692
predict_proba 训练集auc:0.7682496553604893,测试集auc:0.8034115289523501
# Randomised hyper-parameter search for logistic regression on the SMOTE data.
LR_r = LogisticRegression()
param_dist = {
    # NOTE(review): 'l1' is only supported by the liblinear/saga solvers;
    # incompatible sampled combinations will fail their fit.
    'penalty': ['l1', 'l2'],
    # C must be strictly positive — C=0 makes LogisticRegression raise and
    # aborts the search, so 0 is excluded from the grid.
    'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': range(1, 300, 50),
}
LR_random = RandomizedSearchCV(LR_r, param_distributions=param_dist, cv=5, n_jobs=-1)
LR_random.fit(s_X, s_y)
best_est_LR = LR_random.best_estimator_
print('最优参数:{0}\n'.format(best_est_LR))
print('训练集精确度:{0},测试集精确度:{1}\n'.format(LR_random.score(s_X, s_y), LR_random.score(X_test, y_test)))
y_pred_train_sr = LR_random.predict(s_X)
y_pred_sr = LR_random.predict(X_test)
print('训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(s_y, y_pred_train_sr), roc_auc_score(y_test, y_pred_sr)))
print('训练集混淆矩阵:\n{0},\n\n测试集混淆矩阵:\n{1}'.format(cm(s_y, y_pred_train_sr, labels=[1, 0]),
                                          cm(y_test, y_pred_sr, labels=[1, 0])))
最优参数:LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=251,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
训练集精确度:0.6722322449271221,测试集精确度:0.722
训练集auc:0.672232244927122,测试集auc:0.6570985182928633
训练集混淆矩阵:
[[68407 43561]
[29838 82130]],
测试集混淆矩阵:
[[ 1161 833]
[ 7507 20499]]
# Compare solvers by held-out AUC (models trained on the SMOTE data).
auc_score = []
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
for solver in solvers:
    model = LogisticRegression(solver=solver)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(max(auc_score))
plt.plot(solvers, auc_score)
plt.show()
0.8034115289523501
# Sweep regularisation strength C (coarse) by held-out AUC.
auc_score = []
c_values = [0.1, 0.5, 1]
for c in c_values:
    model = LogisticRegression(solver='lbfgs', C=c)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(max(auc_score))
plt.plot(c_values, auc_score)
plt.show()
0.8072174013291749
# Sweep regularisation strength C (finer, smaller values) by held-out AUC.
auc_score = []
c_values = [0.01, 0.05, 0.1]
for c in c_values:
    model = LogisticRegression(solver='lbfgs', C=c)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(max(auc_score))
plt.plot(c_values, auc_score)
plt.show()
0.8072174013291749
# Sweep max_iter (coarse) at the chosen C by held-out AUC.
auc_score = []
iters = [50, 100, 150, 200]
for n_iter in iters:
    model = LogisticRegression(solver='lbfgs', C=0.1, max_iter=n_iter)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(max(auc_score))
plt.plot(iters, auc_score)
plt.show()
0.8072174013291749
# Sweep max_iter (fine) and report the best value with its AUC.
auc_score = []
iters = list(range(30, 120, 10))
for n_iter in iters:
    model = LogisticRegression(solver='lbfgs', C=0.1, max_iter=n_iter)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(iters[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(iters, auc_score)
plt.show()
80 0.807376433735972
# Final LR model with the tuned hyper-parameters, fitted on the SMOTE data,
# then scored on the Kaggle test features.
best_est_LR = LogisticRegression(solver='lbfgs', C=0.1, max_iter=80).fit(s_X, s_y)
df_test = test_data.drop('SeriousDlqin2yrs', axis=1)
sample_data["Probability"] = best_est_LR.predict_proba(df_test)[:, 1]
print(sample_data.head())
# sample_data was read with index_col=0, so the Id column lives in the index.
# index=False would drop it and produce an invalid submission; keep the index
# (consistent with the RFC submission below).
sample_data.to_csv("submission_LR.csv", index=True)
Probability
Id
1 0.403650
2 0.392559
3 0.419110
4 0.487821
5 0.472049
Random Forest model
# Baseline random forest on the raw (imbalanced) training split; the gap
# between train and test scores in the recorded output shows overfitting.
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(RFC.score(X_train, y_train), RFC.score(X_test, y_test)))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(y_train, RFC.predict(X_train)),
    roc_auc_score(y_test, RFC.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(y_train, RFC.predict_proba(X_train)[:, 1]),
    roc_auc_score(y_test, RFC.predict_proba(X_test)[:, 1])))
训练集精确度:0.9995916666666667,测试集精确度:0.9359
predict 训练集auc:0.997585340973915,测试集auc:0.5895362836348795
predict_proba 训练集auc:0.9999771540267908,测试集auc:0.8437090354115978
# Random forest trained on the SMOTE-balanced data.
RFC_s = RandomForestClassifier()
RFC_s.fit(s_X, s_y)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(RFC_s.score(s_X, s_y), RFC_s.score(X_test, y_test)))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, RFC_s.predict(s_X)),
    roc_auc_score(y_test, RFC_s.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, RFC_s.predict_proba(s_X)[:, 1]),
    roc_auc_score(y_test, RFC_s.predict_proba(X_test)[:, 1])))
训练集精确度:0.9993390968848243,测试集精确度:0.8908333333333334
predict 训练集auc:0.9993390968848243,测试集auc:0.6839441591216555
predict_proba 训练集auc:0.9999081457017259,测试集auc:0.8237175910363382
# Randomised hyper-parameter search for the random forest on the SMOTE data.
RFC_r = RandomForestClassifier()
param_dist = {
    'n_estimators': [10, 20, 30, 40, 50, 100, 150, 200, 300],
    'max_depth': range(2, 10, 1),
    'min_samples_leaf': range(2, 10, 1),
}
RFC_random = RandomizedSearchCV(RFC_r, param_distributions=param_dist, cv=5, n_jobs=-1)
RFC_random.fit(s_X, s_y)
best_est_RFC = RFC_random.best_estimator_
print('最优参数:{0}\n'.format(best_est_RFC))
print('训练集精确度:{0},测试集精确度:{1}\n'.format(RFC_random.score(s_X, s_y), RFC_random.score(X_test, y_test)))
y_pred_train_sr = RFC_random.predict(s_X)
y_pred_sr = RFC_random.predict(X_test)
# sklearn.metrics.auc expects curve coordinates (x, y), not labels and
# predictions — calling it that way raises/garbage. roc_auc_score is the
# correct metric here, as used in every other cell.
print('训练集auc:{0},测试集auc:{1}\n'.format(roc_auc_score(s_y, y_pred_train_sr), roc_auc_score(y_test, y_pred_sr)))
print('训练集混淆矩阵:\n{0},\n\n测试集混淆矩阵:\n{1}'.format(cm(s_y, y_pred_train_sr, labels=[1, 0]),
                                          cm(y_test, y_pred_sr, labels=[1, 0])))
最优参数:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=9, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=3, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
训练集精确度:0.8192925526193122,测试集精确度:0.7796780347401184
训练集auc:0.8192925526193122,测试集auc:0.7544857947353516
训练集混淆矩阵:
[[94898 16658]
[23660 87896]],
测试集混淆矩阵:
[[ 1489 564]
[ 6019 21807]]
# Sweep n_estimators (coarse) by held-out AUC, trained on the raw split.
auc_score = []
grid = list(range(50, 350, 50))
for n in grid:
    # NOTE(review): bootstrap expects a bool; 0.7 is merely truthy (acts as True).
    model = RandomForestClassifier(n_estimators=n, max_depth=5, min_samples_split=200,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
250 0.8640378842017733
# Same n_estimators sweep but trained on the SMOTE data — AUC is lower, so the
# remaining RF tuning uses the raw split.
auc_score = []
grid = list(range(50, 350, 50))
for n in grid:
    model = RandomForestClassifier(n_estimators=n, max_depth=5, min_samples_split=200,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(s_X, s_y)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
250 0.8389906830396209
# Refine n_estimators around the coarse optimum (250).
auc_score = []
grid = list(range(200, 300, 10))
for n in grid:
    model = RandomForestClassifier(n_estimators=n, max_depth=5, min_samples_split=200,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
230 0.8640656490645973
# Sweep max_depth (2..9) with n_estimators fixed at 230.
auc_score = []
grid = list(range(2, 10, 1))
for depth in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=depth, min_samples_split=200,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
9 0.8677676427124693
# Extend the max_depth sweep upward (9..14) since 9 was best so far.
auc_score = []
grid = list(range(9, 15, 1))
for depth in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=depth, min_samples_split=200,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
12 0.8684788028299709
# Sweep min_samples_split (coarse) at max_depth=12.
auc_score = []
grid = list(range(50, 350, 50))
for split in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=split,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
50 0.8684788028299709
# Sweep min_samples_split over smaller values (10..50).
auc_score = []
grid = list(range(10, 55, 10))
for split in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=split,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
10 0.8684788028299709
# Sweep min_samples_split at its smallest values (2..10).
auc_score = []
grid = list(range(2, 11, 1))
for split in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=split,
                                   min_samples_leaf=100, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
2 0.8684788028299709
# Sweep min_samples_leaf (coarse) with min_samples_split fixed at 2.
auc_score = []
grid = list(range(10, 110, 10))
for leaf in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=leaf, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
40 0.8687270122873082
# Refine min_samples_leaf around 40 (36..44, step 2).
auc_score = []
grid = list(range(36, 46, 2))
for leaf in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=leaf, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
36 0.8691479440821931
# Widen the min_samples_leaf window (30..48, step 2) to confirm 36 is best.
auc_score = []
grid = list(range(30, 50, 2))
for leaf in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=leaf, max_features='sqrt',
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
36 0.8691479440821931
# Sweep max_features as a fraction of the feature count (0.1..1.0).
auc_score = []
grid = list(range(1, 11, 1))
for tenth in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=36, max_features=tenth / 10,
                                   bootstrap=0.7, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
3 0.8691479440821931
# Sweep "bootstrap" over 1..10.
# NOTE(review): bootstrap is a bool — every integer 1..10 is truthy, so all
# ten fits are identical; this sweep cannot distinguish anything (the flat
# recorded curve confirms it).
auc_score = []
grid = list(range(1, 11, 1))
for value in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=36, max_features=0.3,
                                   bootstrap=value, n_jobs=-1, random_state=10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
1 0.8691479440821931
# Sweep the positive-class weight (1, 3, ..., 13).
auc_score = []
grid = list(range(1, 15, 2))
for weight in grid:
    model = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                   min_samples_leaf=36, max_features=0.3,
                                   bootstrap=0.1, n_jobs=-1, random_state=10,
                                   class_weight={1: weight})
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
1 0.8691479440821931
# Final random forest with the tuned hyper-parameters, trained on the raw split.
best_est_RFC = RandomForestClassifier(n_estimators=230, max_depth=12, min_samples_split=2,
                                      min_samples_leaf=36, max_features=0.3,
                                      bootstrap=0.1, n_jobs=-1, random_state=10)
best_est_RFC.fit(X_train, y_train)
print(roc_auc_score(y_test, best_est_RFC.predict_proba(X_test)[:, 1]))
0.8691479440821931
# Score the Kaggle test set with the final forest and write the submission
# (the Id lives in the index, hence index=True).
df_test = test_data.drop('SeriousDlqin2yrs', axis=1)
sample_data["Probability"] = best_est_RFC.predict_proba(df_test)[:, 1]
print(sample_data.head())
sample_data.to_csv("submission_RFC.csv", index=True)
Probability
Id
1 0.079361
2 0.036505
3 0.016210
4 0.064601
5 0.126260
Gradient Boosting model
# Baseline gradient boosting on the raw (imbalanced) training split.
GB = GradientBoostingClassifier()
GB.fit(X_train, y_train)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(GB.score(X_train, y_train), GB.score(X_test, y_test)))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(y_train, GB.predict(X_train)),
    roc_auc_score(y_test, GB.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(y_train, GB.predict_proba(X_train)[:, 1]),
    roc_auc_score(y_test, GB.predict_proba(X_test)[:, 1])))
训练集精确度:0.9389916666666667,测试集精确度:0.9377333333333333
predict 训练集auc:0.6017569654221069,测试集auc:0.5916827107760473
predict_proba 训练集auc:0.8685441072547121,测试集auc:0.8677365023729332
# Gradient boosting trained on the SMOTE-balanced data.
GB_s = GradientBoostingClassifier()
GB_s.fit(s_X, s_y)
print('训练集精确度:{0},测试集精确度:{1}\n'.format(GB_s.score(s_X, s_y), GB_s.score(X_test, y_test)))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, GB_s.predict(s_X)),
    roc_auc_score(y_test, GB_s.predict(X_test))))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(
    roc_auc_score(s_y, GB_s.predict_proba(s_X)[:, 1]),
    roc_auc_score(y_test, GB_s.predict_proba(X_test)[:, 1])))
训练集精确度:0.8654704915690197,测试集精确度:0.8668
predict 训练集auc:0.8654704915690197,测试集auc:0.7248715725122952
predict_proba 训练集auc:0.9424107237355832,测试集auc:0.8360759633753794
# Sweep n_estimators (coarse) for gradient boosting by held-out AUC.
auc_score = []
grid = list(range(50, 350, 50))
for n in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=n, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=100,
                                       max_depth=3, random_state=10, max_features='sqrt')
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
250 0.8684184382039928
# Refine n_estimators around the coarse optimum (250).
auc_score = []
grid = list(range(200, 300, 10))
for n in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=n, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=100,
                                       max_depth=3, random_state=10, max_features='sqrt')
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
250 0.8684184382039928
# Sweep max_depth (2..9) with n_estimators fixed at 250.
auc_score = []
grid = list(range(2, 10, 1))
for depth in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=250, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=100,
                                       max_depth=depth, random_state=10, max_features='sqrt')
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
4 0.8694069872976782
# Sweep max_features as a fraction of the feature count (0.1..1.0).
auc_score = []
grid = list(range(1, 11, 1))
for tenth in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=250, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=100,
                                       max_depth=4, random_state=10, max_features=tenth / 10)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
3 0.8694069872976782
# Sweep min_samples_leaf (coarse, 10..260 step 50).
auc_score = []
grid = list(range(10, 310, 50))
for leaf in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=250, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=leaf,
                                       max_depth=4, random_state=10, max_features=0.3)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
110 0.8694922785209158
# Refine min_samples_leaf around 110 (70..120 step 10).
auc_score = []
grid = list(range(70, 130, 10))
for leaf in grid:
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=250, subsample=0.7,
                                       min_samples_split=200, min_samples_leaf=leaf,
                                       max_depth=4, random_state=10, max_features=0.3)
    model.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
print(grid[auc_score.index(max(auc_score))], max(auc_score))
plt.plot(grid, auc_score)
plt.show()
90 0.8697006000505265
# Coarse sweep of min_samples_split (10, 60, ..., 260) with
# min_samples_leaf fixed at 90; report the best value and its test AUC,
# then plot the curve.
candidates = list(range(10, 300, 50))
auc_score = []
for i in candidates:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=250, subsample=0.7,
        min_samples_split=i, min_samples_leaf=90, max_depth=4,
        random_state=10, max_features=0.3,
    )
    GB_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]))
best = max(auc_score)
print(candidates[auc_score.index(best)], best)
plt.plot(candidates, auc_score)
plt.show()
260 0.869784512431818
# Extend the min_samples_split search upward (250 .. 400, step 50) to
# check whether the optimum lies beyond the previous range.
candidates = list(range(250, 410, 50))
auc_score = []
for i in candidates:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=250, subsample=0.7,
        min_samples_split=i, min_samples_leaf=90, max_depth=4,
        random_state=10, max_features=0.3,
    )
    GB_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]))
best = max(auc_score)
print(candidates[auc_score.index(best)], best)
plt.plot(candidates, auc_score)
plt.show()
250 0.8696629236420251
# Fine sweep of min_samples_split (250 .. 290, step 10); report the best
# value and its test AUC, then plot the curve.
candidates = list(range(250, 300, 10))
auc_score = []
for i in candidates:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=250, subsample=0.7,
        min_samples_split=i, min_samples_leaf=90, max_depth=4,
        random_state=10, max_features=0.3,
    )
    GB_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]))
best = max(auc_score)
print(candidates[auc_score.index(best)], best)
plt.plot(candidates, auc_score)
plt.show()
260 0.869784512431818
# Sweep subsample as i/10 (0.1 .. 1.0) with the tuned split/leaf values;
# report the best i and its test AUC, then plot AUC vs. i.
candidates = list(range(1, 11))
auc_score = []
for i in candidates:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=250, subsample=i / 10,
        min_samples_split=260, min_samples_leaf=90, max_depth=4,
        random_state=10, max_features=0.3,
    )
    GB_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]))
best = max(auc_score)
print(candidates[auc_score.index(best)], best)
plt.plot(candidates, auc_score)
plt.show()
7 0.869784512431818
# Sensitivity check: refit the tuned model under a few different random
# seeds to see how much the test AUC moves; report the best seed.
seeds = [10, 42, 100, 116214]
auc_score = []
for i in seeds:
    GB_m = GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=250, subsample=0.7,
        min_samples_split=260, min_samples_leaf=90, max_depth=4,
        random_state=i, max_features=0.3,
    )
    GB_m.fit(X_train, y_train)
    auc_score.append(roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1]))
best = max(auc_score)
print(seeds[auc_score.index(best)], best)
plt.plot(seeds, auc_score)
plt.show()
10 0.869784512431818
# Refit the fully tuned configuration and report accuracy plus AUC on
# both splits. AUC is shown twice: from hard labels (predict) and from
# class-1 probabilities (predict_proba); the latter is the competition
# metric.
GB_m = GradientBoostingClassifier(
    learning_rate=0.1, n_estimators=250, subsample=0.7,
    min_samples_split=260, min_samples_leaf=90, max_depth=4,
    random_state=10, max_features=0.3,
)
GB_m.fit(X_train, y_train)

acc_train = GB_m.score(X_train, y_train)
acc_test = GB_m.score(X_test, y_test)
auc_label_train = roc_auc_score(y_train, GB_m.predict(X_train))
auc_label_test = roc_auc_score(y_test, GB_m.predict(X_test))
auc_proba_train = roc_auc_score(y_train, GB_m.predict_proba(X_train)[:, 1])
auc_proba_test = roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1])

print('训练集精确度:{0},测试集精确度:{1}\n'.format(acc_train, acc_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(auc_label_train, auc_label_test))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(auc_proba_train, auc_proba_test))
训练集精确度:0.9400916666666667,测试集精确度:0.9371666666666667
predict 训练集auc:0.6089339590765261,测试集auc:0.5925436990826797
predict_proba 训练集auc:0.874079427703056,测试集auc:0.869784512431818
# Alternative configuration: halve the learning rate and double the
# number of trees, then report the same accuracy/AUC metrics for
# comparison against the tuned model above.
GB_m = GradientBoostingClassifier(
    learning_rate=0.05, n_estimators=500, subsample=0.7,
    min_samples_split=260, min_samples_leaf=90, max_depth=4,
    random_state=10, max_features=0.3,
)
GB_m.fit(X_train, y_train)

acc_train = GB_m.score(X_train, y_train)
acc_test = GB_m.score(X_test, y_test)
auc_label_train = roc_auc_score(y_train, GB_m.predict(X_train))
auc_label_test = roc_auc_score(y_test, GB_m.predict(X_test))
auc_proba_train = roc_auc_score(y_train, GB_m.predict_proba(X_train)[:, 1])
auc_proba_test = roc_auc_score(y_test, GB_m.predict_proba(X_test)[:, 1])

print('训练集精确度:{0},测试集精确度:{1}\n'.format(acc_train, acc_test))
print('predict 训练集auc:{0},测试集auc:{1}\n'.format(auc_label_train, auc_label_test))
print('predict_proba 训练集auc:{0},测试集auc:{1}\n'.format(auc_proba_train, auc_proba_test))
训练集精确度:0.940275,测试集精确度:0.9373666666666667
predict 训练集auc:0.6067785694888352,测试集auc:0.5903218295893178
predict_proba 训练集auc:0.8745757049324849,测试集auc:0.8691238322551743
# Final chosen configuration (the lr=0.1 variant scored better on the
# held-out split), refit for generating the submission.
gb_params = dict(
    learning_rate=0.1, n_estimators=250, subsample=0.7,
    min_samples_split=260, min_samples_leaf=90, max_depth=4,
    random_state=10, max_features=0.3,
)
best_est_GB = GradientBoostingClassifier(**gb_params)
best_est_GB.fit(X_train, y_train)
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=4,
max_features=0.3, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=90, min_samples_split=260,
min_weight_fraction_leaf=0.0, n_estimators=250,
n_iter_no_change=None, presort='deprecated',
random_state=10, subsample=0.7, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
# Score the Kaggle test set (target column removed — it is all NaN there)
# with the final model and write the class-1 probabilities into the sample
# submission frame, preserving its Id index.
df_test = test_data.drop(columns='SeriousDlqin2yrs')
sample_data["Probability"] = best_est_GB.predict_proba(df_test)[:, 1]
print(sample_data.head())
sample_data.to_csv("submission_GB.csv", index=True)
Probability
Id
1 0.072539
2 0.043227
3 0.013998
4 0.064475
5 0.083119