[Tianchi] Loan Default Prediction Task 3 — Feature Engineering
Stage goals: 1) feature preprocessing: missing-value handling, outlier handling, and feature binning; 2) feature encoding: encoding object-type features, feature normalization, etc.; 3) feature selection: filtering out uninformative features (chi-squared test, correlation coefficients, regularization, etc.); 4) baseline modeling: XGBoost and LightGBM.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

warnings.filterwarnings('ignore')
data_train = pd.read_csv(r'/home/corn/桌面/tianchifengkong/train.csv')
data_test_a = pd.read_csv(r'/home/corn/桌面/tianchifengkong/testA.csv')

# Split columns into numerical and categorical (object-dtype) features
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

# Fill missing values: median for numerical features, mode for categorical ones.
# Test-set gaps are filled with statistics computed on the training set.
# Note: .mode() returns a DataFrame, so take its first row for fillna to work.
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode().iloc[0])
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode().iloc[0])
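As a quick sanity check (not in the original notebook), one can confirm that no missing values remain after the fill:

# Hedged sketch: both sums should print 0 if every gap was filled
print(data_train.isnull().sum().sum(), data_test_a.isnull().sum().sum())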
# Convert issueDate to the number of days elapsed since the earliest issue month
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days
def employmentLength_to_int(s):
    """Map strings like '2 years' to the integer 2; leave NaN untouched."""
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])

for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
# Keep only the four-digit year from earliesCreditLine (e.g. 'Aug-2001' -> 2001)
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership',
                 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']

# grade is ordinal, so map it to integers directly
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
# One-hot encoding (kept as a demonstration here: the modeling pipeline below
# still expects the raw 'subGrade' column for label encoding). Note that
# `data = pd.get_dummies(data, ...)` inside a `for data in [...]` loop only
# rebinds the loop variable and never modifies the original DataFrames.
data_train_dummies = pd.get_dummies(data_train, columns=['subGrade', 'homeOwnership',
                                    'verificationStatus', 'purpose', 'regionCode'],
                                    drop_first=True)
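One caveat with one-hot encoding train and test separately is that a category seen in only one split produces mismatched columns. A minimal sketch of aligning the two frames (the `data_train_dummies` / `data_test_dummies` names are assumptions, not part of the original pipeline):

# Hedged sketch: give the test frame the same dummy columns, in the same order
data_test_dummies = pd.get_dummies(data_test_a, columns=['subGrade', 'homeOwnership',
                                   'verificationStatus', 'purpose', 'regionCode'],
                                   drop_first=True)
for col in set(data_train_dummies.columns) - set(data_test_dummies.columns) - {'isDefault'}:
    data_test_dummies[col] = 0  # category unseen in the test split
data_test_dummies = data_test_dummies[[c for c in data_train_dummies.columns if c != 'isDefault']]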
def find_outliers_by_3segama(data, fea):
    """Flag values outside mean ± 3 * std (the three-sigma rule)."""
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: 'outlier' if x > upper_rule or x < lower_rule else 'normal')
    return data

data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train, fea)

# Drop training rows flagged as outliers on any numerical feature
for fea in numerical_fea:
    data_train = data_train[data_train[fea + '_outliers'] == 'normal']
    data_train = data_train.reset_index(drop=True)
# Quantile binning: split loanAmnt into 10 equal-frequency bins. The original
# line relied on the leftover loop variable `data`; apply it to both splits.
for data in [data_train, data_test_a]:
    data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
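For comparison, fixed-width binning with pd.cut splits on value ranges rather than quantiles. A minimal sketch (the column name and the bin count of 10 are assumptions):

# Hedged sketch: equal-width bins over the raw loanAmnt range
for data in [data_train, data_test_a]:
    data['loanAmnt_bin_fixed'] = pd.cut(data['loanAmnt'], 10, labels=False)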
# Label-encode the high-cardinality categorical features, fitting the encoder
# on the union of train and test values so both splits share one mapping
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding done')
100%|██████████| 4/4 [00:03<00:00,  1.13it/s]
Label Encoding done
# Feature-selection recipes. These are demonstrations: `train` and
# `target_train` stand in for a numeric feature matrix and the label vector.

# 1) Filter by variance: drop features whose variance is below the threshold
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(train, target_train)

# 2) Filter by correlation: SelectKBest needs a score function (without one it
#    defaults to f_classif), so pass a Pearson-based scorer explicitly
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
SelectKBest(lambda X, y: np.array([abs(pearsonr(X[:, i], y)[0]) for i in range(X.shape[1])]),
            k=5).fit_transform(train, target_train)

# 3) Filter by the chi-squared test (requires non-negative feature values)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
SelectKBest(chi2, k=5).fit_transform(train, target_train)
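Since chi2 only accepts non-negative inputs, continuous features are typically rescaled first. A minimal sketch using the MinMaxScaler imported at the top (the [0, 1] range and the placeholder variables are assumptions):

# Hedged sketch: min-max scale to [0, 1] before the chi-squared filter
scaled = MinMaxScaler().fit_transform(train)
SelectKBest(chi2, k=5).fit_transform(scaled, target_train)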
# 4) Wrapper method: recursive feature elimination with logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)

# 5) Embedded method: select by L1-regularized logistic regression coefficients
#    (the L1 penalty requires a compatible solver such as 'liblinear')
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train, target_train)

# 6) Embedded method: select by GBDT feature importances
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
SelectFromModel(GradientBoostingClassifier()).fit_transform(train, target_train)
%matplotlib inline
# Correlation of every column with the target
x_train = data_train
data_corr = x_train.corrwith(data_train.isDefault)
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values

# Pairwise correlation heatmap over the numerical features
data_numeric = data_train[numerical_fea]
correlation = data_numeric.corr()
f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
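One possible use of the `result` table built above (a sketch, not part of the original pipeline): list features whose absolute correlation with the target falls below a threshold; the 0.01 cutoff here is an assumption.

# Hedged sketch: weakly correlated candidates one might consider dropping
low_corr_feats = result.loc[result['corr'].abs() < 0.01, 'features'].tolist()
print(low_corr_feats)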
# Final feature list: drop the id, the raw date, the target, and the outlier flags
features = [f for f in data_train.columns
            if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])  # out-of-fold predictions
    test = np.zeros(test_x.shape[0])    # fold-averaged test predictions
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {}************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'n_jobs': 24,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
            }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # predict() expects a DMatrix, so wrap the raw test frame
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)

        if clf_name == "cat":
            params = {
                'learning_rate': 0.05,
                'depth': 5,
                'l2_leaf_reg': 10,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 50,
                'random_seed': 11,
                'allow_writing_files': False,
            }
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        train[valid_index] = val_pred
        test += test_pred / kf.n_splits  # accumulate (not overwrite) the fold average
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1************************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.74909 valid_1's auc: 0.729732
[400] training's auc: 0.764514 valid_1's auc: 0.730261
[600] training's auc: 0.777878 valid_1's auc: 0.730283
Early stopping, best iteration is:
[541] training's auc: 0.774137 valid_1's auc: 0.730481
[0.7304805291626117]
************************************ 2************************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.748524 valid_1's auc: 0.731572
[400] training's auc: 0.764231 valid_1's auc: 0.732453
[600] training's auc: 0.778023 valid_1's auc: 0.732473
Early stopping, best iteration is:
[592] training's auc: 0.777512 valid_1's auc: 0.732533
[0.7304805291626117, 0.7325333805806149]
************************************ 3************************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.748256 valid_1's auc: 0.733065
[400] training's auc: 0.763732 valid_1's auc: 0.733821
[600] training's auc: 0.777215 valid_1's auc: 0.733752
Early stopping, best iteration is:
[456] training's auc: 0.767666 valid_1's auc: 0.733934
[0.7304805291626117, 0.7325333805806149, 0.7339337227125723]
************************************ 4************************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.749134 valid_1's auc: 0.728116
[400] training's auc: 0.764696 valid_1's auc: 0.728999
Early stopping, best iteration is:
[399] training's auc: 0.764622 valid_1's auc: 0.729022
[0.7304805291626117, 0.7325333805806149, 0.7339337227125723, 0.7290224622861343]
************************************ 5************************************
Training until validation scores don't improve for 200 rounds
[200] training's auc: 0.748309 valid_1's auc: 0.733303
[400] training's auc: 0.763856 valid_1's auc: 0.733901
Early stopping, best iteration is:
[363] training's auc: 0.76104 valid_1's auc: 0.733908
[0.7304805291626117, 0.7325333805806149, 0.7339337227125723, 0.7290224622861343, 0.7339081024026323]
lgb_score_list: [0.7304805291626117, 0.7325333805806149, 0.7339337227125723, 0.7290224622861343, 0.7339081024026323]
lgb_score_mean: 0.7319756394289131
lgb_score_std: 0.0019409373213926035
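Finally, the fold-averaged test predictions can be written out for submission. A minimal sketch, assuming the competition expects an `id` column plus an `isDefault` probability (the column names and file name are assumptions based on the test set's schema):

# Hedged sketch: dump the averaged LightGBM test predictions to a CSV
submission = pd.DataFrame({'id': data_test_a['id'], 'isDefault': lgb_test})
submission.to_csv('submission_lgb.csv', index=False)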