kaggle风控建模实战(XGB+LGB+RF+LR)

目录

一、数据读取及预处理

1、数据读取

2、数据预处理

二、变量统计及筛选

1、变量列表

2、iv计算

3、特征筛选

三、模型构建及评估

四、划重点

少走10年弯路


一、数据读取及预处理

1、数据读取

        使用kaggle上的风控练习数据集(下图仅为部分字段),数据集中包含数值型、类别型、日期等变量,同时存在缺失问题,适合初学者入门练习。文末获取数据集

2、数据预处理

        (1)抽样:由于数据量大,仅抽取5w条作为示例;

        (2)y标签编码:对loan_status逾期状态进行编码 {'Fully Paid':0,'Late (31-120 days)':1,'Late (16-30 days)':1},未出现在该映射中的状态(如current状态的在途订单)全部剔除;

        (3)数据集划分:按照7:3的比例随机划分训练集、测试集;

        (4)类别型变量编码:使用目标编码,即用训练集(避免穿越)中每个类别对应的y均值对类别型变量进行编码。对于grade这种顺序型变量,按照等级顺序进行编码;

        (5)缺失值填充:由于前面已经将类别型变量编码,这里使用列均值进行缺失填充;

        (6)日期字段处理:由于大多日期字段含义不明确,所以先剔除

数据读取、预处理代码

import re
import os
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import warnings
import toad
warnings.filterwarnings('ignore')
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import roc_curve, auc
import xgboost as xgb
from xgboost.sklearn import XGBClassifier,XGBRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
import json


def data_pred(path='accepted_2007_to_2018Q4.csv', n_sample=50000, random_state=None):
    """Load the loan file, draw a row sample and attach the binary target ``y``.

    Args:
        path: CSV file to read. Defaults to the file name used originally.
        n_sample: Maximum number of rows to sample. If the file holds fewer
            rows, all of them are used instead of raising.
        random_state: Seed forwarded to ``DataFrame.sample``. ``None`` (the
            default) keeps the original non-deterministic sampling; pass an
            int to make the run reproducible.

    Returns:
        A freshly indexed DataFrame holding only the rows whose
        ``loan_status`` is one of the three mapped values, with ``y`` set to
        0 for 'Fully Paid' and 1 for either 'Late' status.
    """
    df_init = pd.read_csv(path)

    # sample() raises when asked for more rows than exist, so cap the size.
    n_rows = min(n_sample, len(df_init))
    df = df_init.sample(n_rows, random_state=random_state).reset_index(drop=True)

    status_to_y = {
        'Fully Paid': 0,
        'Late (31-120 days)': 1,
        'Late (16-30 days)': 1,
    }
    df['y'] = df.loan_status.map(status_to_y)

    # Statuses missing from the mapping become NaN and are filtered out here.
    df_copy = df[df.y.isin([0, 1])].copy()
    return df_copy.reset_index(drop=True)

# Build the working sample with the binary target column `y` attached.
df=data_pred()


# Date-like columns; fea_engineer drops all of them before modelling.
date_fea=[
    'issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'last_credit_pull_d',
    'sec_app_earliest_cr_line',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'debt_settlement_flag_date',
    'settlement_date',
    'next_pymnt_d'
]

# Categorical columns that fea_engineer replaces with their target encoding
# (mean of y per category, computed on the train split).
# NOTE(review): the hardship_* / settlement_* / debt_settlement_flag columns
# look like they are filled in after the loan has gone bad -- confirm they are
# available at decision time before relying on the reported metrics.
target_label_fea=[
    'sub_grade',
    'emp_title',
    'home_ownership',
    'verification_status',
    'pymnt_plan',
    'purpose',
    'addr_state',
    'initial_list_status',
    'application_type',
    'verification_status_joint',
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'hardship_loan_status',
    'disbursement_method',
    'debt_settlement_flag',
    'settlement_status',        
]

def target_label_mean(df_train, col, y_name='y'):
    """Target-encode one column: map each category of ``col`` to its mean ``y``.

    Args:
        df_train: Frame the means are computed on.
        col: Name of the categorical column.
        y_name: Name of the target column.

    Returns:
        dict of ``{category: mean of y rounded to 6 decimals}``.
    """
    class_rate = df_train.groupby([col])[y_name].mean()
    return dict(class_rate.round(6))

def target_label_mean_all(df_train, col_list, y_name='y'):
    """Target-encode several columns at once.

    Args:
        df_train: Frame the per-category means are computed on.
        col_list: Names of the categorical columns to encode.
        y_name: Name of the target column.

    Returns:
        dict of ``{column name: mapping returned by target_label_mean}``.
    """
    encodings = {}
    for col in col_list:
        encodings[col] = target_label_mean(df_train, col, y_name)
    return encodings

def train_test_split_self(df_copy, y_name='y', frac=0.7, random_state=1):
    """Mark each row as 'train' or 'test' with a per-class random split.

    Rows with ``y == 1`` and rows with ``y == 0`` are sampled separately with
    the same fraction, so both splits keep roughly the original bad rate.

    Note:
        The frame is modified in place (a ``sample_label`` column is added)
        and the same object is returned. Membership is decided by index
        label, so the index should be unique. Rows whose target is neither
        0 nor 1 are never sampled and therefore end up labelled 'test'.

    Args:
        df_copy: Frame containing the target column.
        y_name: Name of the binary target column.
        frac: Fraction of each class assigned to the train split.
        random_state: Seed used for both per-class samples.

    Returns:
        ``df_copy`` itself, with the ``sample_label`` column set.
    """
    train_index = []
    for label in (1, 0):
        picked = df_copy[df_copy[y_name] == label].sample(
            frac=frac, random_state=random_state)
        train_index.extend(picked.index)

    is_train = df_copy.index.isin(train_index)
    df_copy['sample_label'] = np.where(is_train, 'train', 'test')

    return df_copy

def train_test_target_label(df_copy, col_list):
    """Split the frame, then target-encode ``col_list`` using train rows only.

    The encodings are learned on the rows labelled 'train' and applied to
    every row; a category with no entry in the learned mapping becomes NaN.

    Args:
        df_copy: Frame containing the target column ``y`` and ``col_list``.
        col_list: Categorical columns to replace by their target encoding.

    Returns:
        Tuple ``(encoded frame, {column: {category: encoded value}})``.
    """
    df = train_test_split_self(df_copy)

    train_rows = df[df.sample_label == 'train']
    all_col_map = target_label_mean_all(train_rows, col_list, y_name='y')

    for col in col_list:
        mapping = all_col_map[col]
        df[col] = df[col].map(mapping)

    return df, all_col_map

def get_fill_df(df_pred, y_name='y'):
    """Return a copy of ``df_pred`` with missing feature values mean-imputed.

    Only numeric columns other than the target are imputed. Non-numeric
    columns (for example the 'train'/'test' marker) and the target are left
    as they are. A column that is entirely NaN has a NaN mean and therefore
    stays NaN.

    Args:
        df_pred: Frame to impute; it is not modified.
        y_name: Name of the target column, excluded from imputation.

    Returns:
        A new DataFrame with NaNs in numeric feature columns replaced by the
        column mean.
    """
    df = df_pred.copy()
    fill_col = [col for col in df.columns if col != y_name]

    # numeric_only=True: without it, recent pandas raises on string columns.
    col_means = df[fill_col].mean(numeric_only=True)
    df = df.fillna(col_means)

    return df
 
def fea_engineer(df_copy):
    """Run all preprocessing steps and return the model-ready frame.

    Steps, in order:
      1. ``term``: strip 'months' and spaces, cast to float.
      2. ``grade``: ordinal-encode 'A'..'G' as 0..6.
      3. ``emp_length``: strip 'year(s)' and spaces, read '<1' as 0 and '10+'
         as 10, and convert the remaining values to numbers.
      4. Drop identifier / free-text columns and every column in ``date_fea``.
      5. Split into train/test and target-encode ``target_label_fea``.

    Args:
        df_copy: Raw frame with the target column ``y``; it is not modified.

    Returns:
        Tuple ``(processed frame, target-encoding maps per column)``.
    """
    df = df_copy.copy()
    df.term = df.term.str.replace('months', '').str.replace(' ', '').astype('float')

    grade_map = {grade: rank for rank, grade in enumerate('ABCDEFG')}
    df.grade = df.grade.map(grade_map)

    emp_length = (
        df.emp_length
        .str.replace('years', '')
        .str.replace('year', '')
        .str.replace(' ', '')
    )
    # Series.map with a dict turns every value missing from the dict into NaN,
    # which discarded all the 1-9 year values. Rewrite only the two
    # non-numeric spellings and parse the rest as numbers.
    emp_length = emp_length.replace({'<1': '0', '10+': '10'})
    df.emp_length = pd.to_numeric(emp_length, errors='coerce')

    drop_col = ['member_id', 'loan_status', 'url', 'desc', 'title', 'zip_code']
    # Date fields are dropped as well: their meaning is unclear for now.
    df = df.drop(drop_col + date_fea, axis=1)

    # Target-encode the remaining categorical columns (fitted on train rows).
    df, all_col_map = train_test_target_label(df, target_label_fea)
    return df, all_col_map

# Preprocess the sample; df_pred still contains NaNs (used for xgb / lgb).
df_pred,all_col_map=fea_engineer(df)
# Mean-imputed copy, used for the models that cannot take NaNs (rf / lr).
df_pred_fill=get_fill_df(df_pred,y_name='y')
# Bare expression: displays the frame when run in a notebook cell.
df_pred_fill

二、变量统计及筛选

1、变量列表

        在数据预处理的基础上,定义变量列表、长度133(已经均为数值型变量)

2、iv计算

        使用toad计算133个变量的iv值

3、特征筛选

        由于本文也使用到逻辑回归,所以需要按照缺失程度、变量相关性、iv等对变量进行筛选,最终剩余变量59个

变量统计及筛选代码

# Candidate features: every column except the row id, the target and the
# train/test marker.
fea_list=df_pred.columns.drop(['id','y','sample_label']).tolist()
len(fea_list)

import toad
def iv_miss(df, var_list, y):
    """Summarise missing rate, unique count and IV for each variable.

    IV and unique counts come from ``toad.quality`` and are computed on the
    rows whose target is not null; the missing rate is computed on all rows.

    Args:
        df: Frame holding the variables and the target.
        var_list: Names of the variables to evaluate.
        y: Name of the target column.

    Returns:
        DataFrame indexed by variable with the columns '缺失率' (missing
        rate), 'unique' and 'iv', sorted by 'iv' in descending order.
    """
    labelled = df[df[y].notnull()].copy()

    quality = toad.quality(labelled[var_list + [y]], target=y, indicators=['iv', 'unique'])
    iv_all = quality[['unique', 'iv']]

    missing_share = df[var_list].isnull().sum() / df.shape[0]
    miss_per = missing_share.to_frame('缺失率')

    summary = pd.concat([miss_per, iv_all], axis=1)
    return summary.sort_values('iv', ascending=False)


df_iv=iv_miss(df_pred,fea_list,'y') # IV per variable (descriptive overview)
df_iv

y_name='y'
# Feature selection by missing rate / IV / correlation.
# Only train rows are passed in, so test-set labels do not influence which
# variables survive (same anti-leakage rule as the target encoding).
train_selected,drop_col=toad.selection.select(
    df_pred.loc[df_pred.sample_label=='train',fea_list+[y_name]],y_name,empty=0.9,
    iv=0.02,corr=0.7,return_drop=True)
train_selected

# Remaining variables after selection
fea_list_select=list(train_selected.drop('y',axis=1).columns)
print(len(fea_list),len(fea_list_select))

三、模型构建及评估

        分别使用lgb、xgb、rf、lr构建二分类模型,使用ks、auc进行评估

def init_params(model_select='lgb'):
    """Return the default hyper-parameter dict for one of the four models.

    The LightGBM settings are derived from the XGBoost ones so that both
    boosters are configured with matching values.

    Args:
        model_select: One of 'lr', 'rf', 'xgb' or 'lgb'.

    Returns:
        dict of keyword arguments for the matching estimator constructor.

    Raises:
        ValueError: If ``model_select`` is not a supported model key.
    """
    params_lr = {
        'n_jobs': 4,
        'max_iter': 100,
        'penalty': 'l2',
        'solver': 'lbfgs',
        'random_state': 1,
        'tol': 0.0001,
        'C': 1,
        'verbose': 1,
    }
    params_rf = {
        'n_jobs': 4,
        'max_samples': 0.8,
        'n_estimators': 500,
        'max_depth': 3,
        'min_samples_leaf': 50,
        # 'auto' meant sqrt(n_features) for classifiers and is rejected by
        # recent scikit-learn releases; 'sqrt' is the explicit equivalent.
        'max_features': 'sqrt',
        'min_impurity_decrease': 0,
        'bootstrap': True,
        'oob_score': True,
        'random_state': 1,
        'verbose': 1,
    }
    params_xgb = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 4,
        'n_estimators': 500,
        'eta': 0.02,
        'max_depth': 3,
        'min_child_weight': 50,
        'scale_pos_weight': 1,
        'gamma': 10,
        'reg_alpha': 2,
        'reg_lambda': 2,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 123,
    }
    params_lgb = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'n_jobs': 4,
        'n_estimators': params_xgb.get('n_estimators', 100),
        'learning_rate': params_xgb.get('eta', 0.1),
        'max_depth': params_xgb.get('max_depth', 4),
        # params_xgb has no 'num_leaves', so this is always the fallback 20.
        'num_leaves': params_xgb.get('num_leaves', 20),
        'max_bin': 255,
        'subsample_for_bin': 100000,  # rows used to build the histograms
        'min_split_gain': params_xgb.get('gamma', 10),
        'min_child_samples': params_xgb.get('min_child_weight', 50),
        'colsample_bytree': params_xgb.get('colsample_bytree', 0.8),
        'subsample': params_xgb.get('subsample', 0.8),
        'subsample_freq': 1,  # run bagging every k iterations
        'feature_fraction_seed': 2,
        'bagging_seed': 1,
        'reg_alpha': params_xgb.get('reg_alpha', 10),
        'reg_lambda': params_xgb.get('reg_lambda', 10),
        'scale_pos_weight': params_xgb.get('scale_pos_weight', 1),  # same as is_unbalance=False
        'random_state': 1,
        # 'silent' is no longer a LightGBM parameter; verbose=-1 alone
        # suppresses the training output.
        'verbose': -1,
    }

    params_by_model = {
        'lr': params_lr,
        'rf': params_rf,
        'xgb': params_xgb,
        'lgb': params_lgb,
    }
    if model_select not in params_by_model:
        raise ValueError(
            f"unknown model_select {model_select!r}; "
            f"expected one of {sorted(params_by_model)}"
        )
    return params_by_model[model_select]

def ks_auc_value(y_value, df, model):
    """Score ``model`` on ``df`` and return its KS and AUC.

    Args:
        y_value: True binary labels for the rows of ``df``.
        df: Feature frame passed to ``model.predict_proba``.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        Tuple ``(ks, auc)`` where ks is the largest gap between TPR and FPR
        along the ROC curve.
    """
    labels = list(y_value)
    # Column 1 of predict_proba is the probability of the positive class.
    scores = list(model.predict_proba(df)[:, 1])

    fpr, tpr, _ = roc_curve(labels, scores)
    return max(tpr - fpr), roc_auc_score(labels, scores)

def model_train_sklearn(df, y_name, model_fea, model_select='lgb'):
    """Fit one of the four classifiers and report train/test KS and AUC.

    Args:
        df: Frame with the features, the target and a ``sample_label``
            column holding 'train' / 'test'.
        y_name: Name of the target column.
        model_fea: Feature columns to train on.
        model_select: One of 'xgb', 'lgb', 'rf' or 'lr'.

    Returns:
        Tuple ``(model_result, model)`` where ``model_result`` is a dict with
        good/bad counts and KS/AUC for both splits, and ``model`` is the
        fitted estimator.

    Raises:
        ValueError: If ``model_select`` is not a supported model key.
    """
    model_classes = {
        'xgb': XGBClassifier,
        'lgb': lgb.LGBMClassifier,
        'rf': RandomForestClassifier,
        'lr': LogisticRegression,
    }
    if model_select not in model_classes:
        raise ValueError(
            f"unknown model_select {model_select!r}; "
            f"expected one of {sorted(model_classes)}"
        )

    params = init_params(model_select=model_select)
    model = model_classes[model_select](**params)

    train_part = df[df.sample_label == 'train']
    test_part = df[df.sample_label == 'test']
    x_train, y_train = train_part[model_fea], train_part[y_name]
    x_test, y_test = test_part[model_fea], test_part[y_name]

    if model_select in ('rf', 'lr'):
        model.fit(x_train, y_train)
    else:
        eval_set = [(x_train, y_train), (x_test, y_test)]
        if model_select == 'lgb' and hasattr(lgb, 'log_evaluation'):
            # Newer LightGBM releases no longer accept fit(verbose=...);
            # per-iteration logging is requested through a callback instead.
            model.fit(x_train, y_train, eval_set=eval_set,
                      callbacks=[lgb.log_evaluation(period=1)])
        else:
            model.fit(x_train, y_train, eval_set=eval_set, verbose=True)

    train_ks, train_auc = ks_auc_value(y_train, x_train, model)
    test_ks, test_auc = ks_auc_value(y_test, x_test, model)

    model_result = {
        'train_good': (y_train.count() - y_train.sum()), 'train_bad': y_train.sum(),
        'test_good': (y_test.count() - y_test.sum()), 'test_bad': y_test.sum(),
        'train_ks': train_ks, 'train_auc': train_auc,
        'test_ks': test_ks, 'test_auc': test_auc,
    }
    return model_result, model

# The boosters are trained on df_pred, which still contains NaNs.
xgb_model_result,xgb_model=model_train_sklearn(df_pred,'y',fea_list_select,model_select='xgb')
lgb_model_result,lgb_model=model_train_sklearn(df_pred,'y',fea_list_select,model_select='lgb')
# Random forest and logistic regression are trained on the mean-imputed frame.
rf_model_result,rf_model=model_train_sklearn(df_pred_fill,'y',fea_list_select,model_select='rf')
lr_model_result,lr_model=model_train_sklearn(df_pred_fill,'y',fea_list_select,model_select='lr')
# One row per model: good/bad counts plus KS and AUC on train and test.
pd.DataFrame([xgb_model_result,lgb_model_result,rf_model_result,lr_model_result],index=['xgb','lgb','rf','lr'])

四、划重点

少走10年弯路

        关注微信公众号 Python风控模型与数据分析,回复 kaggle风控实战 获取本篇数据及代码

        还有更多理论、代码分享等你来拿

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值