目录
一、数据读取及预处理
1、数据读取
使用kaggle上的风控练习数据集(下图仅为部分字段),数据集中包含数值型、类别型、日期等变量,同时存在缺失问题,适合初学者入门练习。文末获取数据集
2、数据预处理
(1)抽样:由于数据量大,仅抽取5w条作为示例;
(2)y标签编码:对loan_status逾期状态进行编码 {'Fully Paid':0,'Late (31-120 days)':1,'Late (16-30 days)':1},未出现在该映射中的状态(如current状态的在途订单)一律剔除;
(3)数据集划分:按照7:3的比例随机划分训练集、测试集;
(4)类别型变量编码:使用目标编码,即仅用训练集(避免数据穿越)统计每个类别取值对应的y均值,并用该均值替换原类别值。对于grade这种顺序型变量,按照等级顺序进行编码;
(5)缺失值填充:由于前面已经将类别型变量编码,这里使用列均值进行缺失填充;
(6)日期字段处理:由于大多日期字段含义不明确,所以先剔除
数据读取、预处理代码
import re
import os
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import warnings
import toad
warnings.filterwarnings('ignore')
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import roc_curve, auc
import xgboost as xgb
from xgboost.sklearn import XGBClassifier,XGBRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
import json
def data_pred(path='accepted_2007_to_2018Q4.csv', n_sample=50000, random_state=None):
    """Load the loan data, draw a random sample and build the binary label ``y``.

    Args:
        path: CSV file to read. Defaults to the file name used originally.
        n_sample: Maximum number of rows to sample. When the file holds fewer
            rows, all of them are used instead of raising.
        random_state: Optional seed forwarded to ``DataFrame.sample``; the
            default ``None`` keeps the original non-seeded behaviour.

    Returns:
        A DataFrame with a fresh 0..n-1 index that contains only the rows
        whose ``loan_status`` is one of the mapped statuses, plus a ``y``
        column (0 = 'Fully Paid', 1 = either 'Late' status).
    """
    status_to_label = {
        'Fully Paid': 0,
        'Late (31-120 days)': 1,
        'Late (16-30 days)': 1,
    }
    df_init = pd.read_csv(path)
    # DataFrame.sample raises if asked for more rows than exist (without
    # replacement), so never request more than the file provides.
    n_rows = min(n_sample, len(df_init))
    df = df_init.sample(n_rows, random_state=random_state).reset_index(drop=True)
    # Statuses missing from the mapping become NaN and are filtered out below.
    df['y'] = df.loan_status.map(status_to_label)
    df_copy = df[df.y.isin([0, 1])].copy()
    return df_copy.reset_index(drop=True)
# Load the sampled, labelled loan data that the later preprocessing step consumes.
df=data_pred()
# Date-like columns. fea_engineer drops all of them before encoding.
date_fea = [
    # loan / credit history dates
    'issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'last_credit_pull_d',
    'sec_app_earliest_cr_line',
    # hardship / settlement dates
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'debt_settlement_flag_date',
    'settlement_date',
    'next_pymnt_d',
]
# Categorical columns that fea_engineer passes to the target encoder.
target_label_fea = [
    # borrower / application attributes
    'sub_grade',
    'emp_title',
    'home_ownership',
    'verification_status',
    'pymnt_plan',
    'purpose',
    'addr_state',
    'initial_list_status',
    'application_type',
    'verification_status_joint',
    # hardship programme attributes
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'hardship_loan_status',
    # disbursement / settlement attributes
    'disbursement_method',
    'debt_settlement_flag',
    'settlement_status',
]
def target_label_mean(df_train, col, y_name='y'):
    """Build the target-encoding table for a single categorical column.

    Args:
        df_train: Frame to compute the statistics on (the caller passes the
            training rows only).
        col: Name of the categorical column to encode.
        y_name: Name of the label column.

    Returns:
        dict mapping each category value of ``col`` to the mean of
        ``y_name`` within that category, rounded to 6 decimals. Rows whose
        category is missing are skipped by ``groupby``.
    """
    return df_train.groupby(col)[y_name].mean().round(6).to_dict()
def target_label_mean_all(df_train, col_list, y_name='y'):
    """Build target-encoding tables for several categorical columns.

    Args:
        df_train: Frame to compute the statistics on.
        col_list: Names of the categorical columns to encode.
        y_name: Name of the label column.

    Returns:
        dict of ``{column name: encoding table}``, where each table is the
        result of ``target_label_mean`` for that column.
    """
    all_col_map = {}
    for col in col_list:
        all_col_map[col] = target_label_mean(df_train, col, y_name)
    return all_col_map
def train_test_split_self(df_copy, y_name='y', frac=0.7, random_state=1):
    """Tag every row as 'train' or 'test' using a per-class random split.

    The positive (``y == 1``) and negative (``y == 0``) rows are sampled
    separately so both classes keep the same train share.

    Note: ``df_copy`` is modified in place (a ``sample_label`` column is
    added) and the same object is returned. Row membership is decided by
    index label, so the index is assumed to be unique.

    Args:
        df_copy: Frame holding the label column.
        y_name: Name of the binary label column.
        frac: Share of each class assigned to 'train' (default 0.7).
        random_state: Seed for the sampling (default 1).

    Returns:
        ``df_copy`` with the added ``sample_label`` column. Rows whose label
        is neither 0 nor 1 are never sampled and therefore end up in 'test'.
    """
    train_index = []
    for label in (1, 0):
        class_rows = df_copy[df_copy[y_name] == label]
        sampled = class_rows.sample(frac=frac, random_state=random_state)
        train_index.extend(sampled.index)
    in_train = df_copy.index.isin(train_index)
    df_copy['sample_label'] = np.where(in_train, 'train', 'test')
    return df_copy
def train_test_target_label(df_copy, col_list):
    """Split into train/test and target-encode the given categorical columns.

    The encoding tables are computed from the rows labelled 'train' only and
    then applied to every row. Category values that have no entry in a table
    (including missing values) become NaN.

    Note: ``train_test_split_self`` and the column assignments below modify
    ``df_copy`` in place.

    Args:
        df_copy: Frame containing the label column ``y`` and ``col_list``.
        col_list: Names of the categorical columns to encode.

    Returns:
        Tuple ``(df, all_col_map)``: the encoded frame and the
        ``{column: {category: mean of y}}`` tables that were applied.
    """
    df = df_copy.pipe(train_test_split_self)
    train_rows = df[df.sample_label == 'train']
    all_col_map = target_label_mean_all(train_rows, col_list, y_name='y')
    # Iterate the tables (one per distinct column) so a column listed twice
    # is not mapped a second time, which would turn it entirely into NaN.
    for col, mapping in all_col_map.items():
        df[col] = df[col].map(mapping)
    return df, all_col_map
def get_fill_df(df_pred, y_name='y'):
    """Return a copy of ``df_pred`` with missing numeric values mean-imputed.

    Only numeric feature columns are filled, each with its own column mean.
    The label column ``y_name`` and non-numeric columns (for example the
    ``sample_label`` marker) are left untouched. A column that is entirely
    missing has no mean and stays NaN.

    Args:
        df_pred: Frame to impute; it is not modified.
        y_name: Name of the label column to exclude from imputation.

    Returns:
        The imputed copy.
    """
    df = df_pred.copy()
    # Restrict to numeric columns: DataFrame.mean() raises on string columns
    # in pandas >= 2.0, and the label must never be imputed.
    fill_col = [
        col for col in df.select_dtypes(include='number').columns
        if col != y_name
    ]
    if fill_col:
        df[fill_col] = df[fill_col].fillna(df[fill_col].mean())
    return df
def fea_engineer(df_copy):
    """Run all preprocessing steps and return the model-ready frame.

    Steps, in order:
      1. ``term``: strip the 'months' suffix and spaces, cast to float.
      2. ``grade``: ordinal encoding A..G -> 0..6.
      3. ``emp_length``: strip 'year(s)' and spaces, then convert to a number
         ('<1' -> 0, '10+' -> 10, '1'..'9' kept as is).
      4. Drop identifier / free-text columns and all ``date_fea`` columns.
      5. Train/test split plus target encoding of ``target_label_fea``.

    Args:
        df_copy: Raw labelled frame; it is not modified.

    Returns:
        Tuple ``(df, all_col_map)``: the processed frame (with a
        ``sample_label`` column) and the target-encoding tables.
    """
    df = df_copy.copy()
    df.term = (
        df.term.str.replace('months', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype('float')
    )

    grade_map = {grade: rank for rank, grade in enumerate('ABCDEFG')}
    df.grade = df.grade.map(grade_map)

    emp_length = (
        df.emp_length.str.replace('years', '', regex=False)
        .str.replace('year', '', regex=False)
        .str.replace(' ', '', regex=False)
    )
    # Only the two non-numeric spellings need translating. Mapping through a
    # two-key dict (as before) turned every value from 1 to 9 into NaN.
    emp_length = emp_length.replace({'<1': '0', '10+': '10'})
    df.emp_length = pd.to_numeric(emp_length, errors='coerce')

    drop_col = ['member_id', 'loan_status', 'url', 'desc', 'title', 'zip_code']
    # Date columns are dropped as well (their meaning is unclear).
    df = df.drop(columns=drop_col + date_fea)

    # Target-encode the remaining categorical columns using train rows only.
    df, all_col_map = train_test_target_label(df, target_label_fea)
    return df, all_col_map
# Run the full preprocessing: df_pred still contains NaNs, all_col_map holds
# the target-encoding tables that were applied.
df_pred,all_col_map=fea_engineer(df)
# Mean-imputed copy of df_pred.
df_pred_fill=get_fill_df(df_pred,y_name='y')
# Bare expression: notebook-style display of the imputed frame.
df_pred_fill
二、变量统计及筛选
1、变量列表
在数据预处理的基础上,定义变量列表、长度133(已经均为数值型变量)
2、iv计算
使用toad计算133个变量的iv值
3、特征筛选
由于本文也使用到逻辑回归,所以需要按照缺失程度、变量相关性、iv等对变量进行筛选,最终剩余变量59个
变量统计及筛选代码
# Candidate features: every column except the row id, the label and the
# train/test marker.
fea_list=list(df_pred.drop(['id','y','sample_label'],axis=1).columns)
# Bare expression: notebook-style display of the number of candidates.
len(fea_list)
import toad
def iv_miss(df, var_list, y):
    """Summarise missing rate, unique count and IV for each variable.

    The IV / unique statistics come from ``toad.quality`` and are computed on
    the rows whose target is not null; the missing rate is computed over all
    rows of ``df``.

    Args:
        df: Frame containing ``var_list`` and the target column.
        var_list: Names of the variables to evaluate.
        y: Name of the target column.

    Returns:
        DataFrame indexed by variable with columns '缺失率' (missing rate),
        'unique' and 'iv', sorted by 'iv' in descending order.
    """
    labelled = df[df[y].notnull()]
    iv_all = toad.quality(
        labelled[var_list + [y]], target=y, indicators=['iv', 'unique']
    )[['unique', 'iv']]
    # Mean of the null mask == share of missing rows per column.
    miss_per = df[var_list].isnull().mean().to_frame('缺失率')
    result = pd.concat([miss_per, iv_all], axis=1)
    return result.sort_values('iv', ascending=False)
# IV / missing-rate overview of every candidate feature (descriptive only).
df_iv=iv_miss(df_pred,fea_list,'y')
df_iv
y_name='y'
# Feature selection by missing rate, IV and correlation.
# Fit it on the training rows only: selecting features with the test labels
# in view leaks information and makes the test KS/AUC optimistic.
train_selected,drop_col=toad.selection.select(
    df_pred.loc[df_pred.sample_label=='train',fea_list+[y_name]],y_name,
    empty=0.9,iv=0.02,corr=0.7,return_drop=True)
train_selected
# Names of the features that survived the selection.
fea_list_select=list(train_selected.drop(y_name,axis=1).columns)
print(len(fea_list),len(fea_list_select))
三、模型构建及评估
分别使用lgb、xgb、rf、lr构建二分类模型,使用ks、auc进行评估
def init_params(model_select='lgb'):
    """Return the default hyper-parameter dict for the requested model.

    Args:
        model_select: One of 'lr' (LogisticRegression), 'rf'
            (RandomForestClassifier), 'xgb' (XGBClassifier) or 'lgb'
            (LGBMClassifier).

    Returns:
        dict of keyword arguments for the corresponding estimator. The
        LightGBM settings are derived from the XGBoost ones so the two
        boosters are configured alike.

    Raises:
        ValueError: If ``model_select`` is not one of the supported names.
    """
    params_lr = {
        'n_jobs': 4,
        'max_iter': 100,
        'penalty': 'l2',
        'solver': 'lbfgs',
        'random_state': 1,
        'tol': 0.0001,
        'C': 1,
        'verbose': 1,
    }
    params_rf = {
        'n_jobs': 4,
        'max_samples': 0.8,
        'n_estimators': 500,
        'max_depth': 3,
        'min_samples_leaf': 50,
        # 'auto' meant sqrt(n_features) for classifiers and was removed in
        # scikit-learn 1.3; 'sqrt' is the equivalent accepted by all versions.
        'max_features': 'sqrt',
        'min_impurity_decrease': 0,
        'bootstrap': True,
        'oob_score': True,
        'random_state': 1,
        'verbose': 1,
    }
    params_xgb = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 4,
        'n_estimators': 500,
        'eta': 0.02,
        'max_depth': 3,
        'min_child_weight': 50,
        'scale_pos_weight': 1,
        'gamma': 10,
        'reg_alpha': 2,
        'reg_lambda': 2,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 123,
    }
    params_lgb = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'n_jobs': 4,
        'n_estimators': params_xgb.get('n_estimators', 100),
        'learning_rate': params_xgb.get('eta', 0.1),
        'max_depth': params_xgb.get('max_depth', 4),
        'num_leaves': params_xgb.get('num_leaves', 20),
        'max_bin': 255,
        'subsample_for_bin': 100000,  # rows used to build the histograms
        'min_split_gain': params_xgb.get('gamma', 10),
        'min_child_samples': params_xgb.get('min_child_weight', 50),
        'colsample_bytree': params_xgb.get('colsample_bytree', 0.8),
        'subsample': params_xgb.get('subsample', 0.8),
        'subsample_freq': 1,  # run bagging every k iterations
        'feature_fraction_seed': 2,
        'bagging_seed': 1,
        'reg_alpha': params_xgb.get('reg_alpha', 10),
        'reg_lambda': params_xgb.get('reg_lambda', 10),
        # equivalent to is_unbalance=False
        'scale_pos_weight': params_xgb.get('scale_pos_weight', 1),
        'random_state': 1,
        # -1 silences training output. The former 'silent' flag is dropped:
        # LightGBM 4.0 removed it and 'verbose' already covers it.
        'verbose': -1,
    }
    params_by_model = {
        'lr': params_lr,
        'rf': params_rf,
        'xgb': params_xgb,
        'lgb': params_lgb,
    }
    if model_select not in params_by_model:
        raise ValueError(
            f"unknown model_select {model_select!r}; "
            f"expected one of {sorted(params_by_model)}"
        )
    return params_by_model[model_select]
def ks_auc_value(y_value, df, model):
    """Compute the KS statistic and the ROC AUC of a fitted classifier.

    Args:
        y_value: True binary labels for the rows of ``df``.
        df: Feature matrix passed to ``model.predict_proba``.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.

    Returns:
        Tuple ``(ks, auc)`` where ``ks`` is the maximum of ``tpr - fpr`` over
        the ROC curve and ``auc`` is the area under that curve.
    """
    y_true = list(y_value)
    y_score = list(model.predict_proba(df)[:, 1])
    fpr, tpr, _thresholds = roc_curve(y_true, y_score)
    ks = max(tpr - fpr)
    # Named auc_value so the imported sklearn.metrics.auc is not shadowed.
    auc_value = roc_auc_score(y_true, y_score)
    return ks, auc_value
def model_train_sklearn(df, y_name, model_fea, model_select='lgb'):
    """Fit one classifier on the train rows and evaluate it on both splits.

    Args:
        df: Frame with a ``sample_label`` column ('train' / 'test'), the
            label column and the feature columns.
        y_name: Name of the binary label column.
        model_fea: List of feature column names to train on.
        model_select: 'xgb', 'lgb', 'rf' or 'lr'.

    Returns:
        Tuple ``(model_result, model)``: a dict with good/bad counts and
        KS/AUC for the train and test rows, and the fitted estimator.

    Raises:
        ValueError: If ``model_select`` is not a supported model name.
    """
    model_classes = {
        'xgb': XGBClassifier,
        'lgb': lgb.LGBMClassifier,
        'rf': RandomForestClassifier,
        'lr': LogisticRegression,
    }
    # Fail early with a clear message instead of hitting an unbound `model`.
    if model_select not in model_classes:
        raise ValueError(
            f"unknown model_select {model_select!r}; "
            f"expected one of {sorted(model_classes)}"
        )
    params = init_params(model_select=model_select)
    model = model_classes[model_select](**params)

    train_rows = df[df.sample_label == 'train']
    test_rows = df[df.sample_label == 'test']
    x_train, y_train = train_rows[model_fea], train_rows[y_name]
    x_test, y_test = test_rows[model_fea], test_rows[y_name]

    if model_select in ('rf', 'lr'):
        model.fit(x_train, y_train)
    else:
        eval_set = [(x_train, y_train), (x_test, y_test)]
        if model_select == 'lgb' and hasattr(lgb, 'log_evaluation'):
            # LightGBM >= 4.0 no longer accepts `verbose` in fit(); per-round
            # evaluation logging is requested through a callback instead.
            model.fit(x_train, y_train, eval_set=eval_set,
                      callbacks=[lgb.log_evaluation()])
        else:
            model.fit(x_train, y_train, eval_set=eval_set, verbose=True)

    train_ks, train_auc = ks_auc_value(y_train, x_train, model)
    test_ks, test_auc = ks_auc_value(y_test, x_test, model)
    model_result = {
        'train_good': (y_train.count() - y_train.sum()), 'train_bad': y_train.sum(),
        'test_good': (y_test.count() - y_test.sum()), 'test_bad': y_test.sum(),
        'train_ks': train_ks, 'train_auc': train_auc,
        'test_ks': test_ks, 'test_auc': test_auc,
    }
    return model_result, model
# The boosters are trained on df_pred, which still contains missing values.
xgb_model_result,xgb_model=model_train_sklearn(df_pred,'y',fea_list_select,model_select='xgb')
lgb_model_result,lgb_model=model_train_sklearn(df_pred,'y',fea_list_select,model_select='lgb')
# Random forest and logistic regression are trained on the mean-imputed frame.
rf_model_result,rf_model=model_train_sklearn(df_pred_fill,'y',fea_list_select,model_select='rf')
lr_model_result,lr_model=model_train_sklearn(df_pred_fill,'y',fea_list_select,model_select='lr')
# Side-by-side table of the four result dicts (notebook-style display).
pd.DataFrame([xgb_model_result,lgb_model_result,rf_model_result,lr_model_result],index=['xgb','lgb','rf','lr'])
四、划重点
少走10年弯路
关注微信公众号 Python风控模型与数据分析,回复 kaggle风控实战 获取本篇数据及代码
还有更多理论、代码分享等你来拿