Financial Risk Control in Practice: Ensemble Learning

This post shows how to build a credit scorecard with LightGBM, improving model performance through collinearity removal and variable selection. It compares LightGBM against logistic regression (LR), reports KS and AUC on the training and validation sets, derives a variant of the scorecard formula, shows how to turn predicted probabilities into scores, and finishes with an evaluation report.

Even with XGBoost, we still need to remove collinear features and do variable selection.

For LR, the bivariate (bad-rate) trend of each feature must be strictly monotonic; XGBoost and LightGBM have no such requirement.
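As a concrete illustration, here is a minimal sketch of such a bivariate check (bivar_check is a hypothetical helper, not from the original post, and it assumes the Bcard data loaded below is already in `data` with label `bad_ind`):

import pandas as pd

def bivar_check(df, feature, target='bad_ind', n_bins=10):
    # bin the feature into equal-frequency buckets and compute the bad rate per bucket
    buckets = pd.qcut(df[feature], n_bins, duplicates='drop')
    rate = df.groupby(buckets, observed=True)[target].mean()
    # an LR/WOE scorecard wants this trend strictly monotonic; tree models do not care
    return rate.is_monotonic_increasing or rate.is_monotonic_decreasing

bivar_check(data, 'td_score')  # True if the bad rate moves monotonically across bins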

LightGBM Scorecard

import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import math
import time
import lightgbm as lgb

data = pd.read_csv('Bcard.txt')
data.head()

(Figure: preview of the first rows of Bcard.txt.)

data.shape
#(95806, 13)
#check the month distribution; we use the last month as the out-of-time validation set
data.obs_mth.unique()
#array(['2018-10-31', '2018-07-31', '2018-09-30', '2018-06-30',
#       '2018-11-30'], dtype=object)
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
val = data[data.obs_mth == '2018-11-30'].reset_index().copy()
#all of our variables: features ending in _info are personal-behaviour outputs of our own unsupervised system, features ending in _score are paid external credit-bureau data
lst = ['person_info','finance_info','credit_info','act_info','td_score','jxl_score','mj_score','rh_score']

df_train = df_train.sort_values(by = 'obs_mth',ascending = False)
df_train.head()

(Figure: head of df_train after sorting by obs_mth in descending order.)


# assign each row a 1..N rank in the time-sorted order, then normalise to (0, 1]
rank_lst = []
for i in range(1,len(df_train)+1):
    rank_lst.append(i)

df_train['rank'] = rank_lst

df_train['rank'] = df_train['rank']/len(df_train)

# cut the normalised rank into five equal, time-ordered buckets
# (bucket 1 holds the most recent 20%, since the sort above is descending)
pct_lst = []
for x in df_train['rank']:
    if x <= 0.2:
        x = 1
    elif x <= 0.4:
        x = 2
    elif x <= 0.6:
        x = 3
    elif x <= 0.8:
        x = 4
    else:
        x = 5
    pct_lst.append(x)
df_train['rank'] = pct_lst
#train = train.drop('obs_mth',axis = 1)
df_train.head()

(Figure: df_train with the new rank bucket column.)

df_train['rank'].groupby(df_train['rank']).count()
#rank
#1    15966
#2    15966
#3    15966
#4    15966
#5    15967
#Name: rank, dtype: int64
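The five buckets come out equal-sized. As a side note, the two loops above could be collapsed into a pd.qcut one-liner; a sketch (not from the original post), using the numpy/pandas imports at the top:

# fractional time rank cut into five equal-frequency buckets labelled 1..5
frac = np.arange(1, len(df_train) + 1) / len(df_train)
df_train['rank'] = pd.qcut(frac, 5, labels=False) + 1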
#define the LightGBM training helper
def LGB_test(train_x,train_y,test_x,test_y):
    from multiprocessing import cpu_count
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None, n_jobs=cpu_count()-1
        # n_estimators already caps the boosting rounds at 800; passing the
        # num_iterations alias as well only triggers a duplicate-parameter warning
    )
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=100)
    print(clf.n_features_)
    return clf, clf.best_score_['valid_1']['auc']
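With early_stopping_rounds=100, training stops once valid_1 AUC has not improved for 100 rounds, and clf.best_score_['valid_1']['auc'] is the AUC at the best iteration. Note that in lightgbm >= 4.0 this fit() argument was removed; there you would pass callbacks=[lgb.early_stopping(100)] instead.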

feature_lst = {}
ks_train_lst = []
ks_test_lst = []
for rk in set(df_train['rank']):

    # five-fold validation over the time-ordered buckets:
    # each bucket takes one turn as the test fold
    ttest = df_train[df_train['rank'] ==  rk]
    ttrain = df_train[df_train['rank'] !=  rk]

    train = ttrain[lst]
    train_y = ttrain.bad_ind

    test = ttest[lst]
    test_y = ttest.bad_ind

    start = time.time()
    model, val_auc = LGB_test(train,train_y,test,test_y)  # val_auc avoids shadowing sklearn's auc
    end = time.time()

    # store this fold's feature importances
    feature = pd.DataFrame(
                {'name' : model.booster_.feature_name(),
                'importance' : model.feature_importances_
              }).sort_values(by =  ['importance'],ascending = False)

    # compute KS and AUC on the training and test folds
    y_pred_train_lgb = model.predict_proba(train)[:, 1]
    y_pred_test_lgb = model.predict_proba(test)[:, 1]

    train_fpr_lgb, train_tpr_lgb, _ = roc_curve(train_y, y_pred_train_lgb)
    test_fpr_lgb, test_tpr_lgb, _ = roc_curve(test_y, y_pred_test_lgb)

    # KS = max |TPR - FPR|, the largest gap between the cumulative
    # distributions of bads and goods
    train_ks = abs(train_fpr_lgb - train_tpr_lgb).max()
    test_ks = abs(test_fpr_lgb - test_tpr_lgb).max()

    train_auc = metrics.auc(train_fpr_lgb, train_tpr_lgb)
    test_auc = metrics.auc(test_fpr_lgb, test_tpr_lgb)

    ks_train_lst.append(train_ks)
    ks_test_lst.append(test_ks)

    # keep only the features with importance >= 20 in this fold
    feature_lst[str(rk)] = feature[feature.importance>=20].name

train_ks = np.mean(ks_train_lst)
test_ks = np.mean(ks_test_lst)

# keep only the features that cleared the importance threshold in every fold
ft_lst = {}
for i in range(1,6):
    ft_lst[str(i)] = feature_lst[str(i)]

fn_lst=list(set(ft_lst['1']) & set(ft_lst['2'])
    & set(ft_lst['3']) & set(ft_lst['4']) &set(ft_lst['5']))

print('train_ks: ',train_ks)
print('test_ks: ',test_ks)
print('ft_lst: ',fn_lst )
#[LightGBM] [Warning] Unknown parameter: max_features
#[1]	training's auc: 0.726731	training's binary_logloss: 0.0827979	valid_1's auc: 0.742666	valid_1's binary_logloss: 0.12066
#[2]	training's auc: 0.769499	training's binary_logloss: 0.0822062	valid_1's auc: 0.753919	valid_1's binary_logloss: 0.119728
#[3]	training's auc: 0.788952	training's binary_logloss: 0.0816227	valid_1's auc: 0.762911	valid_1's binary_logloss: 0.118777
#. . .
#[188]	training's auc: 0.827082	training's binary_logloss: 0.0777181	valid_1's auc: 0.786679	valid_1's binary_logloss: 0.078782
#[189]	training's auc: 0.827128	training's binary_logloss: 0.0777136	valid_1's auc: 0.786756	valid_1's binary_logloss: 0.0787781
#[190]	training's auc: 0.827162	training's binary_logloss: 0.0777108	valid_1's auc: 0.786696	valid_1's binary_logloss: 0.0787811

#train_ks:  0.4907124806547195
#test_ks:  0.47382530047645305
#ft_lst:  ['credit_info', 'person_info', 'finance_info']
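Only the features whose importance reaches 20 in all five folds survive the intersection: credit_info, person_info and finance_info. The final feature list below keeps act_info as well.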
lst = ['person_info','finance_info','credit_info','act_info']

train = data[data.obs_mth != '2018-11-30'].reset_index().copy()  # full training window
evl = data[data.obs_mth == '2018-11-30'].reset_index().copy()    # out-of-time validation set

x = train[lst]
y = train['bad_ind']

evl_x =  evl[lst]
evl_y = evl['bad_ind']

model, val_auc = LGB_test(x, y, evl_x, evl_y)  # val_auc avoids shadowing sklearn's auc

y_pred = model.predict_proba(x)[:,1]
fpr_lgb_train,tpr_lgb_train,_ = roc_curve(y,y_pred)
train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
print('train_ks : ',train_ks)

y_pred = model.predict_proba(evl_x)[:,1]
fpr_lgb,tpr_lgb,_ = roc_curve(evl_y,y_pred)
evl_ks = abs(fpr_lgb - tpr_lgb).max()
print('evl_ks : ',evl_ks)

from matplotlib import pyplot as plt
plt.plot(fpr_lgb_train,tpr_lgb_train,label = 'train LGB')
plt.plot(fpr_lgb,tpr_lgb,label = 'evl LGB')
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc = 'best')
plt.show()
#[1]	training's binary_logloss: 0.090317	training's auc: 0.712883	valid_1's binary_logloss: 0.0986629	valid_1's auc: 0.678619
#Training until validation scores don't improve for 100 rounds.
#[2]	training's binary_logloss: 0.0896369	training's auc: 0.779216	valid_1's binary_logloss: 0.0978883	valid_1's auc: 0.755811
#[3]	training's binary_logloss: 0.0885026	training's auc: 0.779149	valid_1's binary_logloss: 0.0966811	valid_1's auc: 0.749375
#[4]	training's binary_logloss: 0.087998	training's auc: 0.780539	valid_1's binary_logloss: 0.0961527	valid_1's auc: 0.759009
#...
#[179]	training's binary_logloss: 0.0784288	training's auc: 0.812571	valid_1's binary_logloss: 0.0900886	valid_1's auc: 0.779962
#[180]	training's binary_logloss: 0.0784267	training's auc: 0.812602	valid_1's binary_logloss: 0.0900914	valid_1's auc: 0.779887
#[181]	training's binary_logloss: 0.078425	training's auc: 0.812601	valid_1's binary_logloss: 0.0900941	valid_1's auc: 0.779927
#[182]	training's binary_logloss: 0.0784229	training's auc: 0.8126	valid_1's binary_logloss: 0.0900964	valid_1's auc: 0.779932
#Early stopping, best iteration is:
#[82]	training's binary_logloss: 0.0788374	training's auc: 0.811646	valid_1's binary_logloss: 0.089958	valid_1's auc: 0.779946
#4
#train_ks :  0.4801091876625077
#evl_ks :  0.4416674980164514

(Figure: ROC curves for the training set and the out-of-time validation set.)
LightGBM does indeed beat LR here, but LR can be pushed close to this level of performance; we will do exactly that in the next lesson.

Scorecard Formula Transformation
$$\text{score} = 600 + 50 \times \frac{\ln \frac{P_{0}}{P_{1}}}{\ln 2}, \qquad P_{0} = P(\text{good}),\; P_{1} = P(\text{bad})$$

Writing the model's predicted bad probability as $xbeta = P_{1}$, so that $P_{0} = 1 - xbeta$:

$$\text{score} = 600 + 50 \times \frac{\ln \frac{1 - xbeta}{xbeta}}{\ln 2} = 600 + 50 \times \log_{2} \frac{1 - xbeta}{xbeta}$$
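A quick numeric sanity check of the final form (score600 is a hypothetical name, not from the post): at even odds the score is the 600 base, and every doubling of the good:bad odds adds 50 points.

import math

def score600(p_bad):
    # 600 at odds 1:1; +50 points per doubling of the good:bad odds
    return 600 + 50 * math.log2((1 - p_bad) / p_bad)

print(score600(0.5))  # 600.0 -> odds 1:1
print(score600(1/3))  # 650.0 -> odds 2:1, one doubling above even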

#['person_info','finance_info','credit_info','act_info']
#one-step score conversion
def score(xbeta):
    # log-odds of good vs bad; this post uses base 1000 and scale 500 rather
    # than the 600/50 above -- KS is unchanged by any monotone rescaling
    score = 1000 + 500 * math.log2((1 - xbeta) / xbeta)
    return score
evl['xbeta'] = model.predict_proba(evl_x)[:,1]   
evl['score'] = evl.apply(lambda x : score(x.xbeta) ,axis=1)

fpr_score, tpr_score, _ = roc_curve(evl_y, evl['score'])
evl_ks = abs(fpr_score - tpr_score).max()
print('val_ks : ', evl_ks)
#val_ks :  0.4416674980164514
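The KS computed from the score equals the KS computed from the raw probability (0.4417): KS depends only on how the model orders the accounts, and the score is a monotone transform of xbeta.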
#generate the evaluation report: 20 bins, riskiest accounts first
bins = 20
Y_predict = evl['xbeta']
Y = evl_y
nrows = Y.shape[0]
lis = [(Y_predict[i], Y[i]) for i in range(nrows)]
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)  # sort by predicted bad probability, descending
bin_num = int(nrows/bins+1)
bad = sum([1 for (p, y) in ks_lis if y > 0.5])
good = sum([1 for (p, y) in ks_lis if y <= 0.5])
bad_cnt, good_cnt = 0, 0
KS = []
BAD = []
GOOD = []
BAD_CNT = []
GOOD_CNT = []
BAD_PCTG = []
BADRATE = []
dct_report = {}
for j in range(bins):
    ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)]
    bad1 = sum([1 for (p, y) in ds if y > 0.5])
    good1 = sum([1 for (p, y) in ds if y <= 0.5])
    bad_cnt += bad1
    good_cnt += good1
    bad_pctg = round(bad_cnt/sum(evl_y),3)   # cumulative share of all bads captured so far
    badrate = round(bad1/(bad1+good1),3)     # bad rate within this bin
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3)  # KS at this cut-off
    KS.append(ks)
    BAD.append(bad1)
    GOOD.append(good1)
    BAD_CNT.append(bad_cnt)
    GOOD_CNT.append(good_cnt)
    BAD_PCTG.append(bad_pctg)
    BADRATE.append(badrate)
# assemble the report once the loop has finished
dct_report['KS'] = KS
dct_report['BAD'] = BAD
dct_report['GOOD'] = GOOD
dct_report['BAD_CNT'] = BAD_CNT
dct_report['GOOD_CNT'] = GOOD_CNT
dct_report['BAD_PCTG'] = BAD_PCTG
dct_report['BADRATE'] = BADRATE
val_report = pd.DataFrame(dct_report)
val_report
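Reading the report top-down, the first bins contain the riskiest accounts, so BADRATE should fall while BAD_PCTG climbs steeply, and the KS column peaks at the model's overall KS. A minimal sketch to eyeball this (assumes the val_report DataFrame above):

from matplotlib import pyplot as plt
val_report['BADRATE'].plot(kind='bar', title='Bad rate by score-sorted bin')
plt.xlabel('bin (riskiest first)')
plt.ylabel('bad rate')
plt.show()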

(Figure: the resulting evaluation report table.)
