零基础入门金融风控之贷款违约预测—特征工程

最新推荐文章于 2020-09-27 15:53:32 发布

sosososoon

最新推荐文章于 2020-09-27 15:53:32 发布

阅读量1.3k

点赞数

分类专栏：数据分析与挖掘文章标签：数据挖掘

本文链接：https://blog.csdn.net/sosososoon/article/details/108721009

版权

数据分析与挖掘专栏收录该内容

25 篇文章 5 订阅

订阅专栏

初始化

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

INPUT_PATH = 'G:\代码\数据挖掘实践\\' 
data_train = pd.read_csv(INPUT_PATH +'train.csv')
data_test_a = pd.read_csv(INPUT_PATH +'testA.csv')

预处理

查找出数据中的类别特征和数值特征

numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns) 
# select_dtypes 选取特定的类型，include:选择的数据类型，exclude：排除的数据类型
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
# filter() 用于过滤序列，过滤掉不符合条件的元素，返回由符合条件元素组成的新列表
label = 'isDefault'
numerical_fea.remove(label)
print(category_fea)
print(numerical_fea)

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

缺失值处理

把所有缺失值替换为指定的值：

    data_train = data_train.fillna(0)

用缺失值上面的值替换缺失值：

    data_train = data_train.fillna(axis=0,method='ffill')

用缺失值下面的值替换缺失值，并且设置最多只填充两个连续的缺失值：

    data_train = data_train.fillna(axis=0,method='bfill',limit=2)

用插值的方式填充缺失值：

    data_train['n0'] = data_train['n0'].interpolate()

# 查看缺失值情况
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                    40270
n4                    33239
n5                    40270
n6                    40270
n7                    40270
n8                    40271
n9                    40270
n10                   33239
n11                   69752
n12                   40270
n13                   40270
n14                   40270
dtype: int64

# 用平均数填充数值型特征
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
# 用众数填充类别型特征
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())

时间格式处理

# 转化成时间格式
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d') # 把str转换为datetime
    # 构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days # 距离 2007-06-01 过去了多少天
data.head()

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n6	n7	n8	n9	n10	n13	n14	issueDateDT
0	800000	14000.0	3	10.99	458.28	B	B3	7027.0	10+ years	0	...	4.0	15.0	19.0	6.0	17.0	1.0	3.0	2587
1	800001	20000.0	5	14.65	472.14	C	C5	60426.0	10+ years	0	...	3.0	3.0	9.0	3.0	5.0	2.0	2.0	2952
2	800002	12000.0	3	19.99	445.91	D	D4	23547.0	2 years	1	...	36.0	5.0	6.0	4.0	12.0	0.0	7.0	3410
3	800003	17500.0	5	14.31	410.02	C	C4	636.0	4 years	0	...	2.0	8.0	14.0	2.0	10.0	0.0	3.0	2710
4	800004	35000.0	3	17.09	1249.42	D	D1	368446.0	< 1 year	1	...	3.0	16.0	18.0	11.0	19.0	0.0	1.0	3775

5 rows × 47 columns

对象类型特征转换到数值

处理 employmentLength

data_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        55842
10+ years    278860
2 years       76742
3 years       68149
4 years       50932
5 years       53186
6 years       39575
7 years       37622
8 years       38551
9 years       32225
< 1 year      68316
Name: employmentLength, dtype: int64

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

data['employmentLength'].value_counts(dropna=False).sort_index()

0.0     15989
1.0     13182
2.0     18207
3.0     16011
4.0     11833
5.0     12543
6.0      9328
7.0      8823
8.0      8976
9.0      7594
10.0    65772
NaN     11742
Name: employmentLength, dtype: int64

处理 earliesCreditLine

data_train['earliesCreditLine'].sample(5)

682302    Apr-2006
182452    Jul-2003
621004    Sep-2001
775313    Aug-2009
128605    Dec-1986
Name: earliesCreditLine, dtype: object

for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
    
data_train['earliesCreditLine'].sample(5)

791947    1999
418281    2003
455770    2001
443463    1987
252262    1973
Name: earliesCreditLine, dtype: int64

类别特征处理

cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership',
'verificationStatus', 'purpose', 'postCode', 'regionCode', \
'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 248683
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 932
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 39644
policyCode 类型数： 1

对于 employmentTitle 以及 title 这种类别数较多的特征，可能需要提取关键字然后进行聚合（可以检测关键字是否在其中然后依据关键字进行分类）。

像等级这种具有优先级的类别特征可以进行 labelencode 或者自映射

for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

for data in [data_train, data_test_a]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus',
'purpose', 'regionCode'], drop_first=True) # 获得 dummy 编码 (比 one-hot 编码少一维)

异常值处理

首先先分清是什么原因导致的异常值，然后再考虑如何处理。

如果异常值代表的是一种极其偶然的现象，并且这种偶然现象并不在研究范围内，此时就可以将其删除。
如果异常值存在且代表了一种真实存在的现象，那就不能随便删除。在现有的欺诈场景中很多时候欺诈数据本身相对于正常数据来说就是异常的，此时就要把这些异常点纳入，重新拟合模型，研究其规律。

利用均值和方差检验异常值

在统计学中，如果一个数据分布近似正态，那么大约 68% 的数据值会在均值的一个标准差范围内，大约 95% 会在两个标准差范围内，大约 99.7% 会在三个标准差范围内。

def find_outliers_by_3segama(data,fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
    return data

data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train,fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)

正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值        62
正常值    159548
Name: isDefault, dtype: int64
**********
正常值    793973
异常值      6027
Name: annualIncome_outliers, dtype: int64
annualIncome_outliers
异常值       756
正常值    158854
Name: isDefault, dtype: int64
**********
正常值    800000
Name: verificationStatus_outliers, dtype: int64
verificationStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    783003
异常值     16997
Name: purpose_outliers, dtype: int64
purpose_outliers
异常值      3635
正常值    155975
Name: isDefault, dtype: int64
**********
正常值    798931
异常值      1069
Name: postCode_outliers, dtype: int64
postCode_outliers
异常值       221
正常值    159389
Name: isDefault, dtype: int64
**********
正常值    799994
异常值         6
Name: regionCode_outliers, dtype: int64
regionCode_outliers
异常值         1
正常值    159609
Name: isDefault, dtype: int64
**********
正常值    798440
异常值      1560
Name: dti_outliers, dtype: int64
dti_outliers
异常值       466
正常值    159144
Name: isDefault, dtype: int64
**********
正常值    778245
异常值     21755
Name: delinquency_2years_outliers, dtype: int64
delinquency_2years_outliers
异常值      5089
正常值    154521
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeLow_outliers, dtype: int64
ficoRangeLow_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeHigh_outliers, dtype: int64
ficoRangeHigh_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    790889
异常值      9111
Name: openAcc_outliers, dtype: int64
openAcc_outliers
异常值      2195
正常值    157415
Name: isDefault, dtype: int64
**********
正常值    792471
异常值      7529
Name: pubRec_outliers, dtype: int64
pubRec_outliers
异常值      1701
正常值    157909
Name: isDefault, dtype: int64
**********
正常值    794120
异常值      5880
Name: pubRecBankruptcies_outliers, dtype: int64
pubRecBankruptcies_outliers
异常值      1423
正常值    158187
Name: isDefault, dtype: int64
**********
正常值    790001
异常值      9999
Name: revolBal_outliers, dtype: int64
revolBal_outliers
异常值      1359
正常值    158251
Name: isDefault, dtype: int64
**********
正常值    799948
异常值        52
Name: revolUtil_outliers, dtype: int64
revolUtil_outliers
异常值        23
正常值    159587
Name: isDefault, dtype: int64
**********
正常值    791663
异常值      8337
Name: totalAcc_outliers, dtype: int64
totalAcc_outliers
异常值      1668
正常值    157942
Name: isDefault, dtype: int64
**********
正常值    800000
Name: initialListStatus_outliers, dtype: int64
initialListStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    784586
异常值     15414
Name: applicationType_outliers, dtype: int64
applicationType_outliers
异常值      3875
正常值    155735
Name: isDefault, dtype: int64
**********
正常值    775134
异常值     24866
Name: title_outliers, dtype: int64
title_outliers
异常值      3900
正常值    155710
Name: isDefault, dtype: int64
**********
正常值    800000
Name: policyCode_outliers, dtype: int64
policyCode_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    782773
异常值     17227
Name: n0_outliers, dtype: int64
n0_outliers
异常值      3485
正常值    156125
Name: isDefault, dtype: int64
**********
正常值    790500
异常值      9500
Name: n1_outliers, dtype: int64
n1_outliers
异常值      2491
正常值    157119
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n2_outliers, dtype: int64
n2_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n3_outliers, dtype: int64
n3_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    788660
异常值     11340
Name: n4_outliers, dtype: int64
n4_outliers
异常值      2476
正常值    157134
Name: isDefault, dtype: int64
**********
正常值    790355
异常值      9645
Name: n5_outliers, dtype: int64
n5_outliers
异常值      1858
正常值    157752
Name: isDefault, dtype: int64
**********
正常值    786006
异常值     13994
Name: n6_outliers, dtype: int64
n6_outliers
异常值      3182
正常值    156428
Name: isDefault, dtype: int64
**********
正常值    788430
异常值     11570
Name: n7_outliers, dtype: int64
n7_outliers
异常值      2746
正常值    156864
Name: isDefault, dtype: int64
**********
正常值    789625
异常值     10375
Name: n8_outliers, dtype: int64
n8_outliers
异常值      2131
正常值    157479
Name: isDefault, dtype: int64
**********
正常值    786384
异常值     13616
Name: n9_outliers, dtype: int64
n9_outliers
异常值      3953
正常值    155657
Name: isDefault, dtype: int64
**********
正常值    788979
异常值     11021
Name: n10_outliers, dtype: int64
n10_outliers
异常值      2639
正常值    156971
Name: isDefault, dtype: int64
**********
正常值    799434
异常值       566
Name: n11_outliers, dtype: int64
n11_outliers
异常值       112
正常值    159498
Name: isDefault, dtype: int64
**********
正常值    797585
异常值      2415
Name: n12_outliers, dtype: int64
n12_outliers
异常值       545
正常值    159065
Name: isDefault, dtype: int64
**********
正常值    788907
异常值     11093
Name: n13_outliers, dtype: int64
n13_outliers
异常值      2482
正常值    157128
Name: isDefault, dtype: int64
**********
正常值    788884
异常值     11116
Name: n14_outliers, dtype: int64
n14_outliers
异常值      3364
正常值    156246
Name: isDefault, dtype: int64
**********

# 删除异常值
for fea in numerical_fea:
    data_train = data_train[data_train[fea+'_outliers']=='正常值']
    data_train = data_train.reset_index(drop=True)

特征分箱

特征分箱的目的：
- 从模型效果上来看，特征分箱主要是为了降低变量的复杂性，减少变量噪音对模型的影响，提高自变量和因变量的相关度。从而使模型更加稳定。
特征分箱的对象：
- 将连续变量离散化
- 将多状态的离散变量合并成少状态
分箱的原因：
- 数据的特征内的值跨度可能比较大，对于如 k-Means 等使用欧氏距离作为相似度函数来测量数据点之间的相似度的算法，会造成大吃小的影响。
分箱的优点：
- 处理缺失值：当数据源存在缺失值时，可以把null单独作为一个分箱。
- 处理异常值：当数据中存在离群点时，可以把离群点通过分箱离散化处理，从而提高变量的鲁棒性（抗干扰能力）。例如，若 age 出现200这种异常值，可分入“age > 60”这个分箱里，排除影响。
- 业务解释性：我们习惯于线性判断变量的作用，当 x 越大，y 就越大。但实际 x 与 y 之间经常存在着非线性关系，此时可以使用WOE变换。
分箱的基本原则：
- 最小分箱占比不低于 5%
- 箱内不能全部是好客户
- 连续箱单调

固定宽度分箱

- 当数值横跨多个数量级时，最好按照 10 的幂（或任何常数的幂）来进行分组。固定宽度分箱非常容易计算，但如果计数值中有比较大的缺口，就会产生很多没有任何数据的空箱子。

# 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

# 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

分位数分箱

data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)

卡方分箱

卡方分箱是自底向上的(即基于合并的)数据离散化方法。它依赖于卡方检验：具有最小卡方值的相邻区间合并在一起,直到满足确定的停止准则。
基本思想：对于精确的离散化，相对类频率在一个区间内应当完全一致。因此,如果两个相邻的区间具有非常类似的类分布，则这两个区间可以合并；否则，它们应当保持分开。而低卡方值表明它们具有相似的类分布。

特征交互

交互特征的构造非常简单，使用起来却代价不菲。如果线性模型中包含有交互特征对，那么它的训练时间和评分时间就会从 $O (n)$ 增加到 $O(n^2)$ .

for col in ['grade', 'subGrade']:
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict() # 字典
    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)

# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

特征编码

编码好的特征可以直接放入树模型中

# label-encode:subGrade,postCode,title
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))

  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:03<00:11,  3.85s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:05<00:06,  3.27s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:07<00:02,  2.80s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.33s/it][A

逻辑回归等模型要单独增加的特征工程

对特征做归一化，去除相关性高的特征
归一化目的是让训练过程更好更快的收敛，避免特征大吃小的问题

list_fea = ['loanAmnt','annualIncome', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc']
for fea in list_fea:
    data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))

特征选择

特征选择可以精简掉无用的特征，以降低最终模型的复杂性，它的最终目的是得到一个简约模型，在不降低预测准确率或对预测准确率影响不大的情况下提高计算速度。特征选择不是为了减少训练时间（实际上，一些技术会增加总体训练时间），而是为了减少模型评分时间。

Filter

基于特征间的关系进行筛选:

方差选择法
计算各个特征的方差，然后根据设定的阈值，选择方差大于阈值的特征

from sklearn.feature_selection import VarianceThreshold
# 其中参数 threshold 为方差的阈值
VarianceThreshold(threshold=3).fit_transform(data_train,target_train)

Pearson 相关系数
皮尔森相关系数是一种最简单的，可以帮助理解特征和响应变量之间关系的方法，该方法衡量的是变量之间的线性相关性。皮尔森相关系数的取值区间为 [-1，1] ， -1 表示完全的负相关， +1表示完全的正相关，0 表示没有线性相关。

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# 选择 K 个最好的特征，返回选择特征后的数据
# 第一个参数为计算评估特征是否好的函数，该函数输入特征矩阵和目标向量，
# 输出二元组（评分，P值）的数组，数组第 i 项为第 i 个特征的评分和 P 值。此处指计算出的相关系数
# 参数 K 为选择的特征个数
SelectKBest(k=5).fit_transform(train,target_train)

卡方检验
- 经典的卡方检验是用于检验自变量对因变量的相关性。假设自变量有 N 种取值，因变量有 M 种取值，考虑自变量等于 i 且因变量等于 j 的样本频数的观察值与期望的差距
- 其统计量如下： $χ 2 = \sum (A - T) 2 T$ ，其中A为实际值，T为理论值
- 卡方只能运用在正定矩阵上，否则会报错 Input X must be non-negative
```
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
SelectKBest(chi2, k=5).fit_transform(train,target_train)
```

互信息法
经典的互信息也是评价自变量对因变量的相关性的。在 feature_selection 库的 SelectKBest 类结合最大信息系数法可以用于选择特征：

from sklearn.feature_selection import SelectKBest
from minepy import MINE
# 由于 MINE 的设计不是函数式的，定义 mic 方法使其成为函数式的，
# 返回一个二元组，二元组的第2项设置成固定的 P 值0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

SelectKBest(lambda X, Y: array(map(lambda x:mic(x, Y), X.T)).T,k=2).fit_transform(train,target_train)

Wrapper （Recursive feature elimination，RFE）

递归特征消除法：使用一个基模型来进行多轮训练，每轮训练后，消除若干权值系数的特征，再基于新的特征集进行下一轮训练。feature_selection 库的 RFE 类可以用于选择特征：

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# 递归特征消除法，返回特征选择后的数据
# 参数estimator为基模型
# 参数n_features_to_select为选择的特征个数
RFE(estimator=LogisticRegression(),
n_features_to_select=2).fit_transform(train,target_train)

Embedded

基于惩罚项的特征选择法：使用带惩罚项的基模型，除了筛选出特征外，同时也进行了降维。

feature_selection 库的 SelectFromModel 类结合逻辑回归模型可以用于选择特征：

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# 带L1惩罚项的逻辑回归作为基模型的特征选择
SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(train,target_train)

基于树模型的特征选择：树模型中GBDT也可用来作为基模型进行特征选择。在 feature_selection 库的 SelectFromModel 类结合 GBDT 模型可以用于选择特征：

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# GBDT 作为基模型的特征选择
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)

数据处理

# 删除不需要的数据
for data in [data_train, data_test_a]:
    data.drop(['issueDate','id'], axis=1,inplace=True)

# 纵向用缺失值上面的值替换缺失值
data_train = data_train.fillna(axis=0,method='ffill')

x_train = data_train.drop(['isDefault'], axis=1)
# 计算协方差
data_corr = x_train.corrwith(data_train.isDefault) #计算相关性
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values
result

	features	corr
0	loanAmnt	0.065210
1	term	0.175126
2	interestRate	0.259202
3	installment	0.051524
4	employmentTitle	-0.030714
5	homeOwnership	0.054599
6	annualIncome	-0.042782
7	verificationStatus	0.088557
8	purpose	-0.029208
9	postCode	0.007139
10	regionCode	0.002822
11	dti	0.084937
12	delinquency_2years	0.020185
13	ficoRangeLow	-0.130994
14	ficoRangeHigh	-0.130993
15	openAcc	0.026867
16	pubRec	0.026558
17	pubRecBankruptcies	0.026048
18	revolBal	-0.020929
19	revolUtil	0.059377
20	totalAcc	-0.012756
21	initialListStatus	-0.007383
22	applicationType	0.018197
23	title	-0.025173
24	policyCode	NaN
25	n0	0.010229
26	n1	0.039083
27	n2	0.067559
28	n3	0.067559
29	n4	0.012882
30	n5	-0.017726
31	n6	0.005150
32	n7	0.031089
33	n8	-0.005393
34	n9	0.066188
35	n10	0.024947
36	n11	-0.000426
37	n12	0.003206
38	n13	0.009699
39	n14	0.081998

画出热力图：

data_numeric = data_train[numerical_fea]
correlation = data_numeric.corr()
f , ax = plt.subplots(figsize = (7, 7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square = True, vmax=0.8)

在这里插入图片描述

features = [f for f in data_train.columns if f not in ['id','issueDate','isDefault'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']

def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {}************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index],train_x.iloc[valid_index], train_y[valid_index]
        
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix,valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            params = {'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
                "silent": True,
            }
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
            verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)
            
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11,
                'allow_writing_files': False}
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
            cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
            train[valid_index] = val_pred
            test = test_pred / kf.n_splits
            cv_scores.append(roc_auc_score(val_y, val_pred))
            
    print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
 
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)