目标
- 学习特征预处理、缺失值、异常值处理、数据分桶等特征处理方法
- 学习特征交互、编码、选择的相应方法
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
#from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
import os
os.getcwd()
#pip install xgboost-1.1.0-cp37-cp37m-win_amd64.whl
Requirement already satisfied: xgboost==1.1.0 from file:///C:/Users/gnzha/%E5%B7%A5%E4%BD%9C/%E5%B7%A5%E4%BD%9C/bonc/python/%E5%AD%A6%E4%B9%A0%E8%B5%84%E6%96%99/202009datawhale%E8%B5%84%E6%96%99/xgboost-1.1.0-cp37-cp37m-win_amd64.whl
Note: you may need to restart the kernel to use updated packages.
#pip install lightgbm-2.3.2-cp37-cp37m-win_amd64.whl
Processing c:\users\gnzha\工作\工作\bonc\python\学习资料\202009datawhale资料\lightgbm-2.3.2-cp37-cp37m-win_amd64.whl
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.2
Note: you may need to restart the kernel to use updated packages.
##pip install catboost -i https://pypi.tuna.tsinghua.edu.cn/simple
pip list
pip show xgboost
Name: xgboost
Version: 1.1.0
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: None
Author-email: None
License: Apache-2.0
Location:
Requires: numpy, scipy
Required-by:
Note: you may need to restart the kernel to use updated packages.
# Load the training set and the "testA" test set from CSV files under ./数据.
data_train =pd.read_csv('./数据/train.csv')
data_test_a = pd.read_csv('./数据/testA.csv')
1 数据预处理
# Split the columns into numerical (non-object dtype) and categorical (object dtype) features.
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
category_fea = [col for col in data_train.columns if col not in numerical_fea]
# The target column is numeric but is not a feature, so take it out of the list.
label = 'isDefault'
numerical_fea.remove(label)
1.1 缺失值的填充
# Demo: replace every missing value with 0; the result is stored in a separate
# frame (data_train1), data_train itself is not reassigned here.
data_train1=data_train.fillna(0)
data_train1.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | 0 | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
# Demo: forward fill - replace each missing value with the closest valid value above it.
# ffill()/bfill() are used because fillna(method=...) is deprecated in recent pandas.
data_train2 = data_train.ffill(axis=0)
# Demo: backward fill - use the closest valid value below, for at most 2 consecutive rows.
# NOTE: this rebinds data_train2, so only the backward-filled frame is kept.
data_train2 = data_train.bfill(axis=0, limit=2)
data_train.isnull().sum()
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 1
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 1
regionCode 0
dti 239
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 405
revolBal 0
revolUtil 531
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 1
policyCode 0
n0 40270
n1 40270
n2 40270
n2.1 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
# Fill numerical features with the training-set median; the same statistics are
# reused for the test set so both frames are imputed consistently.
train_median = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_median)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(train_median)
# Fill categorical features with the training-set mode.
# mode() returns a DataFrame; handing that to fillna aligns on the row index and
# fills row 0 only, so take its first row to get one fill value per column.
train_mode = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(train_mode)
data_test_a[category_fea] = data_test_a[category_fea].fillna(train_mode)
data_train[category_fea].mode()
grade | subGrade | employmentLength | issueDate | earliesCreditLine | |
---|---|---|---|---|---|
0 | B | C1 | 10+ years | 2016-03-01 | Aug-2001 |
data_train.isnull().sum()
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 0
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 0
regionCode 0
dti 0
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 0
revolBal 0
revolUtil 0
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 0
policyCode 0
n0 0
n1 0
n2 0
n2.1 0
n4 0
n5 0
n6 0
n7 0
n8 0
n9 0
n10 0
n11 0
n12 0
n13 0
n14 0
dtype: int64
data_train.employmentLength.head()
0 2 years
1 5 years
2 8 years
3 10+ years
4 NaN
Name: employmentLength, dtype: object
# Fill the remaining missing employmentLength values with '10+ years' on BOTH
# frames, so the test set does not keep NaNs that the training set no longer has.
for data in [data_train, data_test_a]:
    data['employmentLength'] = data['employmentLength'].fillna('10+ years')
data_train.isnull().any().sum()
0
data_train.employmentLength
0 2 years
1 5 years
2 8 years
3 10+ years
4 10+ years
...
799995 7 years
799996 10+ years
799997 10+ years
799998 10+ years
799999 5 years
Name: employmentLength, Length: 800000, dtype: object
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
1.2 时间格式处理
# Reference date used to turn the issue date into a day count.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    # Parse the issue date strings into datetime values.
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Derived time feature: number of days between the issue date and startdate.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDateDT'],
dtype='object')
data_train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | issueDateDT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2587 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | 7.0 | 7.0 | 13.0 | 5.0 | 13.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1888 |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 | 3044 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2983 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | 10+ years | 1 | ... | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 | 3196 |
5 rows × 48 columns
data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489
10+ years 309552
2 years 72358
3 years 64152
4 years 47985
5 years 50102
6 years 37254
7 years 35407
8 years 36192
9 years 30272
< 1 year 64237
Name: employmentLength, dtype: int64
1.3 对象类型特征转换到数值
def employmentLength_to_int(s):
    """Convert an employment-length label such as '2 years' to an ``np.int8``.

    Missing values (anything ``pd.isnull`` accepts) are returned unchanged.
    Besides plain '<n> years' labels, the raw '10+ years' and '< 1 year'
    labels are understood directly (mapped to 10 and 0), so the function does
    not depend on the caller normalising them first. Values that are already
    integers are passed through, which makes re-running the conversion safe.
    """
    if pd.isnull(s):
        return s
    token = str(s).split()[0]
    # '< 1 year' (or '<1 year') means less than one full year.
    if token.startswith('<'):
        return np.int8(0)
    # '10+' carries a trailing plus sign that int() cannot parse.
    return np.int8(int(token.rstrip('+')))
# Normalise the two non-numeric labels, then convert every label to an integer.
# The result is assigned back to the frame instead of calling
# replace(inplace=True) on the selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update the frame itself.
for data in [data_train, data_test_a]:
    data['employmentLength'] = (
        data['employmentLength']
        .replace({'10+ years': '10 years', '< 1 year': '0 years'})
        .apply(employmentLength_to_int)
    )
data_train['employmentLength'].value_counts(dropna=False).sort_index()
0 64237
1 52489
2 72358
3 64152
4 47985
5 50102
6 37254
7 35407
8 36192
9 30272
10 309552
Name: employmentLength, dtype: int64
data_train['earliesCreditLine'].sample(5)
372611 Jul-2008
414399 Nov-1989
733310 Nov-2003
137 Jun-1996
28298 Dec-1987
Name: earliesCreditLine, dtype: object
# Keep only the four-digit year at the end of strings such as 'Aug-2001'.
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].str[-4:].astype('int64')
data_train['earliesCreditLine']
0 2001
1 2002
2 2006
3 1999
4 1977
...
799995 2011
799996 1989
799997 2002
799998 1994
799999 2002
Name: earliesCreditLine, Length: 800000, dtype: int64
1.4 类别特征处理
# A subset of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
# Print the number of distinct values of each feature in the training set.
# The frame is named explicitly: the bare name `data` would be whatever frame
# the most recent `for data in [...]` loop happened to leave behind.
for f in cate_features:
    print(f, '类型数:', data_train[f].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
policyCode 类型数: 1
data_train[cate_features]
grade | subGrade | employmentTitle | homeOwnership | verificationStatus | purpose | postCode | regionCode | applicationType | initialListStatus | title | policyCode | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | E | E2 | 320.0 | 2 | 2 | 1 | 137.0 | 32 | 0 | 0 | 1.0 | 1.0 |
1 | D | D2 | 219843.0 | 0 | 2 | 0 | 156.0 | 18 | 0 | 1 | 1723.0 | 1.0 |
2 | D | D3 | 31698.0 | 0 | 2 | 0 | 337.0 | 14 | 0 | 0 | 0.0 | 1.0 |
3 | A | A4 | 46854.0 | 1 | 1 | 4 | 148.0 | 11 | 0 | 1 | 4.0 | 1.0 |
4 | C | C2 | 54.0 | 1 | 2 | 10 | 301.0 | 21 | 0 | 0 | 11.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
799995 | C | C4 | 2659.0 | 1 | 0 | 0 | 242.0 | 8 | 0 | 1 | 0.0 | 1.0 |
799996 | A | A4 | 29205.0 | 0 | 2 | 4 | 563.0 | 10 | 0 | 0 | 33369.0 | 1.0 |
799997 | C | C3 | 2582.0 | 1 | 2 | 0 | 47.0 | 17 | 0 | 1 | 0.0 | 1.0 |
799998 | A | A4 | 151.0 | 0 | 2 | 4 | 34.0 | 18 | 0 | 1 | 4.0 | 1.0 |
799999 | B | B3 | 13.0 | 0 | 0 | 4 | 62.0 | 13 | 0 | 0 | 4.0 | 1.0 |
800000 rows × 12 columns
# Categorical features such as grade are ordered, so they can be label-encoded
# or mapped to integers by hand; here an explicit mapping is used.
grade_rank = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_rank)
data_train['grade']
0 5
1 4
2 4
3 1
4 3
..
799995 3
799996 1
799997 3
799998 1
799999 2
Name: grade, Length: 800000, dtype: int64
# Features with more than 2 categories that are not high-dimensional sparse and are purely nominal
# NOTE(review): this loop only rebinds the loop variable `data` to the frame
# returned by get_dummies; nothing is assigned to data_train / data_test_a here
# (data_train.subGrade is still displayed right after this cell).
# TODO confirm whether this loop is still needed.
for data in [data_train, data_test_a]:
data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
data_train.subGrade
0 E2
1 D2
2 D3
3 A4
4 C2
..
799995 C4
799996 A4
799997 C3
799998 A4
799999 B3
Name: subGrade, Length: 800000, dtype: object
# One-hot encode the nominal columns of both frames, dropping the first level of
# each, and rebind the frame names to the encoded results.
onehot_columns = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
data_train = pd.get_dummies(data_train, columns=onehot_columns, drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=onehot_columns, drop_first=True)
data_test_a.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'employmentTitle', 'employmentLength', 'annualIncome', 'issueDate',
...
'regionCode_41', 'regionCode_42', 'regionCode_43', 'regionCode_44',
'regionCode_45', 'regionCode_46', 'regionCode_47', 'regionCode_48',
'regionCode_49', 'regionCode_50'],
dtype='object', length=148)
2 异常值处理
2.1 检测异常的方法一:均方差
def find_outliers_by_3segama(data,fea):
    """Flag values of ``data[fea]`` lying more than 3 standard deviations from the mean.

    A new column named ``fea + '_outliers'`` is written to ``data`` in place,
    holding '异常值' for flagged rows and '正常值' for all others, and the same
    frame is returned. Missing values compare false against both bounds and
    are therefore labelled '正常值'.
    """
    column = data[fea]
    data_std = np.std(column)
    data_mean = np.mean(column)
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # One vectorised comparison instead of calling a Python lambda per row.
    is_outlier = (column > upper_rule) | (column < lower_rule)
    data[fea+'_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data
data_train = data_train.copy()
for fea in numerical_fea:
    # Columns that were one-hot encoded earlier (e.g. homeOwnership) no longer
    # exist under their original name; skip them instead of raising KeyError.
    if fea not in data_train.columns:
        continue
    data_train = find_outliers_by_3segama(data_train,fea)
    # How many rows were flagged, and how many defaults fall in each group.
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
正常值 800000
Name: id_outliers, dtype: int64
id_outliers
正常值 159610
Name: isDefault, dtype: int64
**********
正常值 800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值 159610
Name: isDefault, dtype: int64
**********
正常值 800000
Name: term_outliers, dtype: int64
term_outliers
正常值 159610
Name: isDefault, dtype: int64
**********
正常值 794259
异常值 5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值 2916
正常值 156694
Name: isDefault, dtype: int64
**********
正常值 792046
异常值 7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值 2152
正常值 157458
Name: isDefault, dtype: int64
**********
正常值 800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值 159610
Name: isDefault, dtype: int64
**********
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2890 try:
-> 2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'homeOwnership'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-72-ccb16448c879> in <module>
1 data_train = data_train.copy()
2 for fea in numerical_fea:
----> 3 data_train = find_outliers_by_3segama(data_train,fea)
4 print(data_train[fea+'_outliers'].value_counts())
5 print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
<ipython-input-71-4f149a870f4e> in find_outliers_by_3segama(data, fea)
1 def find_outliers_by_3segama(data,fea):
----> 2 data_std = np.std(data[fea])
3 data_mean = np.mean(data[fea])
4 outliers_cut_off = data_std * 3
5 lower_rule = data_mean - outliers_cut_off
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2891 return self._engine.get_loc(casted_key)
2892 except KeyError as err:
-> 2893 raise KeyError(key) from err
2894
2895 if tolerance is not None:
KeyError: 'homeOwnership'
2.2 检测异常的方法二:箱型图
3 数据分桶
分箱的基本原则:
- (1)最小分箱占比不低于 5%
- (2)箱内不能全部是好客户
- (3)连续箱单调
# Quantile (equal-frequency) binning: learn the decile edges on the training set
# only and reuse them for the test set, so a bin id means the same in both frames.
_, loan_edges = pd.qcut(data_train['loanAmnt'], 10, labels=False, retbins=True, duplicates='drop')
# Open the outer bins so the training minimum and any test value outside the
# training range still receive a bin id.
loan_edges[0], loan_edges[-1] = -np.inf, np.inf
# Apply every binning to both frames explicitly instead of to whatever frame the
# name `data` was left pointing at by an earlier loop.
for data in [data_train, data_test_a]:
    # Fixed-width binning: integer division maps loanAmnt to bins of width 1000.
    data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
    # Exponential-width binning: the bin is the floor of log10(loanAmnt).
    data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
    # Quantile binning using the training-set decile edges.
    data['loanAmnt_bin3'] = pd.cut(data['loanAmnt'], bins=loan_edges, labels=False)
File "<ipython-input-75-863fe9282f0d>", line 2
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
^
SyntaxError: invalid character in identifier
4 特征交互
# Target-mean encoding: map every level of the column to the mean of isDefault
# observed for that level in the training set, and apply the map to both frames.
for col in ['grade', 'subGrade']:
    target_mean_by_level = data_train.groupby(col)['isDefault'].mean().to_dict()
    encoded_name = col + '_target_mean'
    data_train[encoded_name] = data_train[col].map(target_mean_by_level)
    data_test_a[encoded_name] = data_test_a[col].map(target_mean_by_level)
# Other derived variables: grade divided by the group mean / group std of grade,
# where the groups are the distinct values of each n* column.
n_columns = ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']
for df in [data_train, data_test_a]:
    for item in n_columns:
        grade_by_item = df.groupby([item])['grade']
        df['grade_to_mean_' + item] = df['grade'] / grade_by_item.transform('mean')
        df['grade_to_std_' + item] = df['grade'] / grade_by_item.transform('std')
5 特征编码
5.1 labelEncode直接放入树模型中
# Label-encode the high-cardinality categorical columns.
# Each encoder is fitted on the string values of train and test together, so
# every label that occurs in either frame is known to it.
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    train_labels = data_train[col].astype(str).tolist()
    test_labels = data_test_a[col].astype(str).tolist()
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    data_train[col] = le.transform(train_labels)
    data_test_a[col] = le.transform(test_labels)
print('Label Encoding 完成')
5.2 逻辑回归等模型要单独增加的特征工程
# Example of min-max normalisation.
# Pseudo-code: 要归一化的特征列表 is a placeholder for the features to be scaled.
for fea in [要归一化的特征列表]:
    fea_min = np.min(data[fea])
    fea_max = np.max(data[fea])
    data[fea] = (data[fea] - fea_min) / (fea_max - fea_min)
6 特征选择
6.1 Filter
###### Variance threshold
from sklearn.feature_selection import VarianceThreshold
# threshold is the variance cut-off passed to the selector
VarianceThreshold(threshold=3).fit_transform(train,target_train)
########## Correlation coefficient
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# Select the K best features and return the data restricted to them.
# The first argument of SelectKBest is the scoring function: it receives the
# feature matrix and the target vector and returns (score, p-value) pairs,
# one per feature. The parameter k is the number of features to select.
# NOTE(review): no scoring function is passed on the next line, so the pearsonr
# import above is unused and the selector's default scorer applies - confirm intent.
SelectKBest(k=5).fit_transform(train,target_train)
########### Chi-squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# k is the number of features to select
SelectKBest(chi2, k=5).fit_transform(train,target_train)
######### Mutual-information style selection (MIC computed with minepy)
from sklearn.feature_selection import SelectKBest
from minepy import MINE
# MINE exposes an object-style API, so wrap it in a plain function.
# Returns a 2-tuple whose second item is a fixed placeholder p-value of 0.5.
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
# k is the number of features to select.
# The score array is built with np.array over a list comprehension: the bare name
# `array` was never imported, and `map` is lazy in Python 3, so wrapping it in an
# array would not yield the (n_features, 2) table that is transposed here.
SelectKBest(lambda X, Y: np.array([mic(x, Y) for x in X.T]).T, k=2).fit_transform(train,target_train)
6.2 Wrapper(Recursive feature elimination,RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination; returns the data restricted to the selected features.
# estimator is the base model used to rank the features.
# n_features_to_select is the number of features to keep.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train,target_train)
6.3 Embedded
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Feature selection with an L1-penalised logistic regression as the base model.
# The solver is set explicitly because the L1 penalty is only supported by some
# solvers; 'liblinear' is one that supports it.
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train,target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection with gradient-boosted trees (GBDT) as the base model.
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)
6.4 数据处理
###### For this dataset: drop the columns that are not model inputs, fill the
###### remaining missing values, then look at how each feature correlates with
###### the target before training a model.
# Drop the columns that are not needed
for data in [data_train, data_test_a]:
    data.drop(['issueDate','id'], axis=1,inplace=True)
# Forward fill: replace each missing value with the value from the row above.
# (ffill() replaces the deprecated fillna(method='ffill') spelling.)
data_train = data_train.ffill(axis=0)
# Feature frame without the target column; x_train is used below but was not
# defined anywhere before this point.
x_train = data_train.drop(columns=['isDefault'])
# Correlation (not covariance) of every feature column with the target
data_corr = x_train.corrwith(data_train.isDefault)
# Collect the result as a two-column table: feature name and its correlation
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values