task3:特征工程

目标

  1. 学习特征预处理、缺失值、异常值处理、数据分桶等特征处理方法
  2. 学习特征交互、编码、选择的相应方法
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import datetime 
from tqdm import tqdm 
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
from sklearn.preprocessing import MinMaxScaler 
import xgboost as xgb 
import lightgbm as lgb 
#from catboost import CatBoostRegressor 
import warnings 
from sklearn.model_selection import StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss 
warnings.filterwarnings('ignore')
import os
os.getcwd()
#pip install xgboost-1.1.0-cp37-cp37m-win_amd64.whl
Requirement already satisfied: xgboost==1.1.0 from file:///C:/Users/gnzha/%E5%B7%A5%E4%BD%9C/%E5%B7%A5%E4%BD%9C/bonc/python/%E5%AD%A6%E4%B9%A0%E8%B5%84%E6%96%99/202009datawhale%E8%B5%84%E6%96%99/xgboost-1.1.0-cp37-cp37m-win_amd64.whl
Note: you may need to restart the kernel to use updated packages.
#pip install lightgbm-2.3.2-cp37-cp37m-win_amd64.whl
Processing c:\users\gnzha\工作\工作\bonc\python\学习资料\202009datawhale资料\lightgbm-2.3.2-cp37-cp37m-win_amd64.whl
 Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.2
Note: you may need to restart the kernel to use updated packages.
##pip install catboost -i https://pypi.tuna.tsinghua.edu.cn/simple
pip list
pip show xgboost
Name: xgboost
Version: 1.1.0
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: None
Author-email: None
License: Apache-2.0
Location: 
Requires: numpy, scipy
Required-by: 
Note: you may need to restart the kernel to use updated packages.
data_train =pd.read_csv('./数据/train.csv') 
data_test_a = pd.read_csv('./数据/testA.csv')

1 数据预处理

# Split the training columns into numeric features and object-typed
# (categorical) features. The target column is not a feature.
label = 'isDefault'
numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
# Built while the target is still listed in numerical_fea, so the target
# does not end up in category_fea.
category_fea = [col for col in data_train.columns if col not in numerical_fea]
numerical_fea.remove(label)

1.1 缺失值的填充

data_train1=data_train.fillna(0)
data_train1.head()
idloanAmntterminterestRateinstallmentgradesubGradeemploymentTitleemploymentLengthhomeOwnership...n5n6n7n8n9n10n11n12n13n14
0035000.0519.52917.97EE2320.02 years2...9.08.04.012.02.07.00.00.00.02.0
1118000.0518.49461.90DD2219843.05 years0...0.00.00.00.00.013.00.00.00.00.0
2212000.0516.99298.17DD331698.08 years0...0.021.04.05.03.011.00.00.00.04.0
3311000.037.26340.96AA446854.010+ years1...16.04.07.021.06.09.00.00.00.01.0
443000.0312.99101.07CC254.001...4.09.010.015.07.012.00.00.00.04.0

5 rows × 47 columns

# Two alternative ways of filling missing values along axis 0:
# forward fill ('ffill') and backward fill ('bfill') with limit=2.
# NOTE: both results are computed from data_train and bound to the same
# name, so the second assignment replaces the first.
data_train2 = data_train.fillna(axis=0,method='ffill')
data_train2 = data_train.fillna(axis=0,method='bfill',limit=2)
data_train.isnull().sum()
id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n2.1                  40270
n4                    33239
n5                    40270
n6                    40270
n7                    40270
n8                    40271
n9                    40270
n10                   33239
n11                   69752
n12                   40270
n13                   40270
n14                   40270
dtype: int64
# Fill missing numeric features with the training-set median; the test set
# reuses the training statistic.
train_median = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_median)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(train_median)
# Fill missing categorical features with the training-set mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing that
# DataFrame to fillna() aligns on the row index, so only row 0 could ever be
# filled. Taking the first row gives a Series keyed by column name, which
# fills every missing cell of the matching column.
train_mode = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(train_mode)
data_test_a[category_fea] = data_test_a[category_fea].fillna(train_mode)
data_train[category_fea].mode()
gradesubGradeemploymentLengthissueDateearliesCreditLine
0BC110+ years2016-03-01Aug-2001
data_train.isnull().sum()
id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n2.1                      0
n4                        0
n5                        0
n6                        0
n7                        0
n8                        0
n9                        0
n10                       0
n11                       0
n12                       0
n13                       0
n14                       0
dtype: int64
data_train.employmentLength.head()
0      2 years
1      5 years
2      8 years
3    10+ years
4          NaN
Name: employmentLength, dtype: object
data_train.employmentLength=data_train.employmentLength.fillna('10+ years')
data_train.isnull().any().sum()
0
data_train.employmentLength
0           2 years
1           5 years
2           8 years
3         10+ years
4         10+ years
            ...    
799995      7 years
799996    10+ years
799997    10+ years
799998    10+ years
799999      5 years
Name: employmentLength, Length: 800000, dtype: object
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

1.2 时间格式处理

# Parse issueDate as a datetime and derive issueDateDT: the number of days
# between the issue date and a fixed reference date.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Subtract the reference date from the whole column at once.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDateDT'],
      dtype='object')
data_train.head()
idloanAmntterminterestRateinstallmentgradesubGradeemploymentTitleemploymentLengthhomeOwnership...n6n7n8n9n10n11n12n13n14issueDateDT
0035000.0519.52917.97EE2320.02 years2...8.04.012.02.07.00.00.00.02.02587
1118000.0518.49461.90DD2219843.05 years0...7.07.013.05.013.00.00.00.02.01888
2212000.0516.99298.17DD331698.08 years0...21.04.05.03.011.00.00.00.04.03044
3311000.037.26340.96AA446854.010+ years1...4.07.021.06.09.00.00.00.01.02983
443000.0312.99101.07CC254.010+ years1...9.010.015.07.012.00.00.00.04.03196

5 rows × 48 columns

data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year        52489
10+ years    309552
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
Name: employmentLength, dtype: int64

1.3 对象类型特征转换到数值

def employmentLength_to_int(s):
    """Convert a string such as '5 years' to its leading number as np.int8.

    Missing values (anything pd.isnull() reports as null) are returned
    unchanged.
    """
    if not pd.isnull(s):
        # The number is the first whitespace-separated token.
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s
# Normalise the two non-numeric spellings of employmentLength, then convert
# every value to its leading integer.
for data in [data_train, data_test_a]:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # data['employmentLength']: in-place mutation of a selected column is a
    # chained assignment, which does not update the parent DataFrame when
    # pandas Copy-on-Write is active.
    data['employmentLength'] = data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}
    )
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data_train['employmentLength'].value_counts(dropna=False).sort_index()
0      64237
1      52489
2      72358
3      64152
4      47985
5      50102
6      37254
7      35407
8      36192
9      30272
10    309552
Name: employmentLength, dtype: int64
data_train['earliesCreditLine'].sample(5)
372611    Jul-2008
414399    Nov-1989
733310    Nov-2003
137       Jun-1996
28298     Dec-1987
Name: earliesCreditLine, dtype: object
# Keep only the year of earliesCreditLine: the last four characters of
# strings such as 'Aug-2001', converted to int.
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda s: int(s[-4:]))
data_train['earliesCreditLine']
0         2001
1         2002
2         2006
3         1999
4         1977
          ... 
799995    2011
799996    1989
799997    2002
799998    1994
799999    2002
Name: earliesCreditLine, Length: 800000, dtype: int64

1.4 类别特征处理

# A subset of the categorical features.
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 
                 'applicationType', 'initialListStatus', 'title', 'policyCode'] 
# Print the number of distinct values of each listed feature.
# NOTE(review): `data` is not assigned here; it is whatever the most recent
# `for data in [data_train, data_test_a]` loop left bound (presumably
# data_test_a). Confirm that this is the intended frame.
for f in cate_features:    
    print(f, '类型数:', data[f].nunique())
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 79282
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 889
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 12058
policyCode 类型数: 1
data_train[cate_features]
gradesubGradeemploymentTitlehomeOwnershipverificationStatuspurposepostCoderegionCodeapplicationTypeinitialListStatustitlepolicyCode
0EE2320.0221137.032001.01.0
1DD2219843.0020156.018011723.01.0
2DD331698.0020337.014000.01.0
3AA446854.0114148.011014.01.0
4CC254.01210301.0210011.01.0
.......................................
799995CC42659.0100242.08010.01.0
799996AA429205.0024563.0100033369.01.0
799997CC32582.012047.017010.01.0
799998AA4151.002434.018014.01.0
799999BB313.000462.013004.01.0

800000 rows × 12 columns

# Ordered categories such as grade can be label-encoded or mapped by hand;
# here 'A'..'G' are mapped to 1..7.
grade_order = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_order)
data_train['grade']
0         5
1         4
2         4
3         1
4         3
         ..
799995    3
799996    1
799997    3
799998    1
799999    2
Name: grade, Length: 800000, dtype: int64
# Purely nominal features with more than two levels that are not
# high-dimensional/sparse: one-hot encode them.
# NOTE(review): the assignment below only rebinds the loop variable `data`;
# data_train and data_test_a themselves are not replaced by this loop.
for data in [data_train, data_test_a]:    
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
data_train.subGrade
0         E2
1         D2
2         D3
3         A4
4         C2
          ..
799995    C4
799996    A4
799997    C3
799998    A4
799999    B3
Name: subGrade, Length: 800000, dtype: object
# One-hot encode the same nominal columns in both frames, dropping the first
# level of each encoded column.
one_hot_columns = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
data_train = pd.get_dummies(data_train, columns=one_hot_columns, drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=one_hot_columns, drop_first=True)
data_test_a.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'employmentTitle', 'employmentLength', 'annualIncome', 'issueDate',
       ...
       'regionCode_41', 'regionCode_42', 'regionCode_43', 'regionCode_44',
       'regionCode_45', 'regionCode_46', 'regionCode_47', 'regionCode_48',
       'regionCode_49', 'regionCode_50'],
      dtype='object', length=148)

2 异常值处理

2.1 检测异常的方法一:均方差

def find_outliers_by_3segama(data,fea):
    """Label values of ``data[fea]`` that lie outside mean +/- 3 * std.

    A new column named ``fea + '_outliers'`` is written into ``data`` itself.
    It holds '异常值' for values strictly above the upper bound or strictly
    below the lower bound, and '正常值' otherwise. The same DataFrame object
    is returned.
    """
    column = data[fea]
    center = np.mean(column)
    spread = np.std(column) * 3
    # Comparisons involving NaN are False, so missing values are labelled
    # '正常值'.
    is_outlier = (column > center + spread) | (column < center - spread)
    data[fea + '_outliers'] = is_outlier.map({True: '异常值', False: '正常值'})
    return data
data_train = data_train.copy()
# Flag 3-sigma outliers for every numeric feature and print, per feature,
# the label counts and the number of defaults in each label group.
for fea in numerical_fea:
    # numerical_fea was collected before the one-hot encoding step, so some
    # of its columns (e.g. 'homeOwnership') no longer exist in data_train.
    # Skip them instead of failing with KeyError.
    if fea not in data_train.columns:
        continue
    data_train = find_outliers_by_3segama(data_train, fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********



---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2890             try:
-> 2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'homeOwnership'


The above exception was the direct cause of the following exception:


KeyError                                  Traceback (most recent call last)

<ipython-input-72-ccb16448c879> in <module>
      1 data_train = data_train.copy()
      2 for fea in numerical_fea:
----> 3     data_train = find_outliers_by_3segama(data_train,fea)
      4     print(data_train[fea+'_outliers'].value_counts())
      5     print(data_train.groupby(fea+'_outliers')['isDefault'].sum())


<ipython-input-71-4f149a870f4e> in find_outliers_by_3segama(data, fea)
      1 def find_outliers_by_3segama(data,fea):
----> 2     data_std = np.std(data[fea])
      3     data_mean = np.mean(data[fea])
      4     outliers_cut_off = data_std * 3
      5     lower_rule = data_mean - outliers_cut_off


~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]


~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:
-> 2893                 raise KeyError(key) from err
   2894 
   2895         if tolerance is not None:


KeyError: 'homeOwnership'

2.2 检测异常的方法二:箱型图

3 数据分桶

分箱的基本原则:

  • a. (1)最小分箱占比不低于5%
  • b. (2)箱内不能全部是好客户
  • c. (3)连续箱单调
# NOTE(review): `data` is whatever the most recent
# `for data in [data_train, data_test_a]` loop left bound; confirm that this
# is the frame that should receive the bin columns.
# Fixed-width binning: floor division maps loanAmnt onto evenly spaced bins,
# the bin value being loanAmnt // 1000.
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
# Exponential-width binning: floor of the base-10 logarithm.
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Quantile binning into 10 bins; labels=False returns integer bin indices.
# (The closing parenthesis was missing, which is a syntax error.)
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
  File "<ipython-input-75-863fe9282f0d>", line 2
    data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
                                                                      ^
SyntaxError: invalid character in identifier

4 特征交互

# Target-mean encoding: for each level of the column, compute the mean of
# isDefault on the training set and map it onto both frames.
for col in ['grade', 'subGrade']:
    temp_dict = data_train.groupby(col)['isDefault'].mean().to_dict()
    encoded_name = col + '_target_mean'
    data_train[encoded_name] = data_train[col].map(temp_dict)
    data_test_a[encoded_name] = data_test_a[col].map(temp_dict)
# Other derived features: grade divided by the mean / std of grade within
# each group of rows sharing the same value of an n* column.
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        grade_by_item = df.groupby([item])['grade']
        group_mean = grade_by_item.transform('mean')
        group_std = grade_by_item.transform('std')
        df['grade_to_mean_' + item] = df['grade'] / group_mean
        df['grade_to_std_' + item] = df['grade'] / group_std

5 特征编码

5.1 labelEncode直接放入树模型中

# Label-encode the high-cardinality categorical features. Each encoder is
# fitted on the string values of train and test together, so both frames
# share one mapping.
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    train_values = list(data_train[col].astype(str).values)
    test_values = list(data_test_a[col].astype(str).values)
    le = LabelEncoder()
    le.fit(train_values + test_values)
    data_train[col] = le.transform(train_values)
    data_test_a[col] = le.transform(test_values)
    print('Label Encoding 完成')

5.2 逻辑回归等模型要单独增加的特征工程

# Example of min-max normalisation.
# Pseudo-code: the name in the list below is a placeholder for the features
# to normalise and must be replaced before running.
for fea in [要归一化的特征列表]:
    # The loop body must be indented; without indentation this is a syntax
    # error.
    data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))

6 特征选择

6.1 Filter

###### Variance threshold selection
from sklearn.feature_selection import VarianceThreshold 
# The `threshold` parameter is the variance threshold.
# NOTE(review): `train` and `target_train` are not defined in the visible
# code; they must be supplied before this line can run.
VarianceThreshold(threshold=3).fit_transform(train,target_train)
########## Correlation coefficient selection
from sklearn.feature_selection import SelectKBest 
from scipy.stats import pearsonr 


def pearson_score(X, y):
    """Score every column of X by its Pearson correlation with y.

    Returns a tuple (scores, p_values) of arrays whose i-th entries belong to
    the i-th feature. The absolute value of the coefficient is used as the
    score, so strongly negative correlations also rank high.
    """
    results = [pearsonr(column, y) for column in np.asarray(X).T]
    scores = np.abs(np.array([r for r, _ in results]))
    p_values = np.array([p for _, p in results])
    return scores, p_values


# Select the k best features and return the reduced data.
# The first argument is the scoring function: it takes the feature matrix
# and the target vector and returns (scores, p-values). It was previously
# omitted, so SelectKBest used its default score function instead of the
# correlation coefficient.
# The k parameter is the number of features to keep.
SelectKBest(pearson_score, k=5).fit_transform(train,target_train)
########### Chi-squared test selection
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
# The k parameter is the number of features to keep.
SelectKBest(chi2, k=5).fit_transform(train,target_train)
######### Mutual information (MIC) selection
from sklearn.feature_selection import SelectKBest 
from minepy import MINE 


def mic(x, y):
    """Return (MIC of x and y, 0.5) using a fresh MINE instance.

    MINE does not have a functional interface, so this wraps it in one. The
    second element is a fixed placeholder p-value of 0.5.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)


def mic_scores(X, Y):
    """Apply mic() to every column of X; return (scores, p_values) arrays."""
    # Build a real list: `array` was never imported, and in Python 3 `map`
    # returns a lazy iterator, so array(map(...)).T did not produce the
    # per-feature scores.
    pairs = [mic(column, Y) for column in np.asarray(X).T]
    scores = np.array([score for score, _ in pairs])
    p_values = np.array([p_value for _, p_value in pairs])
    return scores, p_values


# The k parameter is the number of features to keep.
SelectKBest(mic_scores, k=2).fit_transform(train,target_train)

6.2 Wrapper(Recursive feature elimination,RFE)

from sklearn.feature_selection import RFE 
from sklearn.linear_model import LogisticRegression 
# Recursive feature elimination; returns the data after feature selection.
# The `estimator` parameter is the base model.
# The `n_features_to_select` parameter is the number of features to keep.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train,target_train)

6.3 Embedded

from sklearn.feature_selection import SelectFromModel 
from sklearn.linear_model import LogisticRegression 
# Feature selection with an L1-penalised logistic regression as base model.
# The solver is set explicitly because the default 'lbfgs' solver does not
# support the L1 penalty; 'liblinear' does.
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train,target_train)
from sklearn.feature_selection import SelectFromModel 
from sklearn.ensemble import GradientBoostingClassifier 
# Feature selection with a gradient-boosted tree classifier as base model.
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)

6.4 数据处理

###### For this dataset: drop the columns that are not model inputs, fill
###### the remaining missing values, then inspect how each feature correlates
###### with the target before training a model.
# Drop the columns that are not needed.
for data in [data_train, data_test_a]:
    data.drop(['issueDate','id'], axis=1,inplace=True)
# Forward fill down each column: a missing value is replaced by the value
# above it. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
data_train = data_train.ffill()
# x_train was used below without ever being defined. Build it from the
# training features (everything except the target), keeping numeric and
# boolean columns only so that a correlation can be computed.
x_train = data_train.drop(columns=['isDefault']).select_dtypes(include=['number', 'bool'])
# Correlation of every feature with the target (corrwith computes a
# correlation coefficient, not a covariance).
data_corr = x_train.corrwith(data_train.isDefault)
# Collect the result into a two-column table.
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值