task3：特征工程

最新推荐文章于 2023-11-25 17:43:59 发布

老三1987

最新推荐文章于 2023-11-25 17:43:59 发布

阅读量1.2k

点赞数 1

分类专栏：机器学习分类算法　文章标签：python

本文链接：https://blog.csdn.net/jona1987/article/details/108721947

版权

机器学习分类算法专栏收录该内容

13 篇文章 0 订阅

订阅专栏

目标

学习特征预处理、缺失值、异常值处理、数据分桶等特征处理方法
学习特征交互、编码、选择的相应方法

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import datetime 
from tqdm import tqdm 
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
from sklearn.preprocessing import MinMaxScaler 
import xgboost as xgb 
import lightgbm as lgb 
#from catboost import CatBoostRegressor 
import warnings 
from sklearn.model_selection import StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss 
warnings.filterwarnings('ignore')

import os
os.getcwd()

#pip install xgboost-1.1.0-cp37-cp37m-win_amd64.whl

Requirement already satisfied: xgboost==1.1.0 from file:///C:/Users/gnzha/%E5%B7%A5%E4%BD%9C/%E5%B7%A5%E4%BD%9C/bonc/python/%E5%AD%A6%E4%B9%A0%E8%B5%84%E6%96%99/202009datawhale%E8%B5%84%E6%96%99/xgboost-1.1.0-cp37-cp37m-win_amd64.whl
Note: you may need to restart the kernel to use updated packages.

#pip install lightgbm-2.3.2-cp37-cp37m-win_amd64.whl

Processing c:\users\gnzha\工作\工作\bonc\python\学习资料\202009datawhale资料\lightgbm-2.3.2-cp37-cp37m-win_amd64.whl
 Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.2
Note: you may need to restart the kernel to use updated packages.

##pip install catboost -i https://pypi.tuna.tsinghua.edu.cn/simple

pip list

pip show xgboost

Name: xgboost
Version: 1.1.0
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: None
Author-email: None
License: Apache-2.0
Location: 
Requires: numpy, scipy
Required-by: 
Note: you may need to restart the kernel to use updated packages.

# Load the training set and the "A" test set from CSV files in the local ./数据 directory.
data_train =pd.read_csv('./数据/train.csv') 
data_test_a = pd.read_csv('./数据/testA.csv')

1 数据预处理

# Split the columns into numeric features and object-typed (categorical) features.
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = [col for col in data_train.columns if col not in numerical_fea]
# The label is numeric too, so take it out of the numeric feature list.
label = 'isDefault'
numerical_fea.remove(label)

1.1 缺失值的填充

# Variant 1: replace every missing value with the constant 0 (result kept in a separate frame).
data_train1 = data_train.fillna(value=0)

data_train1.head()

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n5	n6	n7	n8	n9	n10	n14
0	0	35000.0	5	19.52	917.97	E	E2	320.0	2 years	2	...	9.0	8.0	4.0	12.0	2.0	7.0	2.0
1	1	18000.0	5	18.49	461.90	D	D2	219843.0	5 years	0	...	0.0	0.0	0.0	0.0	0.0	13.0	0.0
2	2	12000.0	5	16.99	298.17	D	D3	31698.0	8 years	0	...	0.0	21.0	4.0	5.0	3.0	11.0	4.0
3	3	11000.0	3	7.26	340.96	A	A4	46854.0	10+ years	1	...	16.0	4.0	7.0	21.0	6.0	9.0	1.0
4	4	3000.0	3	12.99	101.07	C	C2	54.0	0	1	...	4.0	9.0	10.0	15.0	7.0	12.0	4.0

5 rows × 47 columns

# Variant 2: forward-fill down each column (a gap takes the value from the row above).
# ffill()/bfill() are used instead of fillna(method=...), whose `method` keyword is
# deprecated in recent pandas releases.
data_train2 = data_train.ffill(axis=0)

# Variant 3: backward-fill down each column, filling at most 2 consecutive gaps.
# NOTE: this overwrites the forward-filled frame assigned just above.
data_train2 = data_train.bfill(axis=0, limit=2)

data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n2.1                  40270
n4                    33239
n5                    40270
n6                    40270
n7                    40270
n8                    40271
n9                    40270
n10                   33239
n11                   69752
n12                   40270
n13                   40270
n14                   40270
dtype: int64

# Fill numeric features with the training-set median. The statistic is computed once on
# the training data and reused for the test set so both frames are imputed identically.
numerical_fill = data_train[numerical_fea].median()
data_train[numerical_fea] = data_train[numerical_fea].fillna(numerical_fill)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(numerical_fill)
# Fill categorical features with the training-set mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing that frame straight
# to fillna() aligns on the row index as well as the columns, so only rows whose index
# matches a row of the mode frame (normally just row 0) get filled. Taking the first row
# gives a Series keyed by column name, which fillna() applies to every row.
category_fill = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(category_fill)
data_test_a[category_fea] = data_test_a[category_fea].fillna(category_fill)

data_train[category_fea].mode()

	grade	subGrade	employmentLength	issueDate	earliesCreditLine
0	B	C1	10+ years	2016-03-01	Aug-2001

data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n2.1                      0
n4                        0
n5                        0
n6                        0
n7                        0
n8                        0
n9                        0
n10                       0
n11                       0
n12                       0
n13                       0
n14                       0
dtype: int64

data_train.employmentLength.head()

0      2 years
1      5 years
2      8 years
3    10+ years
4          NaN
Name: employmentLength, dtype: object

# Fill the remaining gaps in employmentLength with the literal '10+ years'.
# NOTE(review): only the training frame is filled here; confirm whether the test frame should be too.
data_train['employmentLength'] = data_train['employmentLength'].fillna('10+ years')

data_train.isnull().any().sum()

data_train.employmentLength

0           2 years
1           5 years
2           8 years
3         10+ years
4         10+ years
            ...    
799995      7 years
799996    10+ years
799997    10+ years
799998    10+ years
799999      5 years
Name: employmentLength, Length: 800000, dtype: object

category_fea

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

1.2 时间格式处理

# Parse issueDate into datetimes and derive the number of days since a fixed reference date.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in (data_train, data_test_a):
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Derived time feature: whole days between the reference date and the issue date.
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days

data_train.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDateDT'],
      dtype='object')

data_train.head()

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n6	n7	n8	n9	n10	n14	issueDateDT
0	0	35000.0	5	19.52	917.97	E	E2	320.0	2 years	2	...	8.0	4.0	12.0	2.0	7.0	2.0	2587
1	1	18000.0	5	18.49	461.90	D	D2	219843.0	5 years	0	...	7.0	7.0	13.0	5.0	13.0	2.0	1888
2	2	12000.0	5	16.99	298.17	D	D3	31698.0	8 years	0	...	21.0	4.0	5.0	3.0	11.0	4.0	3044
3	3	11000.0	3	7.26	340.96	A	A4	46854.0	10+ years	1	...	4.0	7.0	21.0	6.0	9.0	1.0	2983
4	4	3000.0	3	12.99	101.07	C	C2	54.0	10+ years	1	...	9.0	10.0	15.0	7.0	12.0	4.0	3196

5 rows × 48 columns

data_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    309552
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
Name: employmentLength, dtype: int64

1.3 对象类型特征转换到数值

def employmentLength_to_int(s):
    """Convert a string such as '3 years' to its leading number as ``np.int8``.

    Missing values (anything ``pd.isnull`` reports as null) are returned unchanged.
    """
    if not pd.isnull(s):
        # The number is the first whitespace-separated token of the string.
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s
for data in [data_train, data_test_a]:
    # Normalise the two non-numeric spellings, then keep only the leading integer.
    # The result is assigned back to the column: calling replace(..., inplace=True) on the
    # Series returned by data[...] is chained assignment, which pandas warns about and
    # which does not modify the frame when copy-on-write is enabled.
    employment_length = data['employmentLength'].replace(
        {'10+ years': '10 years', '< 1 year': '0 years'}
    )
    data['employmentLength'] = employment_length.apply(employmentLength_to_int)

data_train['employmentLength'].value_counts(dropna=False).sort_index()

0      64237
1      52489
2      72358
3      64152
4      47985
5      50102
6      37254
7      35407
8      36192
9      30272
10    309552
Name: employmentLength, dtype: int64

data_train['earliesCreditLine'].sample(5)

372611    Jul-2008
414399    Nov-1989
733310    Nov-2003
137       Jun-1996
28298     Dec-1987
Name: earliesCreditLine, dtype: object

# Keep only the 4-digit year at the end of strings such as 'Aug-2001'.
for data in (data_train, data_test_a):
    credit_line = data['earliesCreditLine']
    data['earliesCreditLine'] = credit_line.map(lambda s: int(s[-4:]))

data_train['earliesCreditLine']

0         2001
1         2002
2         2006
3         1999
4         1977
          ... 
799995    2011
799996    1989
799997    2002
799998    1994
799999    2002
Name: earliesCreditLine, Length: 800000, dtype: int64

1.4 类别特征处理

# A subset of the categorical features.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose',
    'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# Print the number of distinct values of each feature.
# NOTE(review): `data` is not assigned in this cell; it is whatever frame the last
# `for data in ...` loop left bound - confirm that is the intended frame.
for f in cate_features:
    n_unique = data[f].nunique()
    print(f, '类型数：', n_unique)

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 79282
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 889
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 12058
policyCode 类型数： 1

data_train[cate_features]

	grade	subGrade	employmentTitle	homeOwnership	verificationStatus	purpose	postCode	regionCode	applicationType	initialListStatus	title	policyCode
0	E	E2	320.0	2	2	1	137.0	32	0	0	1.0	1.0
1	D	D2	219843.0	0	2	0	156.0	18	0	1	1723.0	1.0
2	D	D3	31698.0	0	2	0	337.0	14	0	0	0.0	1.0
3	A	A4	46854.0	1	1	4	148.0	11	0	1	4.0	1.0
4	C	C2	54.0	1	2	10	301.0	21	0	0	11.0	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...
799995	C	C4	2659.0	1	0	0	242.0	8	0	1	0.0	1.0
799996	A	A4	29205.0	0	2	4	563.0	10	0	0	33369.0	1.0
799997	C	C3	2582.0	1	2	0	47.0	17	0	1	0.0	1.0
799998	A	A4	151.0	0	2	4	34.0	18	0	1	4.0	1.0
799999	B	B3	13.0	0	0	4	62.0	13	0	0	4.0	1.0

800000 rows × 12 columns

# Category features such as grade carry an ordering, so they can be label-encoded
# or mapped by hand; here each letter is mapped to an ordered integer.
grade_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
for data in (data_train, data_test_a):
    data['grade'] = data['grade'].map(grade_codes)

data_train['grade']

0         5
1         4
2         4
3         1
4         3
         ..
799995    3
799996    1
799997    3
799998    1
799999    2
Name: grade, Length: 800000, dtype: int64

# Features with more than 2 categories that are not high-dimensional sparse and are purely categorical.
# NOTE: pd.get_dummies returns a new frame, and the result is only bound to the loop
# variable `data`; data_train and data_test_a themselves are not modified by this loop.
for data in [data_train, data_test_a]:    
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

data_train.subGrade

0         E2
1         D2
2         D3
3         A4
4         C2
          ..
799995    C4
799996    A4
799997    C3
799998    A4
799999    B3
Name: subGrade, Length: 800000, dtype: object

# One-hot encode the listed categorical columns, dropping the first level of each.
# NOTE(review): train and test are encoded independently, so their dummy columns can
# differ if a category is absent from one of them - confirm downstream code aligns them.
one_hot_columns = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
data_train = pd.get_dummies(data_train, columns=one_hot_columns, drop_first=True)

data_test_a = pd.get_dummies(data_test_a, columns=one_hot_columns, drop_first=True)

data_test_a.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'employmentTitle', 'employmentLength', 'annualIncome', 'issueDate',
       ...
       'regionCode_41', 'regionCode_42', 'regionCode_43', 'regionCode_44',
       'regionCode_45', 'regionCode_46', 'regionCode_47', 'regionCode_48',
       'regionCode_49', 'regionCode_50'],
      dtype='object', length=148)

2 异常值处理

2.1 检测异常的方法一：均方差

def find_outliers_by_3segama(data, fea):
    """Flag values of one column that lie more than 3 standard deviations from its mean.

    Adds a column named ``fea + '_outliers'`` to ``data`` holding '异常值' (outlier) for
    values strictly above ``mean + 3 * std`` or strictly below ``mean - 3 * std``, and
    '正常值' (normal) otherwise. Missing values compare false on both sides and are
    therefore labelled '正常值'.

    Args:
        data: DataFrame containing the column ``fea``; it is modified in place.
        fea: Name of the numeric column to examine.

    Returns:
        The same ``data`` object, with the label column added.

    Raises:
        KeyError: If ``fea`` is not a column of ``data``.
    """
    values = data[fea]
    data_std = np.std(values)
    data_mean = np.mean(values)
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # One vectorised comparison over the whole column instead of a Python lambda per row.
    is_outlier = (values > upper_rule) | (values < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')
    return data

data_train = data_train.copy()
for fea in numerical_fea:
    # numerical_fea was built before the one-hot encoding step, so it can still name
    # columns (e.g. 'homeOwnership') that get_dummies has since replaced. Indexing such
    # a column raises KeyError, so skip any feature that is no longer present.
    if fea not in data_train.columns:
        continue
    data_train = find_outliers_by_3segama(data_train, fea)
    outlier_col = fea + '_outliers'
    # Number of rows per label, then the number of defaults per label.
    print(data_train[outlier_col].value_counts())
    print(data_train.groupby(outlier_col)['isDefault'].sum())
    print('*'*10)

正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********



---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2890             try:
-> 2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'homeOwnership'


The above exception was the direct cause of the following exception:


KeyError                                  Traceback (most recent call last)

<ipython-input-72-ccb16448c879> in <module>
      1 data_train = data_train.copy()
      2 for fea in numerical_fea:
----> 3     data_train = find_outliers_by_3segama(data_train,fea)
      4     print(data_train[fea+'_outliers'].value_counts())
      5     print(data_train.groupby(fea+'_outliers')['isDefault'].sum())


<ipython-input-71-4f149a870f4e> in find_outliers_by_3segama(data, fea)
      1 def find_outliers_by_3segama(data,fea):
----> 2     data_std = np.std(data[fea])
      3     data_mean = np.mean(data[fea])
      4     outliers_cut_off = data_std * 3
      5     lower_rule = data_mean - outliers_cut_off


~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]


~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:
-> 2893                 raise KeyError(key) from err
   2894 
   2895         if tolerance is not None:


KeyError: 'homeOwnership'

2.2 检测异常的方法二：箱型图

3 数据分桶

分箱的基本原则：

（1）最小分箱占比不低于 5%
（2）箱内不能全部是好客户
（3）连续箱单调

# Fixed-width binning:
# floor division maps the values into evenly spaced bins, each spanning 1000 units of loanAmnt.
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

# Exponential-width binning via the base-10 logarithm.
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

# Quantile binning into 10 bins (labels=False yields integer bin indices).
# The closing bracket used to be a full-width '）', which Python rejects with a SyntaxError.
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)

  File "<ipython-input-75-863fe9282f0d>", line 2
    data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False）
                                                                      ^
SyntaxError: invalid character in identifier

4 特征交互

# Target (mean) encoding: replace each category with the mean of isDefault observed for
# that category in the training set, and apply the same mapping to the test set.
for col in ['grade', 'subGrade']:
    target_mean_col = col + '_target_mean'
    temp_dict = data_train.groupby(col)['isDefault'].mean().to_dict()
    data_train[target_mean_col] = data_train[col].map(temp_dict)
    data_test_a[target_mean_col] = data_test_a[col].map(temp_dict)

# Other derived features: grade divided by the mean / std of grade within each group
# formed by the values of one of the n* columns.
n_features = ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']
for df in (data_train, data_test_a):
    for item in n_features:
        grade_by_item = df.groupby([item])['grade']
        group_mean = grade_by_item.transform('mean')
        group_std = grade_by_item.transform('std')
        df['grade_to_mean_' + item] = df['grade'] / group_mean
        df['grade_to_std_' + item] = df['grade'] / group_std

5 特征编码

5.1 labelEncode直接放入树模型中

# Label-encode the high-cardinality categorical columns.
# Each encoder is fitted on the train and test values together so both frames share one mapping.
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    train_values = list(data_train[col].astype(str).values)
    test_values = list(data_test_a[col].astype(str).values)
    le = LabelEncoder().fit(train_values + test_values)
    data_train[col] = le.transform(train_values)
    data_test_a[col] = le.transform(test_values)
    print('Label Encoding 完成')

5.2 逻辑回归等模型要单独增加的特征工程

# Min-max normalisation example. This was pseudo-code (full-width colon, unindented loop
# body, undefined placeholder); it is now valid Python with an explicit placeholder list.
features_to_normalize = []  # placeholder: names of the feature columns to normalise
for fea in features_to_normalize:
    fea_min = np.min(data[fea])
    fea_max = np.max(data[fea])
    # Rescale to [0, 1]; note that a constant column (max == min) divides by zero.
    data[fea] = (data[fea] - fea_min) / (fea_max - fea_min)

6 特征选择

6.1 Filter

######方差选择法
from sklearn.feature_selection import VarianceThreshold 
# `threshold` is the variance cut-off given to the selector.
variance_selector = VarianceThreshold(threshold=3)
variance_selector.fit_transform(train, target_train)

##########相关系数法
from sklearn.feature_selection import SelectKBest 
from scipy.stats import pearsonr 
# SelectKBest keeps the k best-scoring features and returns the reduced data.
# Its first argument is the scoring function: it receives the feature matrix and the
# target vector and returns a pair of arrays (scores, p-values), one entry per feature.
# Here the score is the Pearson correlation between each feature and the target; when no
# scoring function is passed, SelectKBest uses its default instead, so it is given explicitly.
# k is the number of features to keep.
def _pearson_scores(X, y):
    """Score every column of X by the absolute Pearson correlation with y."""
    X = np.asarray(X)
    scores = []
    pvalues = []
    for j in range(X.shape[1]):
        r, p = pearsonr(X[:, j], y)
        # Rank by magnitude so that strongly negative correlations are kept too.
        scores.append(abs(r))
        pvalues.append(p)
    return np.array(scores), np.array(pvalues)


SelectKBest(_pearson_scores, k=5).fit_transform(train,target_train)

###########卡方检验
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
# k is the number of features to keep; chi2 is the scoring function.
chi2_selector = SelectKBest(score_func=chi2, k=5)
chi2_selector.fit_transform(train, target_train)

#########互信息法
from sklearn.feature_selection import SelectKBest 
from minepy import MINE 
# MINE is not written in a functional style, so mic() wraps it as a plain function
# that returns a pair whose second item is a fixed p-value of 0.5.
def mic(x, y):
    """Return ``(MIC of x and y, 0.5)``; the 0.5 is a fixed placeholder p-value."""
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)


def _mic_scores(X, Y):
    """Apply mic() to every column of X and return (scores, p-values) as two arrays."""
    # A list comprehension replaces array(map(...)): `array` was never imported, and in
    # Python 3 map() returns an iterator that does not convert into a 2-D array.
    scores, pvalues = zip(*[mic(column, Y) for column in np.asarray(X).T])
    return np.array(scores), np.array(pvalues)


# k is the number of features to keep.
SelectKBest(_mic_scores, k=2).fit_transform(train,target_train)

6.2 Wrapper（Recursive feature elimination，RFE）

from sklearn.feature_selection import RFE 
from sklearn.linear_model import LogisticRegression 
# Recursive feature elimination; returns the data restricted to the selected features.
# estimator: the base model.
# n_features_to_select: the number of features to keep.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=2)
rfe_selector.fit_transform(train, target_train)

6.3 Embedded

from sklearn.feature_selection import SelectFromModel 
from sklearn.linear_model import LogisticRegression 
# Feature selection with an L1-penalised logistic regression as the base model.
# solver="liblinear" is set explicitly because the default solver ("lbfgs") does not
# support the L1 penalty and raises an error at fit time.
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train,target_train)

from sklearn.feature_selection import SelectFromModel 
from sklearn.ensemble import GradientBoostingClassifier 
# Feature selection with a gradient-boosted decision tree classifier as the base model.
gbdt_selector = SelectFromModel(GradientBoostingClassifier())
gbdt_selector.fit_transform(train, target_train)

6.4 数据处理

# For this data set: drop the columns that do not go into the model, fill the missing
# values, look at how the features correlate, and then train the model.
# Drop the columns that are not needed.
for data in [data_train, data_test_a]:
    data.drop(['issueDate', 'id'], axis=1, inplace=True)

# Fill each missing value with the value directly above it in the same column.
# (A bare string literal used to stand in for this comment, and ffill() replaces
# fillna(method='ffill'), whose `method` keyword is deprecated in recent pandas.)
data_train = data_train.ffill(axis=0)

# Correlation of every feature with the target (corrwith computes correlation, not covariance).
# x_train is not defined anywhere in the visible part of this notebook, so it is built
# here as the training frame without the label column.
x_train = data_train.drop(columns=['isDefault'])
data_corr = x_train.corrwith(data_train.isDefault)
# Collect the result as a two-column table: feature name and its correlation with the target.
result = pd.DataFrame({'features': data_corr.index, 'corr': data_corr.values})

老三1987

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
task3：特征工程

目标学习特征预处理、缺失值、异常值处理、数据分桶等特征处理方法学习特征交互、编码、选择的相应方法import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import datetime from tqdm import tqdm from sklearn.preprocessing import LabelEncoder from sklearn.featu
复制链接

扫一扫