目的
- EDA价值主要在于熟悉了解整个数据集的基本情况(缺失值,异常值),对数据集进行验证是否可以进行接 下来的机器学习或者深度学习建模.
- 了解变量间的相互关系、变量与预测值之间的存在关系。
- 为特征工程做准备
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
train_data=pd.read_csv('./数据/train.csv')
test_data=pd.read_csv('./数据/testA.csv')
import os
os.getcwd()
'C:\\Users\\gnzha\\工作\\工作\\bonc\\python\\学习资料\\202009datawhale资料'
train_data=pd.read_csv('./数据/train.csv')
1.1 数据理解
type(train_data)
pandas.core.frame.DataFrame
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 800000 non-null int64
1 loanAmnt 800000 non-null float64
2 term 800000 non-null int64
3 interestRate 800000 non-null float64
4 installment 800000 non-null float64
5 grade 800000 non-null object
6 subGrade 800000 non-null object
7 employmentTitle 799999 non-null float64
8 employmentLength 753201 non-null object
9 homeOwnership 800000 non-null int64
10 annualIncome 800000 non-null float64
11 verificationStatus 800000 non-null int64
12 issueDate 800000 non-null object
13 isDefault 800000 non-null int64
14 purpose 800000 non-null int64
15 postCode 799999 non-null float64
16 regionCode 800000 non-null int64
17 dti 799761 non-null float64
18 delinquency_2years 800000 non-null float64
19 ficoRangeLow 800000 non-null float64
20 ficoRangeHigh 800000 non-null float64
21 openAcc 800000 non-null float64
22 pubRec 800000 non-null float64
23 pubRecBankruptcies 799595 non-null float64
24 revolBal 800000 non-null float64
25 revolUtil 799469 non-null float64
26 totalAcc 800000 non-null float64
27 initialListStatus 800000 non-null int64
28 applicationType 800000 non-null int64
29 earliesCreditLine 800000 non-null object
30 title 799999 non-null float64
31 policyCode 800000 non-null float64
32 n0 759730 non-null float64
33 n1 759730 non-null float64
34 n2 759730 non-null float64
35 n2.1 759730 non-null float64
36 n4 766761 non-null float64
37 n5 759730 non-null float64
38 n6 759730 non-null float64
39 n7 759730 non-null float64
40 n8 759729 non-null float64
41 n9 759730 non-null float64
42 n10 766761 non-null float64
43 n11 730248 non-null float64
44 n12 759730 non-null float64
45 n13 759730 non-null float64
46 n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 48 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 200000 non-null int64
1 loanAmnt 200000 non-null float64
2 term 200000 non-null int64
3 interestRate 200000 non-null float64
4 installment 200000 non-null float64
5 grade 200000 non-null object
6 subGrade 200000 non-null object
7 employmentTitle 200000 non-null float64
8 employmentLength 188258 non-null object
9 homeOwnership 200000 non-null int64
10 annualIncome 200000 non-null float64
11 verificationStatus 200000 non-null int64
12 issueDate 200000 non-null object
13 purpose 200000 non-null int64
14 postCode 200000 non-null float64
15 regionCode 200000 non-null int64
16 dti 199939 non-null float64
17 delinquency_2years 200000 non-null float64
18 ficoRangeLow 200000 non-null float64
19 ficoRangeHigh 200000 non-null float64
20 openAcc 200000 non-null float64
21 pubRec 200000 non-null float64
22 pubRecBankruptcies 199884 non-null float64
23 revolBal 200000 non-null float64
24 revolUtil 199873 non-null float64
25 totalAcc 200000 non-null float64
26 initialListStatus 200000 non-null int64
27 applicationType 200000 non-null int64
28 earliesCreditLine 200000 non-null object
29 title 200000 non-null float64
30 policyCode 200000 non-null float64
31 n0 189889 non-null float64
32 n1 189889 non-null float64
33 n2 189889 non-null float64
34 n2.1 189889 non-null float64
35 n2.2 189889 non-null float64
36 n2.3 189889 non-null float64
37 n4 191606 non-null float64
38 n5 189889 non-null float64
39 n6 189889 non-null float64
40 n7 189889 non-null float64
41 n8 189889 non-null float64
42 n9 189889 non-null float64
43 n10 191606 non-null float64
44 n11 182425 non-null float64
45 n12 189889 non-null float64
46 n13 189889 non-null float64
47 n14 189889 non-null float64
dtypes: float64(35), int64(8), object(5)
memory usage: 73.2+ MB
train_data.shape
(800000, 47)
test_data.shape
(200000, 48)
train_data.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
test_data.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1',
'n2.2', 'n2.3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12',
'n13', 'n14'],
dtype='object')
train_data.index
RangeIndex(start=0, stop=800000, step=1)
test_data.index
RangeIndex(start=0, stop=200000, step=1)
train_data.values
array([[0, 35000.0, 5, ..., 0.0, 0.0, 2.0],
[1, 18000.0, 5, ..., nan, nan, nan],
[2, 12000.0, 5, ..., 0.0, 0.0, 4.0],
...,
[799997, 6000.0, 3, ..., 0.0, 1.0, 4.0],
[799998, 19200.0, 3, ..., 0.0, 0.0, 5.0],
[799999, 9000.0, 3, ..., 0.0, 0.0, 2.0]], dtype=object)
train_data.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
train_data.dtypes
id int64
loanAmnt float64
term int64
interestRate float64
installment float64
grade object
subGrade object
employmentTitle float64
employmentLength object
homeOwnership int64
annualIncome float64
verificationStatus int64
issueDate object
isDefault int64
purpose int64
postCode float64
regionCode int64
dti float64
delinquency_2years float64
ficoRangeLow float64
ficoRangeHigh float64
openAcc float64
pubRec float64
pubRecBankruptcies float64
revolBal float64
revolUtil float64
totalAcc float64
initialListStatus int64
applicationType int64
earliesCreditLine object
title float64
policyCode float64
n0 float64
n1 float64
n2 float64
n2.1 float64
n4 float64
n5 float64
n6 float64
n7 float64
n8 float64
n9 float64
n10 float64
n11 float64
n12 float64
n13 float64
n14 float64
dtype: object
train_data.describe()
id | loanAmnt | term | interestRate | installment | employmentTitle | homeOwnership | annualIncome | verificationStatus | isDefault | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 800000.000000 | 800000.000000 | 800000.000000 | 800000.000000 | 800000.000000 | 799999.000000 | 800000.000000 | 8.000000e+05 | 800000.000000 | 800000.000000 | ... | 759730.000000 | 759730.000000 | 759730.000000 | 759729.000000 | 759730.000000 | 766761.000000 | 730248.000000 | 759730.000000 | 759730.000000 | 759730.000000 |
mean | 399999.500000 | 14416.818875 | 3.482745 | 13.238391 | 437.947723 | 72005.351714 | 0.614213 | 7.613391e+04 | 1.009683 | 0.199513 | ... | 8.107937 | 8.575994 | 8.282953 | 14.622488 | 5.592345 | 11.643896 | 0.000815 | 0.003384 | 0.089366 | 2.178606 |
std | 230940.252015 | 8716.086178 | 0.855832 | 4.765757 | 261.460393 | 106585.640204 | 0.675749 | 6.894751e+04 | 0.782716 | 0.399634 | ... | 4.799210 | 7.400536 | 4.561689 | 8.124610 | 3.216184 | 5.484104 | 0.030075 | 0.062041 | 0.509069 | 1.844377 |
min | 0.000000 | 500.000000 | 3.000000 | 5.310000 | 15.690000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 199999.750000 | 8000.000000 | 3.000000 | 9.750000 | 248.450000 | 427.000000 | 0.000000 | 4.560000e+04 | 0.000000 | 0.000000 | ... | 5.000000 | 4.000000 | 5.000000 | 9.000000 | 3.000000 | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
50% | 399999.500000 | 12000.000000 | 3.000000 | 12.740000 | 375.135000 | 7755.000000 | 1.000000 | 6.500000e+04 | 1.000000 | 0.000000 | ... | 7.000000 | 7.000000 | 7.000000 | 13.000000 | 5.000000 | 11.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
75% | 599999.250000 | 20000.000000 | 3.000000 | 15.990000 | 580.710000 | 117663.500000 | 1.000000 | 9.000000e+04 | 2.000000 | 0.000000 | ... | 11.000000 | 11.000000 | 10.000000 | 19.000000 | 7.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
max | 799999.000000 | 40000.000000 | 5.000000 | 30.990000 | 1715.420000 | 378351.000000 | 5.000000 | 1.099920e+07 | 2.000000 | 1.000000 | ... | 70.000000 | 132.000000 | 79.000000 | 128.000000 | 45.000000 | 82.000000 | 4.000000 | 4.000000 | 39.000000 | 30.000000 |
8 rows × 42 columns
train_data.head(3).append(train_data.tail(3))
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
799997 | 799997 | 6000.0 | 3 | 13.33 | 203.12 | C | C3 | 2582.0 | 10+ years | 1 | ... | 4.0 | 26.0 | 4.0 | 10.0 | 4.0 | 5.0 | 0.0 | 0.0 | 1.0 | 4.0 |
799998 | 799998 | 19200.0 | 3 | 6.92 | 592.14 | A | A4 | 151.0 | 10+ years | 0 | ... | 10.0 | 6.0 | 12.0 | 22.0 | 8.0 | 16.0 | 0.0 | 0.0 | 0.0 | 5.0 |
799999 | 799999 | 9000.0 | 3 | 11.06 | 294.91 | B | B3 | 13.0 | 5 years | 0 | ... | 3.0 | 4.0 | 4.0 | 8.0 | 3.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
6 rows × 47 columns
1.2 查看数据中缺失值、唯一值等
train_data.isnull()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | False | False | False | False | ... | True | True | True | True | True | False | True | True | True | True |
2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | True | False | ... | False | False | False | False | False | False | False | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
799995 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
799996 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
799997 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
799998 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
799999 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
800000 rows × 47 columns
train_data.isnull().sum()
id 0
loanAmnt 0
term 0
interestRate 0
installment 0
grade 0
subGrade 0
employmentTitle 1
employmentLength 46799
homeOwnership 0
annualIncome 0
verificationStatus 0
issueDate 0
isDefault 0
purpose 0
postCode 1
regionCode 0
dti 239
delinquency_2years 0
ficoRangeLow 0
ficoRangeHigh 0
openAcc 0
pubRec 0
pubRecBankruptcies 405
revolBal 0
revolUtil 531
totalAcc 0
initialListStatus 0
applicationType 0
earliesCreditLine 0
title 1
policyCode 0
n0 40270
n1 40270
n2 40270
n2.1 40270
n4 33239
n5 40270
n6 40270
n7 40270
n8 40271
n9 40270
n10 33239
n11 69752
n12 40270
n13 40270
n14 40270
dtype: int64
type(train_data.isnull().sum())
pandas.core.series.Series
train_data.isnull().sum().dtype
dtype('int64')
train_data.isnull().sum().shape
(47,)
train_data.isnull().sum().index
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
train_data.isnull().any()#any()函数用于判断给定的迭代参数iterable是否全部为false,是则返回false,如果有一个为true,则返回true
id False
loanAmnt False
term False
interestRate False
installment False
grade False
subGrade False
employmentTitle True
employmentLength True
homeOwnership False
annualIncome False
verificationStatus False
issueDate False
isDefault False
purpose False
postCode True
regionCode False
dti True
delinquency_2years False
ficoRangeLow False
ficoRangeHigh False
openAcc False
pubRec False
pubRecBankruptcies True
revolBal False
revolUtil True
totalAcc False
initialListStatus False
applicationType False
earliesCreditLine False
title True
policyCode False
n0 True
n1 True
n2 True
n2.1 True
n4 True
n5 True
n6 True
n7 True
n8 True
n9 True
n10 True
n11 True
n12 True
n13 True
n14 True
dtype: bool
train_data.isnull().any().sum()
22
train_data.isnull().any().values
array([False, False, False, False, False, False, False, True, True,
False, False, False, False, False, False, True, False, True,
False, False, False, False, False, True, False, True, False,
False, False, False, True, False, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True])
len(train_data)
800000
train_data.isnull().sum()/len(train_data)
id 0.000000
loanAmnt 0.000000
term 0.000000
interestRate 0.000000
installment 0.000000
grade 0.000000
subGrade 0.000000
employmentTitle 0.000001
employmentLength 0.058499
homeOwnership 0.000000
annualIncome 0.000000
verificationStatus 0.000000
issueDate 0.000000
isDefault 0.000000
purpose 0.000000
postCode 0.000001
regionCode 0.000000
dti 0.000299
delinquency_2years 0.000000
ficoRangeLow 0.000000
ficoRangeHigh 0.000000
openAcc 0.000000
pubRec 0.000000
pubRecBankruptcies 0.000506
revolBal 0.000000
revolUtil 0.000664
totalAcc 0.000000
initialListStatus 0.000000
applicationType 0.000000
earliesCreditLine 0.000000
title 0.000001
policyCode 0.000000
n0 0.050338
n1 0.050338
n2 0.050338
n2.1 0.050338
n4 0.041549
n5 0.050338
n6 0.050338
n7 0.050338
n8 0.050339
n9 0.050338
n10 0.041549
n11 0.087190
n12 0.050338
n13 0.050338
n14 0.050338
dtype: float64
(train_data.isnull().sum()/len(train_data)).to_dict()
{'id': 0.0,
'loanAmnt': 0.0,
'term': 0.0,
'interestRate': 0.0,
'installment': 0.0,
'grade': 0.0,
'subGrade': 0.0,
'employmentTitle': 1.25e-06,
'employmentLength': 0.05849875,
'homeOwnership': 0.0,
'annualIncome': 0.0,
'verificationStatus': 0.0,
'issueDate': 0.0,
'isDefault': 0.0,
'purpose': 0.0,
'postCode': 1.25e-06,
'regionCode': 0.0,
'dti': 0.00029875,
'delinquency_2years': 0.0,
'ficoRangeLow': 0.0,
'ficoRangeHigh': 0.0,
'openAcc': 0.0,
'pubRec': 0.0,
'pubRecBankruptcies': 0.00050625,
'revolBal': 0.0,
'revolUtil': 0.00066375,
'totalAcc': 0.0,
'initialListStatus': 0.0,
'applicationType': 0.0,
'earliesCreditLine': 0.0,
'title': 1.25e-06,
'policyCode': 0.0,
'n0': 0.0503375,
'n1': 0.0503375,
'n2': 0.0503375,
'n2.1': 0.0503375,
'n4': 0.04154875,
'n5': 0.0503375,
'n6': 0.0503375,
'n7': 0.0503375,
'n8': 0.05033875,
'n9': 0.0503375,
'n10': 0.04154875,
'n11': 0.08719,
'n12': 0.0503375,
'n13': 0.0503375,
'n14': 0.0503375}
#查看缺失值大于50%的特征
ratio=(train_data.isnull().sum()/len(train_data)).to_dict()
more_than_half={}
for key,value in ratio.items():
if value>0.5:
more_than_half[key]=value
print(more_than_half)
{}
missing=train_data.isnull().sum()/len(train_data)
missing=missing[missing>0]
missing.sort_values(inplace=True)
missing.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x12b0f98f6c8>
one_value_feature=[col for col in train_data.columns if train_data[col].nunique()<=1]
one_value_feature
['policyCode']
train_data['n8'].unique()
array([ 12., nan, 5., 21., 15., 3., 18., 17., 39., 16., 11.,
9., 32., 14., 7., 22., 27., 19., 10., 31., 37., 8.,
29., 13., 2., 6., 23., 25., 20., 24., 26., 28., 36.,
4., 30., 47., 42., 44., 35., 33., 57., 34., 41., 38.,
45., 43., 40., 51., 61., 55., 49., 46., 52., 50., 54.,
81., 67., 74., 62., 48., 56., 69., 72., 66., 58., 65.,
53., 78., 60., 59., 1., 70., 92., 77., 71., 63., 68.,
64., 75., 127., 103., 88., 86., 79., 73., 90., 76., 87.,
107., 105., 82., 80., 128., 85., 84., 101., 89., 96., 104.,
93., 102., 91., 83.])
train_data['n8'].nunique()
102
print(f'There are {len(one_value_feature)} columns in train dataset with one unique value.')
There are 1 columns in train dataset with one unique value.
1.3 查看数据类型
- 特征一般都是由类别型特征和数值型特征组成
- 类别型特征有时具有非数值关系,有时也具有数值关系。比如‘grade’中的等级A,B,C等,是否只是单纯的 分类,还是A优于其他要结合业务判断。
- 数值型特征本是可以直接入模的,但往往风控人员要对其做分箱,转化为WOE编码进而做标准评分卡等操作。从模型效果上来看,特征分箱主要是为了降低变量的复杂性,减少变量噪音对模型的影响,提高自变量和因变量的相关度。从而使模型更加稳定。
train_data.dtypes
id int64
loanAmnt float64
term int64
interestRate float64
installment float64
grade object
subGrade object
employmentTitle float64
employmentLength object
homeOwnership int64
annualIncome float64
verificationStatus int64
issueDate object
isDefault int64
purpose int64
postCode float64
regionCode int64
dti float64
delinquency_2years float64
ficoRangeLow float64
ficoRangeHigh float64
openAcc float64
pubRec float64
pubRecBankruptcies float64
revolBal float64
revolUtil float64
totalAcc float64
initialListStatus int64
applicationType int64
earliesCreditLine object
title float64
policyCode float64
n0 float64
n1 float64
n2 float64
n2.1 float64
n4 float64
n5 float64
n6 float64
n7 float64
n8 float64
n9 float64
n10 float64
n11 float64
n12 float64
n13 float64
n14 float64
dtype: object
train_data.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
train_data.select_dtypes(exclude='object')
id | loanAmnt | term | interestRate | installment | employmentTitle | homeOwnership | annualIncome | verificationStatus | isDefault | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | 320.0 | 2 | 110000.0 | 2 | 1 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | 219843.0 | 0 | 46000.0 | 2 | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | 31698.0 | 0 | 74000.0 | 2 | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | 46854.0 | 1 | 118000.0 | 1 | 0 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | 54.0 | 1 | 29000.0 | 2 | 0 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
799995 | 799995 | 25000.0 | 3 | 14.49 | 860.41 | 2659.0 | 1 | 72000.0 | 0 | 0 | ... | 6.0 | 2.0 | 12.0 | 13.0 | 10.0 | 14.0 | 0.0 | 0.0 | 0.0 | 3.0 |
799996 | 799996 | 17000.0 | 3 | 7.90 | 531.94 | 29205.0 | 0 | 99000.0 | 2 | 0 | ... | 15.0 | 16.0 | 2.0 | 19.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 |
799997 | 799997 | 6000.0 | 3 | 13.33 | 203.12 | 2582.0 | 1 | 65000.0 | 2 | 1 | ... | 4.0 | 26.0 | 4.0 | 10.0 | 4.0 | 5.0 | 0.0 | 0.0 | 1.0 | 4.0 |
799998 | 799998 | 19200.0 | 3 | 6.92 | 592.14 | 151.0 | 0 | 96000.0 | 2 | 0 | ... | 10.0 | 6.0 | 12.0 | 22.0 | 8.0 | 16.0 | 0.0 | 0.0 | 0.0 | 5.0 |
799999 | 799999 | 9000.0 | 3 | 11.06 | 294.91 | 13.0 | 0 | 120000.0 | 0 | 0 | ... | 3.0 | 4.0 | 4.0 | 8.0 | 3.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
800000 rows × 42 columns
numerical_feature=list(train_data.select_dtypes(exclude='object').columns)
category_feature=list(filter(lambda x:x not in numerical_feature,list(train_data.columns)))
numerical_feature
['id',
'loanAmnt',
'term',
'interestRate',
'installment',
'employmentTitle',
'homeOwnership',
'annualIncome',
'verificationStatus',
'isDefault',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'initialListStatus',
'applicationType',
'title',
'policyCode',
'n0',
'n1',
'n2',
'n2.1',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n11',
'n12',
'n13',
'n14']
category_feature
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
train_data['grade']
0 E
1 D
2 D
3 A
4 C
..
799995 C
799996 A
799997 C
799998 A
799999 B
Name: grade, Length: 800000, dtype: object
train_data.grade
0 E
1 D
2 D
3 A
4 C
..
799995 C
799996 A
799997 C
799998 A
799999 B
Name: grade, Length: 800000, dtype: object
#划分数值型变量中的连续变量和分类变量
def get_numerical_serial_fea(data,feas):
numerical_serial_feas=[]
numerical_nserial_feas=[]
for fea in feas:
if data[fea].nunique()<=10:
numerical_nserial_feas.append(fea)
else:
numerical_serial_feas.append(fea)
return numerical_serial_feas,numerical_nserial_feas
x1,x2=get_numerical_serial_fea(train_data,numerical_feature)
x1# 数值连续型变量
['id',
'loanAmnt',
'interestRate',
'installment',
'employmentTitle',
'annualIncome',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'title',
'n0',
'n1',
'n2',
'n2.1',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n13',
'n14']
x2#数值类别型变量
['term',
'homeOwnership',
'verificationStatus',
'isDefault',
'initialListStatus',
'applicationType',
'policyCode',
'n11',
'n12']
train_data.term.value_counts()
3 606902
5 193098
Name: term, dtype: int64
train_data['homeOwnership'].value_counts()
0 395732
1 317660
2 86309
3 185
5 81
4 33
Name: homeOwnership, dtype: int64
train_data['loanAmnt'].value_counts()
10000.0 59068
12000.0 43291
20000.0 41885
15000.0 41629
35000.0 30371
...
35975.0 1
35925.0 1
35850.0 1
35775.0 1
725.0 1
Name: loanAmnt, Length: 1540, dtype: int64
#每个数字特征得分布可视化
f = pd.melt(train_data, value_vars=x1)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
print(f)
-
查看某一个数值型变量的分布,查看变量是否符合正态分布,如果不符合正太分布的变量可以log化后再观察下是否符合正态分布。
-
如果想统一处理一批数据变标准化 必须把这些之前已经正态化的数据提出
plt.figure(figsize=(16,12))
plt.subplot(221)
sns.distplot(train_data['loanAmnt'])
plt.subplot(222)
sns.distplot(np.log(train_data['loanAmnt']))
<matplotlib.axes._subplots.AxesSubplot at 0x12b2c369e48>
# 非数值类别型变量分析
category_feature
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
train_data.grade.value_counts()
B 233690
C 227118
A 139661
D 119453
E 55661
F 19053
G 5364
Name: grade, dtype: int64
1.4 变量分布可视化
plt.figure(figsize=(8, 8))
sns.barplot(train_data["employmentLength"].value_counts(dropna=False)[:20],
train_data["employmentLength"].value_counts(dropna=False).keys()[:20])
plt.show()
import datetime
#转化成时间格式
train_data['issueDate'] = pd.to_datetime(train_data['issueDate'],format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
train_data['issueDateDT'] = train_data['issueDate'].apply(lambda x: x-startdate).dt.days
plt.hist(train_data['issueDateDT'], label='train')
(array([ 698., 2104., 7139., 15255., 43725., 105400., 191202.,
256182., 131659., 46636.]),
array([ 0. , 420.1, 840.2, 1260.3, 1680.4, 2100.5, 2520.6, 2940.7,
3360.8, 3780.9, 4201. ]),
<a list of 10 Patch objects>)
#透视图 索引可以有多个,“columns(列)”是可选的,聚合函数aggfunc后是被应用到了变量“values”中你所列举的项目上。
pivot = pd.pivot_table(train_data, index=['grade'], columns=['issueDateDT'], values= ['loanAmnt'], aggfunc=np.sum)
pivot
loanAmnt | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
issueDateDT | 0 | 30 | 61 | 92 | 122 | 153 | 183 | 214 | 245 | 274 | ... | 3926 | 3957 | 3987 | 4018 | 4048 | 4079 | 4110 | 4140 | 4171 | 4201 |
grade | |||||||||||||||||||||
A | NaN | 53650.0 | 42000.0 | 19500.0 | 34425.0 | 63950.0 | 43500.0 | 168825.0 | 85600.0 | 101825.0 | ... | 13093850.0 | 11757325.0 | 11945975.0 | 9144000.0 | 7977650.0 | 6888900.0 | 5109800.0 | 3919275.0 | 2694025.0 | 2245625.0 |
B | NaN | 13000.0 | 24000.0 | 32125.0 | 7025.0 | 95750.0 | 164300.0 | 303175.0 | 434425.0 | 538450.0 | ... | 16863100.0 | 17275175.0 | 16217500.0 | 11431350.0 | 8967750.0 | 7572725.0 | 4884600.0 | 4329400.0 | 3922575.0 | 3257100.0 |
C | NaN | 68750.0 | 8175.0 | 10000.0 | 61800.0 | 52550.0 | 175375.0 | 151100.0 | 243725.0 | 393150.0 | ... | 17502375.0 | 17471500.0 | 16111225.0 | 11973675.0 | 10184450.0 | 7765000.0 | 5354450.0 | 4552600.0 | 2870050.0 | 2246250.0 |
D | NaN | NaN | 5500.0 | 2850.0 | 28625.0 | NaN | 167975.0 | 171325.0 | 192900.0 | 269325.0 | ... | 11403075.0 | 10964150.0 | 10747675.0 | 7082050.0 | 7189625.0 | 5195700.0 | 3455175.0 | 3038500.0 | 2452375.0 | 1771750.0 |
E | 7500.0 | NaN | 10000.0 | NaN | 17975.0 | 1500.0 | 94375.0 | 116450.0 | 42000.0 | 139775.0 | ... | 3983050.0 | 3410125.0 | 3107150.0 | 2341825.0 | 2225675.0 | 1643675.0 | 1091025.0 | 1131625.0 | 883950.0 | 802425.0 |
F | NaN | NaN | 31250.0 | 2125.0 | NaN | NaN | NaN | 49000.0 | 27000.0 | 43000.0 | ... | 1074175.0 | 868925.0 | 761675.0 | 685325.0 | 665750.0 | 685200.0 | 316700.0 | 315075.0 | 72300.0 | NaN |
G | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 24625.0 | NaN | NaN | ... | 56100.0 | 243275.0 | 224825.0 | 64050.0 | 198575.0 | 245825.0 | 53125.0 | 23750.0 | 25100.0 | 1000.0 |
7 rows × 139 columns