import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the training set; the relative path is resolved against the current
# working directory.
data = pd.read_csv('train.csv')
# Alternative absolute Windows path, kept for reference:
# data = pd.read_csv('C:\\Users\\hy\\天池\\贷款违约\\train.csv')
# Preview the first five rows of the raw frame.
data.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
import copy
# Keep a handle on the original column labels of the raw frame.
col_name = data.columns

# Replacement labels (Chinese), one per column of `data`.  They are applied
# positionally, so the order of this list must match the column order of the
# CSV file.
new_col = [
    '信用证标识', '贷款金额', '贷款期限', '贷款利率', '分期付款金额',
    '贷款等级', '贷款等级之子级', '就业职称', '就业年限(年)', '房屋所有权状况',
    '年收入', '验证状态', '贷款发放的月份', '违约状态', '贷款用途类别',
    '邮政编码的前3位数字', '地区编码', '债务收入比', '违约事件数',
    'fico所属的下限范围', 'fico所属的上限范围', '未结信用额度的数量',
    '贬损公共记录的数量', '公开记录清除的数量', '信贷周转余额合计',
    '循环额度利用率', '当前的信用额度总数', '贷款的初始列表状态', '申请方式',
    '信用额度开立的月份', '贷款名称', '公开策略',
    'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
    'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14',
]

# Work on an independent deep copy so that `data` keeps its original labels
# and values.
df = copy.deepcopy(data)
df.columns = new_col

# Preview the relabelled frame.
df.head()
信用证标识 | 贷款金额 | 贷款期限 | 贷款利率 | 分期付款金额 | 贷款等级 | 贷款等级之子级 | 就业职称 | 就业年限(年) | 房屋所有权状况 | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
缺失值填充
from math import isnan
# Impute the missing entries of the anonymised features n0 ... n14.
#
# Each column is popped from `df` and assigned back, which appends it at the
# right-hand end of the frame; after the loop the filled columns therefore
# sit at the end in the order n0, n1, ..., n14.  Code that addresses columns
# by position may depend on that order, so it is kept deliberately.
#
# The fill itself is a single vectorised `fillna` per column instead of a
# Python-level pass over every row.
#
# NOTE(review): every gap is filled with the constant 2, not with a
# per-column statistic such as the median -- confirm that this is intended.
for col in ['n' + str(k) for k in range(15)]:
    df[col] = df.pop(col).fillna(2)
# List the distinct values of the '贷款名称' (loan name) column.
df['贷款名称'].unique()
array([1.0000e+00, 1.7230e+03, 0.0000e+00, ..., 3.6026e+04, 3.9735e+04,
3.3369e+04])
# Display the '贷款名称' (loan name) column itself.
df['贷款名称']
0 1.0
1 1723.0
2 0.0
3 4.0
4 11.0
...
799995 0.0
799996 33369.0
799997 0.0
799998 4.0
799999 4.0
Name: 贷款名称, Length: 800000, dtype: float64
# Mean of the '贷款名称' (loan name) codes, truncated to an integer.
# Series.sum() skips NaN; the divisor is the actual number of rows in `df`
# rather than a hard-coded literal, so the result cannot be off by a power of
# ten when the literal and the data disagree.
n = int(df['贷款名称'].sum() / len(df))
n
175
# Fill the missing '贷款名称' (loan name) entries with one vectorised call.
# Popping and re-assigning the column appends it at the right-hand end of
# `df`; that position is kept because later code may address columns by
# position.
# NOTE(review): gaps are filled with the constant 2, not with the statistic
# `n` computed just above -- confirm that this is intended.
df['贷款名称'] = df.pop('贷款名称').fillna(2)
# List the distinct values of the '就业职称' (employment title) column.
df['就业职称'].unique()
array([3.20000e+02, 2.19843e+05, 3.16980e+04, ..., 2.66097e+05,
3.62528e+05, 1.34854e+05])
# Display the '就业职称' (employment title) column itself.
df['就业职称']
0 320.0
1 219843.0
2 31698.0
3 46854.0
4 54.0
...
799995 2659.0
799996 29205.0
799997 2582.0
799998 151.0
799999 13.0
Name: 就业职称, Length: 800000, dtype: float64
# Mean of the '就业职称' (employment title) codes, truncated to an integer.
# Series.sum() skips NaN; the divisor is the actual number of rows in `df`
# rather than a hard-coded literal, so the result cannot be off by a power of
# ten when the literal and the data disagree.
n = int(df['就业职称'].sum() / len(df))
n
7200
# Fill the missing '就业职称' (employment title) entries with one vectorised
# call.  Popping and re-assigning the column appends it at the right-hand end
# of `df`; that position is kept because later code may address columns by
# position.
# NOTE(review): gaps are filled with the constant 2, not with the statistic
# `n` computed just above -- confirm that this is intended.
df['就业职称'] = df.pop('就业职称').fillna(2)
# Display the '邮政编码的前3位数字' (first three digits of the postal code) column.
df['邮政编码的前3位数字']
0 137.0
1 156.0
2 337.0
3 148.0
4 301.0
...
799995 242.0
799996 563.0
799997 47.0
799998 34.0
799999 62.0
Name: 邮政编码的前3位数字, Length: 800000, dtype: float64
# Fill the missing '邮政编码的前3位数字' (postal-code prefix) entries with one
# vectorised call.  Popping and re-assigning the column appends it at the
# right-hand end of `df`; that position is kept because later code may
# address columns by position.
# NOTE(review): gaps are filled with the constant 2, not with a statistic of
# the column such as its median -- confirm that this is intended.
df['邮政编码的前3位数字'] = df.pop('邮政编码的前3位数字').fillna(2)
# Show the first and the last 20 values of the '债务收入比' (debt-to-income
# ratio) column as a 2-tuple.
df['债务收入比'].head(20),df['债务收入比'].tail(20)
(0 17.05
1 27.83
2 22.77
3 17.21
4 32.16
5 17.14
6 17.49
7 32.60
8 19.22
9 24.39
10 14.21
11 34.63
12 7.58
13 5.68
14 38.95
15 17.27
16 21.02
17 17.14
18 28.95
19 15.55
Name: 债务收入比, dtype: float64, 799980 7.16
799981 10.32
799982 26.50
799983 27.87
799984 9.36
799985 36.44
799986 6.45
799987 29.76
799988 19.03
799989 32.03
799990 20.16
799991 29.68
799992 20.86
799993 18.27
799994 23.96
799995 19.03
799996 15.72
799997 12.11
799998 29.25
799999 8.99
Name: 债务收入比, dtype: float64)
# Mean '债务收入比' (debt-to-income ratio) over the non-missing entries,
# rounded to two decimals.  Series.mean() skips NaN by default, so neither the
# row count nor the number of missing values has to be hard-coded.
n = round(df['债务收入比'].mean(), 2)
n
1.83
# Fill the missing '债务收入比' (debt-to-income ratio) entries with one
# vectorised call.  Popping and re-assigning the column appends it at the
# right-hand end of `df`; that position is kept because later code may
# address columns by position.
# NOTE(review): gaps are filled with the constant 2, not with the statistic
# `n` computed just above -- confirm that this is intended.
df['债务收入比'] = df.pop('债务收入比').fillna(2)
# Show the first and the last 20 values of the '公开记录清除的数量' (number of
# public records cleared) column as a 2-tuple.
df['公开记录清除的数量'].head(20),df['公开记录清除的数量'].tail(20)
(0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 1.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 0.0
14 0.0
15 1.0
16 0.0
17 0.0
18 0.0
19 0.0
Name: 公开记录清除的数量, dtype: float64, 799980 1.0
799981 0.0
799982 0.0
799983 0.0
799984 1.0
799985 0.0
799986 0.0
799987 0.0
799988 0.0
799989 0.0
799990 0.0
799991 0.0
799992 1.0
799993 0.0
799994 0.0
799995 0.0
799996 0.0
799997 0.0
799998 0.0
799999 0.0
Name: 公开记录清除的数量, dtype: float64)
# Fill the missing '公开记录清除的数量' (public records cleared) entries with
# one vectorised call.  Popping and re-assigning the column appends it at the
# right-hand end of `df`; that position is kept because later code may
# address columns by position.
# NOTE(review): gaps are filled with the constant 2, not with a statistic of
# the column such as its median -- confirm that this is intended.
df['公开记录清除的数量'] = df.pop('公开记录清除的数量').fillna(2)
# Show the first and the last 20 values of the '循环额度利用率' (revolving
# credit utilisation rate) column as a 2-tuple.
df['循环额度利用率'].head(20),df['循环额度利用率'].tail(20)
(0 48.9
1 38.9
2 51.8
3 52.6
4 32.0
5 31.1
6 8.5
7 59.7
8 46.0
9 30.6
10 47.5
11 80.9
12 35.7
13 68.8
14 60.8
15 3.6
16 61.1
17 76.9
18 84.0
19 57.0
Name: 循环额度利用率, dtype: float64, 799980 70.2
799981 36.4
799982 38.4
799983 28.4
799984 46.1
799985 70.7
799986 91.8
799987 49.6
799988 87.2
799989 52.7
799990 62.2
799991 93.5
799992 47.4
799993 47.5
799994 60.5
799995 46.4
799996 98.4
799997 51.9
799998 61.3
799999 72.6
Name: 循环额度利用率, dtype: float64)
# Mean '循环额度利用率' (revolving credit utilisation rate) over the
# non-missing entries, rounded to one decimal.  Series.mean() skips NaN by
# default, so neither the row count nor the number of missing values has to
# be hard-coded.
n = round(df['循环额度利用率'].mean(), 1)
n
5.2
# Fill the missing '循环额度利用率' (revolving credit utilisation rate)
# entries with one vectorised call.  Popping and re-assigning the column
# appends it at the right-hand end of `df`; that position is kept because
# later code may address columns by position.
# NOTE(review): gaps are filled with the constant 2, not with the statistic
# `n` computed just above -- confirm that this is intended.
df['循环额度利用率'] = df.pop('循环额度利用率').fillna(2)
# List the distinct raw values of the '就业年限(年)' (employment length in
# years) column before it is converted to numbers.
df['就业年限(年)'].unique()
array(['2 years', '5 years', '8 years', '10+ years', nan, '7 years',
'9 years', '1 year', '3 years', '< 1 year', '4 years', '6 years'],
dtype=object)
# Convert the employment-length labels to whole years:
#   '< 1 year' -> 0, '1 year' -> 1, '2 years' ... '9 years' -> 2 ... 9,
#   '10+ years' -> 10.
# Values without an entry in the mapping (including missing ones) become NaN.
df['就业年限(年)'] = df['就业年限(年)'].map({
    '< 1 year': 0,
    '1 year': 1,
    **{'%d years' % years: years for years in range(2, 10)},
    '10+ years': 10,
})
# Display the converted column.
df['就业年限(年)']
0 2.0
1 5.0
2 8.0
3 10.0
4 NaN
...
799995 7.0
799996 10.0
799997 10.0
799998 10.0
799999 5.0
Name: 就业年限(年), Length: 800000, dtype: float64
# Fill the missing '就业年限(年)' (employment length in years) entries with
# one vectorised call.  Popping and re-assigning the column appends it at the
# right-hand end of `df`; that position is kept because later code may
# address columns by position.
# NOTE(review): gaps are filled with the constant 2, not with a statistic of
# the column such as its median -- confirm that this is intended.
df['就业年限(年)'] = df.pop('就业年限(年)').fillna(2)
# Preview the first 20 rows of `df` after the fill steps above.
df.head(20)
信用证标识 | 贷款金额 | 贷款期限 | 贷款利率 | 分期付款金额 | 贷款等级 | 贷款等级之子级 | 房屋所有权状况 | 年收入 | 验证状态 | ... | n12 | n13 | n14 | 贷款名称 | 就业职称 | 邮政编码的前3位数字 | 债务收入比 | 公开记录清除的数量 | 循环额度利用率 | 就业年限(年) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 2 | 110000.0 | 2 | ... | 0.0 | 0.0 | 2.0 | 1.0 | 320.0 | 137.0 | 17.05 | 0.0 | 48.9 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 0 | 46000.0 | 2 | ... | 2.0 | 2.0 | 2.0 | 1723.0 | 219843.0 | 156.0 | 27.83 | 0.0 | 38.9 | 5.0 |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 0 | 74000.0 | 2 | ... | 0.0 | 0.0 | 4.0 | 0.0 | 31698.0 | 337.0 | 22.77 | 0.0 | 51.8 | 8.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 1 | 118000.0 | 1 | ... | 0.0 | 0.0 | 1.0 | 4.0 | 46854.0 | 148.0 | 17.21 | 0.0 | 52.6 | 10.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 1 | 29000.0 | 2 | ... | 0.0 | 0.0 | 4.0 | 11.0 | 54.0 | 301.0 | 32.16 | 0.0 | 32.0 | 2.0 |
5 | 5 | 11000.0 | 3 | 7.99 | 344.65 | A | A5 | 0 | 39000.0 | 2 | ... | 0.0 | 0.0 | 0.0 | 10.0 | 51727.0 | 512.0 | 17.14 | 0.0 | 31.1 | 7.0 |
6 | 6 | 2050.0 | 3 | 7.69 | 63.95 | A | A4 | 0 | 35000.0 | 0 | ... | 0.0 | 0.0 | 3.0 | 0.0 | 180083.0 | 517.0 | 17.49 | 0.0 | 8.5 | 9.0 |
7 | 7 | 11500.0 | 3 | 14.98 | 398.54 | C | C3 | 1 | 30000.0 | 2 | ... | 0.0 | 0.0 | 2.0 | 0.0 | 214017.0 | 100.0 | 32.60 | 1.0 | 59.7 | 1.0 |
8 | 8 | 12000.0 | 3 | 12.99 | 404.27 | C | C2 | 2 | 60000.0 | 1 | ... | 0.0 | 0.0 | 6.0 | 0.0 | 188.0 | 792.0 | 19.22 | 0.0 | 46.0 | 5.0 |
9 | 9 | 6500.0 | 3 | 10.99 | 212.78 | B | B4 | 1 | 15300.0 | 2 | ... | 0.0 | 0.0 | 8.0 | 0.0 | 54.0 | 59.0 | 24.39 | 0.0 | 30.6 | 2.0 |
10 | 10 | 5600.0 | 3 | 13.67 | 190.50 | B | B5 | 1 | 38000.0 | 0 | ... | 0.0 | 0.0 | 3.0 | 27332.0 | 46631.0 | 134.0 | 14.21 | 0.0 | 47.5 | 10.0 |
11 | 11 | 5000.0 | 3 | 27.27 | 204.86 | E | E5 | 0 | 95000.0 | 0 | ... | 0.0 | 0.0 | 3.0 | 0.0 | 140759.0 | 893.0 | 34.63 | 0.0 | 80.9 | 3.0 |
12 | 12 | 19200.0 | 3 | 20.00 | 713.55 | D | D4 | 1 | 88000.0 | 0 | ... | 0.0 | 0.0 | 8.0 | 0.0 | 139297.0 | 195.0 | 7.58 | 0.0 | 35.7 | 2.0 |
13 | 13 | 24000.0 | 3 | 9.99 | 774.30 | B | B3 | 0 | 150000.0 | 1 | ... | 0.0 | 0.0 | 2.0 | 5.0 | 4967.0 | 134.0 | 5.68 | 0.0 | 68.8 | 10.0 |
14 | 14 | 16000.0 | 3 | 7.91 | 500.72 | A | A5 | 1 | 50000.0 | 0 | ... | 0.0 | 0.0 | 1.0 | 4.0 | 11037.0 | 167.0 | 38.95 | 0.0 | 60.8 | 2.0 |
15 | 15 | 6000.0 | 3 | 10.49 | 194.99 | B | B2 | 0 | 77000.0 | 1 | ... | 0.0 | 0.0 | 0.0 | 5.0 | 251530.0 | 194.0 | 17.27 | 1.0 | 3.6 | 2.0 |
16 | 16 | 10375.0 | 5 | 15.61 | 250.16 | D | D1 | 0 | 58000.0 | 0 | ... | 0.0 | 0.0 | 2.0 | 5.0 | 199961.0 | 492.0 | 21.02 | 0.0 | 61.1 | 9.0 |
17 | 17 | 5500.0 | 3 | 10.99 | 180.04 | B | B4 | 1 | 50000.0 | 1 | ... | 0.0 | 0.0 | 0.0 | 4.0 | 81579.0 | 56.0 | 17.14 | 0.0 | 76.9 | 0.0 |
18 | 18 | 6000.0 | 3 | 21.00 | 226.06 | E | E2 | 1 | 40000.0 | 0 | ... | 0.0 | 1.0 | 0.0 | 49758.0 | 323019.0 | 140.0 | 28.95 | 0.0 | 84.0 | 10.0 |
19 | 19 | 20000.0 | 5 | 21.00 | 541.07 | E | E1 | 1 | 90000.0 | 2 | ... | 0.0 | 0.0 | 6.0 | 38.0 | 258238.0 | 305.0 | 15.55 | 0.0 | 57.0 | 9.0 |
20 rows × 47 columns
# Encode the loan grade letter as a multiple of ten: A -> 10, B -> 20, ...,
# G -> 70.  Values outside A-G become NaN.
grade_codes = {
    letter: 10 * rank
    for rank, letter in enumerate('ABCDEFG', start=1)
}
df['贷款等级'] = df['贷款等级'].map(grade_codes)

# Encode the sub-grade by its digit only (A1 -> 1, ..., A5 -> 5, B1 -> 1, ...,
# G5 -> 5); the letter is not part of the resulting code.
sub_grade_codes = {
    letter + str(step): step
    for letter in 'ABCDEFG'
    for step in range(1, 6)
}
df['贷款等级之子级'] = df['贷款等级之子级'].map(sub_grade_codes)

# Display the encoded sub-grade column.
df['贷款等级之子级']
0 2
1 2
2 3
3 4
4 2
..
799995 4
799996 4
799997 3
799998 4
799999 3
Name: 贷款等级之子级, Length: 800000, dtype: int64