贷款违约-特征工程（一）

最新推荐文章于 2024-04-23 17:14:15 发布

Chimpanzee1

最新推荐文章于 2024-04-23 17:14:15 发布

阅读量487

点赞数

分类专栏：人工智能知识体系 文章标签：矩阵 机器学习 python pycharm

本文链接：https://blog.csdn.net/playboygogogo/article/details/111162871

版权

人工智能知识体系专栏收录该内容

9 篇文章 0 订阅

订阅专栏

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the training set from a CSV located in the current working directory.
data = pd.read_csv('train.csv')

# Alternative absolute Windows path (presumably the author's local copy):
# data = pd.read_csv('C:\\Users\\hy\\天池\\贷款违约\\train.csv')
# Preview the first rows of the raw frame.
data.head()

	id	loanAmnt	term	interestRate	installment	grade	subGrade	employmentTitle	employmentLength	homeOwnership	...	n5	n6	n7	n8	n9	n10	n11	n12	n13	n14
0	0	35000.0	5	19.52	917.97	E	E2	320.0	2 years	2	...	9.0	8.0	4.0	12.0	2.0	7.0	0.0	0.0	0.0	2.0
1	1	18000.0	5	18.49	461.90	D	D2	219843.0	5 years	0	...	NaN	NaN	NaN	NaN	NaN	13.0	NaN	NaN	NaN	NaN
2	2	12000.0	5	16.99	298.17	D	D3	31698.0	8 years	0	...	0.0	21.0	4.0	5.0	3.0	11.0	0.0	0.0	0.0	4.0
3	3	11000.0	3	7.26	340.96	A	A4	46854.0	10+ years	1	...	16.0	4.0	7.0	21.0	6.0	9.0	0.0	0.0	0.0	1.0
4	4	3000.0	3	12.99	101.07	C	C2	54.0	NaN	1	...	4.0	9.0	10.0	15.0	7.0	12.0	0.0	0.0	0.0	4.0

5 rows × 47 columns

import copy
# Work on an independent deep copy so the raw frame `data` is left untouched.
df = data.copy(deep=True)
col_name = data.columns

# Descriptive Chinese headers, positionally matched to the original columns;
# the anonymous features n0..n14 keep their names and are appended last.
new_col = [
    '信用证标识', '贷款金额', '贷款期限', '贷款利率', '分期付款金额',
    '贷款等级', '贷款等级之子级', '就业职称', '就业年限（年）', '房屋所有权状况',
    '年收入', '验证状态', '贷款发放的月份', '违约状态', '贷款用途类别',
    '邮政编码的前3位数字', '地区编码', '债务收入比', '违约事件数',
    'fico所属的下限范围', 'fico所属的上限范围', '未结信用额度的数量',
    '贬损公共记录的数量', '公开记录清除的数量', '信贷周转余额合计',
    '循环额度利用率', '当前的信用额度总数', '贷款的初始列表状态', '申请方式',
    '信用额度开立的月份', '贷款名称', '公开策略',
]
new_col += ['n%d' % k for k in range(15)]
df.columns = new_col
df.head()

	信用证标识	贷款金额	贷款期限	贷款利率	分期付款金额	贷款等级	贷款等级之子级	就业职称	就业年限（年）	房屋所有权状况	...	n5	n6	n7	n8	n9	n10	n11	n12	n13	n14
0	0	35000.0	5	19.52	917.97	E	E2	320.0	2 years	2	...	9.0	8.0	4.0	12.0	2.0	7.0	0.0	0.0	0.0	2.0
1	1	18000.0	5	18.49	461.90	D	D2	219843.0	5 years	0	...	NaN	NaN	NaN	NaN	NaN	13.0	NaN	NaN	NaN	NaN
2	2	12000.0	5	16.99	298.17	D	D3	31698.0	8 years	0	...	0.0	21.0	4.0	5.0	3.0	11.0	0.0	0.0	0.0	4.0
3	3	11000.0	3	7.26	340.96	A	A4	46854.0	10+ years	1	...	16.0	4.0	7.0	21.0	6.0	9.0	0.0	0.0	0.0	1.0
4	4	3000.0	3	12.99	101.07	C	C2	54.0	NaN	1	...	4.0	9.0	10.0	15.0	7.0	12.0	0.0	0.0	0.0	4.0

5 rows × 47 columns

缺失值填充

from math import isnan

# Fill the missing values of the anonymous features n0..n14 with each
# column's own median.
#
# The previous version computed a median `n` per column but then wrote the
# constant 2 into every gap, and it derived `n` by calling sorted() on a
# column that still contained NaN, which does not give a dependable order.
# Series.median() skips NaN, and fillna() replaces the per-row Python loop.
for col in ['n%d' % k for k in range(15)]:
    n = df[col].median()
    # pop() removes the column and the assignment appends it again, so the
    # column ends up at the end of the frame exactly as the old
    # drop-then-assign sequence left it.
    df[col] = df.pop(col).fillna(n)

df['贷款名称'].unique()

array([1.0000e+00, 1.7230e+03, 0.0000e+00, ..., 3.6026e+04, 3.9735e+04,
       3.3369e+04])

df['贷款名称']

0             1.0
1          1723.0
2             0.0
3             4.0
4            11.0
           ...   
799995        0.0
799996    33369.0
799997        0.0
799998        4.0
799999        4.0
Name: 贷款名称, Length: 800000, dtype: float64

# Mean of the non-missing loan-title codes, truncated to an integer.
# Series.mean() skips NaN and divides by the real number of values; the old
# hand-typed divisor 8000000 was ten times the 800000 rows of the frame.
n = int(df['贷款名称'].mean())
n

# Fill the gaps with the value computed above (the old loop ignored `n` and
# always wrote 2). pop() + assignment moves the column to the end of the
# frame, matching the previous drop-then-assign behaviour.
df['贷款名称'] = df.pop('贷款名称').fillna(n)

df['就业职称'].unique()

array([3.20000e+02, 2.19843e+05, 3.16980e+04, ..., 2.66097e+05,
       3.62528e+05, 1.34854e+05])

df['就业职称']

0            320.0
1         219843.0
2          31698.0
3          46854.0
4             54.0
            ...   
799995      2659.0
799996     29205.0
799997      2582.0
799998       151.0
799999        13.0
Name: 就业职称, Length: 800000, dtype: float64

# Mean of the non-missing employment-title codes, truncated to an integer.
# Series.mean() skips NaN and divides by the real number of values; the old
# hand-typed divisor 8000000 was ten times the 800000 rows of the frame.
n = int(df['就业职称'].mean())
n

# Fill the gaps with the value computed above (the old loop ignored `n` and
# always wrote 2). pop() + assignment moves the column to the end of the
# frame, matching the previous drop-then-assign behaviour.
df['就业职称'] = df.pop('就业职称').fillna(n)

df['邮政编码的前3位数字']

0         137.0
1         156.0
2         337.0
3         148.0
4         301.0
          ...  
799995    242.0
799996    563.0
799997     47.0
799998     34.0
799999     62.0
Name: 邮政编码的前3位数字, Length: 800000, dtype: float64

# Median of the non-missing values; Series.median() skips NaN, unlike the
# earlier sorted()-based lookup whose result was undependable when NaN was
# present.
n = df['邮政编码的前3位数字'].median()

# Fill the gaps with that median (the old loop ignored `n` and always wrote
# 2). pop() + assignment moves the column to the end of the frame, matching
# the previous drop-then-assign behaviour.
df['邮政编码的前3位数字'] = df.pop('邮政编码的前3位数字').fillna(n)

df['债务收入比'].head(20),df['债务收入比'].tail(20)

(0     17.05
 1     27.83
 2     22.77
 3     17.21
 4     32.16
 5     17.14
 6     17.49
 7     32.60
 8     19.22
 9     24.39
 10    14.21
 11    34.63
 12     7.58
 13     5.68
 14    38.95
 15    17.27
 16    21.02
 17    17.14
 18    28.95
 19    15.55
 Name: 债务收入比, dtype: float64, 799980     7.16
 799981    10.32
 799982    26.50
 799983    27.87
 799984     9.36
 799985    36.44
 799986     6.45
 799987    29.76
 799988    19.03
 799989    32.03
 799990    20.16
 799991    29.68
 799992    20.86
 799993    18.27
 799994    23.96
 799995    19.03
 799996    15.72
 799997    12.11
 799998    29.25
 799999     8.99
 Name: 债务收入比, dtype: float64)

# Mean of the non-missing debt-to-income ratios, rounded to two decimals.
# Series.mean() skips NaN and divides by the real number of values, so no
# hand-typed row count or NaN count is needed.
n = round(df['债务收入比'].mean(), 2)
n

# NOTE: the post originally displayed 1.83 here. That figure came from the
# old divisor (8000000 - 239), roughly ten times the 800000 rows of the
# frame, so it understated the mean by about a factor of ten.

# Fill the gaps with the mean computed above (the old loop ignored `n` and
# always wrote 2). pop() + assignment moves the column to the end of the
# frame, matching the previous drop-then-assign behaviour.
df['债务收入比'] = df.pop('债务收入比').fillna(n)

df['公开记录清除的数量'].head(20),df['公开记录清除的数量'].tail(20)

(0     0.0
 1     0.0
 2     0.0
 3     0.0
 4     0.0
 5     0.0
 6     0.0
 7     1.0
 8     0.0
 9     0.0
 10    0.0
 11    0.0
 12    0.0
 13    0.0
 14    0.0
 15    1.0
 16    0.0
 17    0.0
 18    0.0
 19    0.0
 Name: 公开记录清除的数量, dtype: float64, 799980    1.0
 799981    0.0
 799982    0.0
 799983    0.0
 799984    1.0
 799985    0.0
 799986    0.0
 799987    0.0
 799988    0.0
 799989    0.0
 799990    0.0
 799991    0.0
 799992    1.0
 799993    0.0
 799994    0.0
 799995    0.0
 799996    0.0
 799997    0.0
 799998    0.0
 799999    0.0
 Name: 公开记录清除的数量, dtype: float64)

# Median of the non-missing values; Series.median() skips NaN, unlike the
# earlier sorted()-based lookup whose result was undependable when NaN was
# present.
n = df['公开记录清除的数量'].median()

# Fill the gaps with that median (the old loop ignored `n` and always wrote
# 2). pop() + assignment moves the column to the end of the frame, matching
# the previous drop-then-assign behaviour.
df['公开记录清除的数量'] = df.pop('公开记录清除的数量').fillna(n)

df['循环额度利用率'].head(20),df['循环额度利用率'].tail(20)

(0     48.9
 1     38.9
 2     51.8
 3     52.6
 4     32.0
 5     31.1
 6      8.5
 7     59.7
 8     46.0
 9     30.6
 10    47.5
 11    80.9
 12    35.7
 13    68.8
 14    60.8
 15     3.6
 16    61.1
 17    76.9
 18    84.0
 19    57.0
 Name: 循环额度利用率, dtype: float64, 799980    70.2
 799981    36.4
 799982    38.4
 799983    28.4
 799984    46.1
 799985    70.7
 799986    91.8
 799987    49.6
 799988    87.2
 799989    52.7
 799990    62.2
 799991    93.5
 799992    47.4
 799993    47.5
 799994    60.5
 799995    46.4
 799996    98.4
 799997    51.9
 799998    61.3
 799999    72.6
 Name: 循环额度利用率, dtype: float64)

# Mean of the non-missing revolving-utilisation rates, rounded to one
# decimal. Series.mean() skips NaN and divides by the real number of values,
# so no hand-typed row count or NaN count is needed.
n = round(df['循环额度利用率'].mean(), 1)
n

# NOTE: the post originally displayed 5.2 here. That figure came from the
# old divisor (8000000 - 239), roughly ten times the 800000 rows of the
# frame, so it understated the mean by about a factor of ten.

# Fill the gaps with the mean computed above (the old loop ignored `n` and
# always wrote 2). pop() + assignment moves the column to the end of the
# frame, matching the previous drop-then-assign behaviour.
df['循环额度利用率'] = df.pop('循环额度利用率').fillna(n)

df['就业年限（年）'].unique()

array(['2 years', '5 years', '8 years', '10+ years', nan, '7 years',
       '9 years', '1 year', '3 years', '< 1 year', '4 years', '6 years'],
      dtype=object)

# Convert the textual employment length into a number of years:
# '< 1 year' -> 0, '1 year' -> 1, 'k years' -> k for k in 2..9,
# '10+ years' -> 10. Values absent from the mapping (NaN included) map to NaN.
tenure_years = {'%d years' % k: k for k in range(2, 10)}
tenure_years.update({'< 1 year': 0, '1 year': 1, '10+ years': 10})
df['就业年限（年）'] = df['就业年限（年）'].map(tenure_years)
df['就业年限（年）']

0          2.0
1          5.0
2          8.0
3         10.0
4          NaN
          ... 
799995     7.0
799996    10.0
799997    10.0
799998    10.0
799999     5.0
Name: 就业年限（年）, Length: 800000, dtype: float64

# Median of the non-missing employment lengths; Series.median() skips NaN,
# unlike the earlier sorted()-based lookup whose result was undependable when
# NaN was present.
n = df['就业年限（年）'].median()

# Fill the gaps with that median (the old loop ignored `n` and always wrote
# 2). pop() + assignment moves the column to the end of the frame, matching
# the previous drop-then-assign behaviour.
df['就业年限（年）'] = df.pop('就业年限（年）').fillna(n)

df.head(20)

	信用证标识	贷款金额	贷款期限	贷款利率	分期付款金额	贷款等级	贷款等级之子级	房屋所有权状况	年收入	验证状态	...	n12	n13	n14	贷款名称	就业职称	邮政编码的前3位数字	债务收入比	公开记录清除的数量	循环额度利用率	就业年限（年）
0	0	35000.0	5	19.52	917.97	E	E2	2	110000.0	2	...	0.0	0.0	2.0	1.0	320.0	137.0	17.05	0.0	48.9	2.0
1	1	18000.0	5	18.49	461.90	D	D2	0	46000.0	2	...	2.0	2.0	2.0	1723.0	219843.0	156.0	27.83	0.0	38.9	5.0
2	2	12000.0	5	16.99	298.17	D	D3	0	74000.0	2	...	0.0	0.0	4.0	0.0	31698.0	337.0	22.77	0.0	51.8	8.0
3	3	11000.0	3	7.26	340.96	A	A4	1	118000.0	1	...	0.0	0.0	1.0	4.0	46854.0	148.0	17.21	0.0	52.6	10.0
4	4	3000.0	3	12.99	101.07	C	C2	1	29000.0	2	...	0.0	0.0	4.0	11.0	54.0	301.0	32.16	0.0	32.0	2.0
5	5	11000.0	3	7.99	344.65	A	A5	0	39000.0	2	...	0.0	0.0	0.0	10.0	51727.0	512.0	17.14	0.0	31.1	7.0
6	6	2050.0	3	7.69	63.95	A	A4	0	35000.0	0	...	0.0	0.0	3.0	0.0	180083.0	517.0	17.49	0.0	8.5	9.0
7	7	11500.0	3	14.98	398.54	C	C3	1	30000.0	2	...	0.0	0.0	2.0	0.0	214017.0	100.0	32.60	1.0	59.7	1.0
8	8	12000.0	3	12.99	404.27	C	C2	2	60000.0	1	...	0.0	0.0	6.0	0.0	188.0	792.0	19.22	0.0	46.0	5.0
9	9	6500.0	3	10.99	212.78	B	B4	1	15300.0	2	...	0.0	0.0	8.0	0.0	54.0	59.0	24.39	0.0	30.6	2.0
10	10	5600.0	3	13.67	190.50	B	B5	1	38000.0	0	...	0.0	0.0	3.0	27332.0	46631.0	134.0	14.21	0.0	47.5	10.0
11	11	5000.0	3	27.27	204.86	E	E5	0	95000.0	0	...	0.0	0.0	3.0	0.0	140759.0	893.0	34.63	0.0	80.9	3.0
12	12	19200.0	3	20.00	713.55	D	D4	1	88000.0	0	...	0.0	0.0	8.0	0.0	139297.0	195.0	7.58	0.0	35.7	2.0
13	13	24000.0	3	9.99	774.30	B	B3	0	150000.0	1	...	0.0	0.0	2.0	5.0	4967.0	134.0	5.68	0.0	68.8	10.0
14	14	16000.0	3	7.91	500.72	A	A5	1	50000.0	0	...	0.0	0.0	1.0	4.0	11037.0	167.0	38.95	0.0	60.8	2.0
15	15	6000.0	3	10.49	194.99	B	B2	0	77000.0	1	...	0.0	0.0	0.0	5.0	251530.0	194.0	17.27	1.0	3.6	2.0
16	16	10375.0	5	15.61	250.16	D	D1	0	58000.0	0	...	0.0	0.0	2.0	5.0	199961.0	492.0	21.02	0.0	61.1	9.0
17	17	5500.0	3	10.99	180.04	B	B4	1	50000.0	1	...	0.0	0.0	0.0	4.0	81579.0	56.0	17.14	0.0	76.9	0.0
18	18	6000.0	3	21.00	226.06	E	E2	1	40000.0	0	...	0.0	1.0	0.0	49758.0	323019.0	140.0	28.95	0.0	84.0	10.0
19	19	20000.0	5	21.00	541.07	E	E1	1	90000.0	2	...	0.0	0.0	6.0	38.0	258238.0	305.0	15.55	0.0	57.0	9.0

20 rows × 47 columns

# Ordinal encoding of the loan grade: A -> 10, B -> 20, ..., G -> 70.
grade_letters = 'ABCDEFG'
grade_codes = {letter: 10 * rank for rank, letter in enumerate(grade_letters, start=1)}
df['贷款等级'] = df['贷款等级'].map(grade_codes)

# The sub-grade is reduced to its trailing digit (e.g. 'C2' -> 2); the grade
# letter itself is already captured by the column encoded above.
sub_grade_codes = {
    letter + str(step): step
    for letter in grade_letters
    for step in range(1, 6)
}
df['贷款等级之子级'] = df['贷款等级之子级'].map(sub_grade_codes)

df['贷款等级之子级']

0         2
1         2
2         3
3         4
4         2
         ..
799995    4
799996    4
799997    3
799998    4
799999    3
Name: 贷款等级之子级, Length: 800000, dtype: int64

Chimpanzee1

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
贷款违约-特征工程（一）

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('train.csv')
# data = pd.read_csv('C:\\Users\\hy\\天池\\贷款违约\\train.csv')
data.head()
复制链接

扫一扫

专栏目录