贷款违约-特征工程(一)

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the training set from a CSV file in the current working directory.
data = pd.read_csv('train.csv')

# Alternative absolute Windows path, kept for reference:
# data = pd.read_csv('C:\\Users\\hy\\天池\\贷款违约\\train.csv')
# Preview the first five rows.
data.head()
idloanAmntterminterestRateinstallmentgradesubGradeemploymentTitleemploymentLengthhomeOwnership...n5n6n7n8n9n10n11n12n13n14
0035000.0519.52917.97EE2320.02 years2...9.08.04.012.02.07.00.00.00.02.0
1118000.0518.49461.90DD2219843.05 years0...NaNNaNNaNNaNNaN13.0NaNNaNNaNNaN
2212000.0516.99298.17DD331698.08 years0...0.021.04.05.03.011.00.00.00.04.0
3311000.037.26340.96AA446854.010+ years1...16.04.07.021.06.09.00.00.00.01.0
443000.0312.99101.07CC254.0NaN1...4.09.010.015.07.012.00.00.00.04.0

5 rows × 47 columns

import copy  # still used by later cells that call copy.deepcopy

# Keep a handle on the original header before switching to Chinese labels.
col_name = data.columns

# Chinese display names, positionally aligned with the 47 columns of `data`.
new_col = [
    '信用证标识', '贷款金额', '贷款期限', '贷款利率', '分期付款金额',
    '贷款等级', '贷款等级之子级', '就业职称', '就业年限(年)', '房屋所有权状况',
    '年收入', '验证状态', '贷款发放的月份', '违约状态', '贷款用途类别',
    '邮政编码的前3位数字', '地区编码', '债务收入比', '违约事件数',
    'fico所属的下限范围', 'fico所属的上限范围', '未结信用额度的数量',
    '贬损公共记录的数量', '公开记录清除的数量', '信贷周转余额合计',
    '循环额度利用率', '当前的信用额度总数', '贷款的初始列表状态', '申请方式',
    '信用额度开立的月份', '贷款名称', '公开策略',
    'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
    'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14',
]

# Work on an independent deep copy so `data` keeps its original header.
df = data.copy()
df.columns = new_col
df.head()
信用证标识贷款金额贷款期限贷款利率分期付款金额贷款等级贷款等级之子级就业职称就业年限(年)房屋所有权状况...n5n6n7n8n9n10n11n12n13n14
0035000.0519.52917.97EE2320.02 years2...9.08.04.012.02.07.00.00.00.02.0
1118000.0518.49461.90DD2219843.05 years0...NaNNaNNaNNaNNaN13.0NaNNaNNaNNaN
2212000.0516.99298.17DD331698.08 years0...0.021.04.05.03.011.00.00.00.04.0
3311000.037.26340.96AA446854.010+ years1...16.04.07.021.06.09.00.00.00.01.0
443000.0312.99101.07CC254.0NaN1...4.09.010.015.07.012.00.00.00.04.0

5 rows × 47 columns

缺失值填充

from math import isnan

# Impute missing values in the anonymised features n0..n14 with each column's
# own median.
#
# The previous fifteen copy-pasted stanzas computed a "median" with
# sorted(...)[len // 2] but never used it, and filled every gap with the
# literal 2 instead.  Sorting values that still contain NaN is also
# unreliable, because NaN compares false against everything.
# Series.median() skips NaN, and fillna() replaces the per-element Python loop.
for col in ['n' + str(k) for k in range(15)]:
    n = df[col].median()
    # pop + re-assign moves the column to the end of the frame, which is the
    # column order the original drop-and-append sequence produced.
    df[col] = df.pop(col).fillna(n)

# Inspect the distinct values of the '贷款名称' column before cleaning it.
df['贷款名称'].unique()
array([1.0000e+00, 1.7230e+03, 0.0000e+00, ..., 3.6026e+04, 3.9735e+04,
       3.3369e+04])
# Display the '贷款名称' column before its missing values are filled.
df['贷款名称']
0             1.0
1          1723.0
2             0.0
3             4.0
4            11.0
           ...   
799995        0.0
799996    33369.0
799997        0.0
799998        4.0
799999        4.0
Name: 贷款名称, Length: 800000, dtype: float64
# Mean of '贷款名称', truncated to an integer.
# Series.mean() skips NaN and divides by the real number of values.  The old
# hard-coded divisor 8000000 was ten times the 800000 rows in the frame, so the
# result was roughly ten times too small; the value 175 recorded in the
# notebook output comes from that old formula.
n = int(df['贷款名称'].mean())
n
175
# Fill the gaps in '贷款名称' with the statistic `n` computed in the previous
# cell.  The old loop ignored `n` and always inserted the literal 2.
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['贷款名称'] = df.pop('贷款名称').fillna(n)

# Inspect the distinct values of the '就业职称' column before cleaning it.
df['就业职称'].unique()
array([3.20000e+02, 2.19843e+05, 3.16980e+04, ..., 2.66097e+05,
       3.62528e+05, 1.34854e+05])
# Display the '就业职称' column before its missing values are filled.
df['就业职称']
0            320.0
1         219843.0
2          31698.0
3          46854.0
4             54.0
            ...   
799995      2659.0
799996     29205.0
799997      2582.0
799998       151.0
799999        13.0
Name: 就业职称, Length: 800000, dtype: float64
# Mean of '就业职称', truncated to an integer.
# Series.mean() skips NaN and divides by the real number of values.  The old
# hard-coded divisor 8000000 was ten times the 800000 rows in the frame; the
# value 7200 recorded in the notebook output comes from that old formula.
n = int(df['就业职称'].mean())
n
7200
# Fill the gaps in '就业职称' with the statistic `n` computed in the previous
# cell.  The old loop ignored `n` and always inserted the literal 2.
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['就业职称'] = df.pop('就业职称').fillna(n)

# Display the '邮政编码的前3位数字' column before cleaning it.
df['邮政编码的前3位数字']
0         137.0
1         156.0
2         337.0
3         148.0
4         301.0
          ...  
799995    242.0
799996    563.0
799997     47.0
799998     34.0
799999     62.0
Name: 邮政编码的前3位数字, Length: 800000, dtype: float64
# Fill the gaps in '邮政编码的前3位数字' with the column median.
# Series.median() skips NaN.  The old sorted(...)[len // 2] sorted values that
# still contained NaN, and its result was never used: the literal 2 was
# inserted instead.
n = df['邮政编码的前3位数字'].median()
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['邮政编码的前3位数字'] = df.pop('邮政编码的前3位数字').fillna(n)

# Show the first and last 20 values of '债务收入比' before cleaning it.
df['债务收入比'].head(20),df['债务收入比'].tail(20)
(0     17.05
 1     27.83
 2     22.77
 3     17.21
 4     32.16
 5     17.14
 6     17.49
 7     32.60
 8     19.22
 9     24.39
 10    14.21
 11    34.63
 12     7.58
 13     5.68
 14    38.95
 15    17.27
 16    21.02
 17    17.14
 18    28.95
 19    15.55
 Name: 债务收入比, dtype: float64, 799980     7.16
 799981    10.32
 799982    26.50
 799983    27.87
 799984     9.36
 799985    36.44
 799986     6.45
 799987    29.76
 799988    19.03
 799989    32.03
 799990    20.16
 799991    29.68
 799992    20.86
 799993    18.27
 799994    23.96
 799995    19.03
 799996    15.72
 799997    12.11
 799998    29.25
 799999     8.99
 Name: 债务收入比, dtype: float64)
# Mean of '债务收入比', rounded to two decimals.
# Series.mean() skips NaN and divides by the count of present values.  The old
# divisor (8000000-239) used a row count ten times the 800000 rows in the frame
# and a hand-entered missing-value count; the value 1.83 recorded in the
# notebook output comes from that old formula.
n = round(df['债务收入比'].mean(), 2)
n
1.83
# Fill the gaps in '债务收入比' with the statistic `n` computed in the previous
# cell.  The old loop ignored `n` and always inserted the literal 2.
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['债务收入比'] = df.pop('债务收入比').fillna(n)

# Show the first and last 20 values of '公开记录清除的数量' before cleaning it.
df['公开记录清除的数量'].head(20),df['公开记录清除的数量'].tail(20)
(0     0.0
 1     0.0
 2     0.0
 3     0.0
 4     0.0
 5     0.0
 6     0.0
 7     1.0
 8     0.0
 9     0.0
 10    0.0
 11    0.0
 12    0.0
 13    0.0
 14    0.0
 15    1.0
 16    0.0
 17    0.0
 18    0.0
 19    0.0
 Name: 公开记录清除的数量, dtype: float64, 799980    1.0
 799981    0.0
 799982    0.0
 799983    0.0
 799984    1.0
 799985    0.0
 799986    0.0
 799987    0.0
 799988    0.0
 799989    0.0
 799990    0.0
 799991    0.0
 799992    1.0
 799993    0.0
 799994    0.0
 799995    0.0
 799996    0.0
 799997    0.0
 799998    0.0
 799999    0.0
 Name: 公开记录清除的数量, dtype: float64)
# Fill the gaps in '公开记录清除的数量' with the column median.
# Series.median() skips NaN.  The old sorted(...)[len // 2] sorted values that
# still contained NaN, and its result was never used: the literal 2 was
# inserted instead.
n = df['公开记录清除的数量'].median()
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['公开记录清除的数量'] = df.pop('公开记录清除的数量').fillna(n)

# Show the first and last 20 values of '循环额度利用率' before cleaning it.
df['循环额度利用率'].head(20),df['循环额度利用率'].tail(20)
(0     48.9
 1     38.9
 2     51.8
 3     52.6
 4     32.0
 5     31.1
 6      8.5
 7     59.7
 8     46.0
 9     30.6
 10    47.5
 11    80.9
 12    35.7
 13    68.8
 14    60.8
 15     3.6
 16    61.1
 17    76.9
 18    84.0
 19    57.0
 Name: 循环额度利用率, dtype: float64, 799980    70.2
 799981    36.4
 799982    38.4
 799983    28.4
 799984    46.1
 799985    70.7
 799986    91.8
 799987    49.6
 799988    87.2
 799989    52.7
 799990    62.2
 799991    93.5
 799992    47.4
 799993    47.5
 799994    60.5
 799995    46.4
 799996    98.4
 799997    51.9
 799998    61.3
 799999    72.6
 Name: 循环额度利用率, dtype: float64)
# Mean of '循环额度利用率', rounded to one decimal.
# Series.mean() skips NaN and divides by the count of present values.  The old
# divisor (8000000-239) used a row count ten times the 800000 rows in the frame
# and reused the constant 239 from the '债务收入比' cell; the value 5.2 recorded
# in the notebook output comes from that old formula.
n = round(df['循环额度利用率'].mean(), 1)
n
5.2
# Fill the gaps in '循环额度利用率' with the statistic `n` computed in the
# previous cell.  The old loop ignored `n` and always inserted the literal 2.
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['循环额度利用率'] = df.pop('循环额度利用率').fillna(n)

# Inspect the distinct labels of '就业年限(年)' before encoding them.
df['就业年限(年)'].unique()
array(['2 years', '5 years', '8 years', '10+ years', nan, '7 years',
       '9 years', '1 year', '3 years', '< 1 year', '4 years', '6 years'],
      dtype=object)
# Turn the textual employment-length labels into numbers:
# '< 1 year' -> 0, '1 year' -> 1, 'N years' -> N (2..9), '10+ years' -> 10.
# Values without an entry in the mapping (including NaN) come out as NaN.
_tenure_codes = {'< 1 year': 0, '1 year': 1, '10+ years': 10}
_tenure_codes.update({str(years) + ' years': years for years in range(2, 10)})
df['就业年限(年)'] = df['就业年限(年)'].map(_tenure_codes)
df['就业年限(年)']
0          2.0
1          5.0
2          8.0
3         10.0
4          NaN
          ... 
799995     7.0
799996    10.0
799997    10.0
799998    10.0
799999     5.0
Name: 就业年限(年), Length: 800000, dtype: float64
# Fill the gaps in the numeric '就业年限(年)' column with the column median.
# Series.median() skips NaN.  The old sorted(...)[len // 2] sorted values that
# still contained NaN, and its result was never used: the literal 2 was
# inserted instead.
n = df['就业年限(年)'].median()
# pop + re-assign moves the column to the end of the frame, as the original
# drop-and-append did.
df['就业年限(年)'] = df.pop('就业年限(年)').fillna(n)

# Preview the first 20 rows after imputation.
df.head(20)
信用证标识贷款金额贷款期限贷款利率分期付款金额贷款等级贷款等级之子级房屋所有权状况年收入验证状态...n12n13n14贷款名称就业职称邮政编码的前3位数字债务收入比公开记录清除的数量循环额度利用率就业年限(年)
0035000.0519.52917.97EE22110000.02...0.00.02.01.0320.0137.017.050.048.92.0
1118000.0518.49461.90DD2046000.02...2.02.02.01723.0219843.0156.027.830.038.95.0
2212000.0516.99298.17DD3074000.02...0.00.04.00.031698.0337.022.770.051.88.0
3311000.037.26340.96AA41118000.01...0.00.01.04.046854.0148.017.210.052.610.0
443000.0312.99101.07CC2129000.02...0.00.04.011.054.0301.032.160.032.02.0
5511000.037.99344.65AA5039000.02...0.00.00.010.051727.0512.017.140.031.17.0
662050.037.6963.95AA4035000.00...0.00.03.00.0180083.0517.017.490.08.59.0
7711500.0314.98398.54CC3130000.02...0.00.02.00.0214017.0100.032.601.059.71.0
8812000.0312.99404.27CC2260000.01...0.00.06.00.0188.0792.019.220.046.05.0
996500.0310.99212.78BB4115300.02...0.00.08.00.054.059.024.390.030.62.0
10105600.0313.67190.50BB5138000.00...0.00.03.027332.046631.0134.014.210.047.510.0
11115000.0327.27204.86EE5095000.00...0.00.03.00.0140759.0893.034.630.080.93.0
121219200.0320.00713.55DD4188000.00...0.00.08.00.0139297.0195.07.580.035.72.0
131324000.039.99774.30BB30150000.01...0.00.02.05.04967.0134.05.680.068.810.0
141416000.037.91500.72AA5150000.00...0.00.01.04.011037.0167.038.950.060.82.0
15156000.0310.49194.99BB2077000.01...0.00.00.05.0251530.0194.017.271.03.62.0
161610375.0515.61250.16DD1058000.00...0.00.02.05.0199961.0492.021.020.061.19.0
17175500.0310.99180.04BB4150000.01...0.00.00.04.081579.056.017.140.076.90.0
18186000.0321.00226.06EE2140000.00...0.01.00.049758.0323019.0140.028.950.084.010.0
191920000.0521.00541.07EE1190000.02...0.00.06.038.0258238.0305.015.550.057.09.0

20 rows × 47 columns

# Encode the letter grade as a multiple of ten: A -> 10, B -> 20, ..., G -> 70.
_grade_letters = 'ABCDEFG'
_grade_codes = {
    letter: 10 * rank
    for rank, letter in enumerate(_grade_letters, start=1)
}
df['贷款等级'] = df['贷款等级'].map(_grade_codes)

# Encode the sub-grade by its trailing digit only ('A1' -> 1, ..., 'G5' -> 5);
# the letter part is dropped in this column.
_subgrade_codes = {
    letter + str(step): step
    for letter in _grade_letters
    for step in range(1, 6)
}
df['贷款等级之子级'] = df['贷款等级之子级'].map(_subgrade_codes)
df['贷款等级之子级']
0         2
1         2
2         3
3         4
4         2
         ..
799995    4
799996    4
799997    3
799998    4
799999    3
Name: 贷款等级之子级, Length: 800000, dtype: int64
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值