数据清洗+特征构造:application_{train|test}.csv

主表:客户详细信息

application_trian/test.csv


import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib.pyplot as plt

Preprocessing application_train.csv

application_train = pd.read_csv('data/application_train.csv')
application_train.head()
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYFLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
01000021Cash loansMNY0202500.0406597.524700.500000.00.00.00.00.01.0
11000030Cash loansFNN0270000.01293502.535698.500000.00.00.00.00.00.0
21000040Revolving loansMYY067500.0135000.06750.000000.00.00.00.00.00.0
31000060Cash loansFNY0135000.0312682.529686.50000NaNNaNNaNNaNNaNNaN
41000070Cash loansMNY0121500.0513000.021865.500000.00.00.00.00.00.0

5 rows × 122 columns

feature_names = application_train.columns
feature_names
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

CODE_GENDER

application_train['CODE_GENDER'].unique()
array(['M', 'F', 'XNA'], dtype=object)

DAYS_EMPLOYED 365243是不可能的,异常值

application_train.loc[application_train['DAYS_EMPLOYED'] >0]['DAYS_EMPLOYED'].value_counts()
365243    55374
Name: DAYS_EMPLOYED, dtype: int64

用np.nan来取代缺失值和异常值

application_train['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
application_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)

申请贷款前多少天更换了电话

DAYS_LAST_PHONE_CHANGE

application_train['DAYS_LAST_PHONE_CHANGE'].value_counts().head(5)
 0.0    37672
-1.0     2812
-2.0     2318
-3.0     1763
-4.0     1285
Name: DAYS_LAST_PHONE_CHANGE, dtype: int64
application_train['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)

NAME_FAMILY_STATUS

application_train['NAME_FAMILY_STATUS'].unique()
array(['Single / not married', 'Married', 'Civil marriage', 'Widow',
       'Separated', 'Unknown'], dtype=object)
application_train['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)

ORGANIZATION_TYPE

application_train['ORGANIZATION_TYPE'].unique()
array(['Business Entity Type 3', 'School', 'Government', 'Religion',
       'Other', 'XNA', 'Electricity', 'Medicine', 'Business Entity Type 2',
       'Self-employed', 'Transport: type 2', 'Construction', 'Housing',
       'Kindergarten', 'Trade: type 7', 'Industry: type 11', 'Military',
       'Services', 'Security Ministries', 'Transport: type 4',
       'Industry: type 1', 'Emergency', 'Security', 'Trade: type 2',
       'University', 'Transport: type 3', 'Police',
       'Business Entity Type 1', 'Postal', 'Industry: type 4',
       'Agriculture', 'Restaurant', 'Culture', 'Hotel', 'Industry: type 7',
       'Trade: type 3', 'Industry: type 3', 'Bank', 'Industry: type 9',
       'Insurance', 'Trade: type 6', 'Industry: type 2',
       'Transport: type 1', 'Industry: type 12', 'Mobile', 'Trade: type 1',
       'Industry: type 5', 'Industry: type 10', 'Legal Services',
       'Advertising', 'Trade: type 5', 'Cleaning', 'Industry: type 13',
       'Trade: type 4', 'Telecom', 'Industry: type 8', 'Realtor',
       'Industry: type 6'], dtype=object)
application_train['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
application_train.fillna(np.nan).tail(5)
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITYFLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
3075064562510Cash loansMNN0157500.0254700.027558.00000NaNNaNNaNNaNNaNNaN
3075074562520Cash loansFNY072000.0269550.012001.50000NaNNaNNaNNaNNaNNaN
3075084562530Cash loansFNY0153000.0677664.029979.000001.00.00.01.00.01.0
3075094562541Cash loansFNY0171000.0370107.020205.000000.00.00.00.00.00.0
3075104562550Cash loansFNN0157500.0675000.049117.500000.00.00.02.00.01.0

5 rows × 122 columns

Feature Engineering

application_train.csv构造新特征

application_train['annuity_income_percentage'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']
application_train['car_to_birth_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_BIRTH']
application_train['car_to_employ_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_EMPLOYED']
application_train['children_ratio'] = application_train['CNT_CHILDREN'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_to_annuity_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_ANNUITY']
application_train['credit_to_goods_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_GOODS_PRICE']
application_train['credit_to_income_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_INCOME_TOTAL']
application_train['days_employed_percentage'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']
application_train['income_credit_percentage'] = application_train['AMT_INCOME_TOTAL'] / application_train['AMT_CREDIT']
application_train['income_per_child'] = application_train['AMT_INCOME_TOTAL'] / (1 + application_train['CNT_CHILDREN'])
application_train['income_per_person'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']
application_train['payment_rate'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']
application_train['phone_to_birth_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_BIRTH']
application_train['phone_to_employ_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_EMPLOYED']
application_train['cnt_non_child'] = application_train['CNT_FAM_MEMBERS'] - application_train['CNT_CHILDREN']
application_train['child_to_non_child_ratio'] = application_train['CNT_CHILDREN'] / application_train['cnt_non_child']
application_train['income_per_non_child'] = application_train['AMT_INCOME_TOTAL'] / application_train['cnt_non_child']
application_train['credit_per_person'] = application_train['AMT_CREDIT'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_per_child'] = application_train['AMT_CREDIT'] / (1 + application_train['CNT_CHILDREN'])
application_train['credit_per_non_child'] = application_train['AMT_CREDIT'] / application_train['cnt_non_child']
application_train['external_sources_weighted'] = application_train.EXT_SOURCE_1 * 2 + application_train.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
application_train['retirement'] = (application_train.DAYS_BIRTH < -18000).astype(int)
application_train['long_employment'] = (application_train.DAYS_EMPLOYED < -4000).astype(int)

EXT_SOURCE 外部数据源

for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
    application_train['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
        application_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
application_train.shape
(307511, 150)

Preprocessing application_test.csv

test = pd.read_csv('data/application_test.csv')

test['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
test['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)
test['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)
test['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
test = test.fillna(np.nan)

Feature Engineering

test['annuity_income_percentage'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
test['car_to_birth_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_BIRTH']
test['car_to_employ_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_EMPLOYED']
test['children_ratio'] = test['CNT_CHILDREN'] / test['CNT_FAM_MEMBERS']
test['credit_to_annuity_ratio'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']
test['credit_to_goods_ratio'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']
test['credit_to_income_ratio'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
test['days_employed_percentage'] = test['DAYS_EMPLOYED'] / test['DAYS_BIRTH']
test['income_credit_percentage'] = test['AMT_INCOME_TOTAL'] / test['AMT_CREDIT']
test['income_per_child'] = test['AMT_INCOME_TOTAL'] / (1 + test['CNT_CHILDREN'])
test['income_per_person'] = test['AMT_INCOME_TOTAL'] / test['CNT_FAM_MEMBERS']
test['payment_rate'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']
test['phone_to_birth_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_BIRTH']
test['phone_to_employ_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_EMPLOYED']
test['cnt_non_child'] = test['CNT_FAM_MEMBERS'] - test['CNT_CHILDREN']
test['child_to_non_child_ratio'] = test['CNT_CHILDREN'] / test['cnt_non_child']
test['income_per_non_child'] = test['AMT_INCOME_TOTAL'] / test['cnt_non_child']
test['credit_per_person'] = test['AMT_CREDIT'] / test['CNT_FAM_MEMBERS']
test['credit_per_child'] = test['AMT_CREDIT'] / (1 + test['CNT_CHILDREN'])
test['credit_per_non_child'] = test['AMT_CREDIT'] / test['cnt_non_child']
test['external_sources_weighted'] = test.EXT_SOURCE_1 * 2 + test.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
test['retirement'] = (test.DAYS_BIRTH < -18000).astype(int)
test['long_employment'] = (test.DAYS_EMPLOYED < -4000).astype(int)
for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
    test['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
        test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
test.shape
(48744, 149)

相关性

correlations = application_train.corr()['TARGET'].sort_values()
correlations.iloc[:-1][:].plot(kind = 'barh', figsize = (10, 60))

这里写图片描述

Save data

application_train.to_csv('data/no_select_train.csv',index = False)
test.to_csv('data/no_select_test.csv', index = False)
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值