主表:客户详细信息
application_trian/test.csv
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
Preprocessing application_train.csv
application_train = pd.read_csv('data/application_train.csv')
application_train.head()
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | … | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | … | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
feature_names = application_train.columns
feature_names
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT', 'AMT_ANNUITY',
...
'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR'],
dtype='object', length=122)
CODE_GENDER
application_train['CODE_GENDER'].unique()
array(['M', 'F', 'XNA'], dtype=object)
DAYS_EMPLOYED 365243是不可能的,异常值
application_train.loc[application_train['DAYS_EMPLOYED'] >0]['DAYS_EMPLOYED'].value_counts()
365243 55374
Name: DAYS_EMPLOYED, dtype: int64
用np.nan来取代缺失值和异常值
application_train['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
application_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
申请贷款前多少天更换了电话
DAYS_LAST_PHONE_CHANGE
application_train['DAYS_LAST_PHONE_CHANGE'].value_counts().head(5)
0.0 37672
-1.0 2812
-2.0 2318
-3.0 1763
-4.0 1285
Name: DAYS_LAST_PHONE_CHANGE, dtype: int64
application_train['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)
NAME_FAMILY_STATUS
application_train['NAME_FAMILY_STATUS'].unique()
array(['Single / not married', 'Married', 'Civil marriage', 'Widow',
'Separated', 'Unknown'], dtype=object)
application_train['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)
ORGANIZATION_TYPE
application_train['ORGANIZATION_TYPE'].unique()
array(['Business Entity Type 3', 'School', 'Government', 'Religion',
'Other', 'XNA', 'Electricity', 'Medicine', 'Business Entity Type 2',
'Self-employed', 'Transport: type 2', 'Construction', 'Housing',
'Kindergarten', 'Trade: type 7', 'Industry: type 11', 'Military',
'Services', 'Security Ministries', 'Transport: type 4',
'Industry: type 1', 'Emergency', 'Security', 'Trade: type 2',
'University', 'Transport: type 3', 'Police',
'Business Entity Type 1', 'Postal', 'Industry: type 4',
'Agriculture', 'Restaurant', 'Culture', 'Hotel', 'Industry: type 7',
'Trade: type 3', 'Industry: type 3', 'Bank', 'Industry: type 9',
'Insurance', 'Trade: type 6', 'Industry: type 2',
'Transport: type 1', 'Industry: type 12', 'Mobile', 'Trade: type 1',
'Industry: type 5', 'Industry: type 10', 'Legal Services',
'Advertising', 'Trade: type 5', 'Cleaning', 'Industry: type 13',
'Trade: type 4', 'Telecom', 'Industry: type 8', 'Realtor',
'Industry: type 6'], dtype=object)
application_train['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
application_train.fillna(np.nan).tail(5)
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | … | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | … | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | … | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | … | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 171000.0 | 370107.0 | 20205.0 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 157500.0 | 675000.0 | 49117.5 | … | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
5 rows × 122 columns
Feature Engineering
application_train.csv构造新特征
application_train['annuity_income_percentage'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']
application_train['car_to_birth_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_BIRTH']
application_train['car_to_employ_ratio'] = application_train['OWN_CAR_AGE'] / application_train['DAYS_EMPLOYED']
application_train['children_ratio'] = application_train['CNT_CHILDREN'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_to_annuity_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_ANNUITY']
application_train['credit_to_goods_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_GOODS_PRICE']
application_train['credit_to_income_ratio'] = application_train['AMT_CREDIT'] / application_train['AMT_INCOME_TOTAL']
application_train['days_employed_percentage'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']
application_train['income_credit_percentage'] = application_train['AMT_INCOME_TOTAL'] / application_train['AMT_CREDIT']
application_train['income_per_child'] = application_train['AMT_INCOME_TOTAL'] / (1 + application_train['CNT_CHILDREN'])
application_train['income_per_person'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']
application_train['payment_rate'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']
application_train['phone_to_birth_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_BIRTH']
application_train['phone_to_employ_ratio'] = application_train['DAYS_LAST_PHONE_CHANGE'] / application_train['DAYS_EMPLOYED']
application_train['cnt_non_child'] = application_train['CNT_FAM_MEMBERS'] - application_train['CNT_CHILDREN']
application_train['child_to_non_child_ratio'] = application_train['CNT_CHILDREN'] / application_train['cnt_non_child']
application_train['income_per_non_child'] = application_train['AMT_INCOME_TOTAL'] / application_train['cnt_non_child']
application_train['credit_per_person'] = application_train['AMT_CREDIT'] / application_train['CNT_FAM_MEMBERS']
application_train['credit_per_child'] = application_train['AMT_CREDIT'] / (1 + application_train['CNT_CHILDREN'])
application_train['credit_per_non_child'] = application_train['AMT_CREDIT'] / application_train['cnt_non_child']
application_train['external_sources_weighted'] = application_train.EXT_SOURCE_1 * 2 + application_train.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
application_train['retirement'] = (application_train.DAYS_BIRTH < -18000).astype(int)
application_train['long_employment'] = (application_train.DAYS_EMPLOYED < -4000).astype(int)
EXT_SOURCE 外部数据源
for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
application_train['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
application_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
application_train.shape
(307511, 150)
Preprocessing application_test.csv
test = pd.read_csv('data/application_test.csv')
test['CODE_GENDER'].replace('XNA', np.nan, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
test['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace = True)
test['NAME_FAMILY_STATUS'].replace('Unknow', np.nan, inplace = True)
test['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace = True)
test = test.fillna(np.nan)
Feature Engineering
test['annuity_income_percentage'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
test['car_to_birth_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_BIRTH']
test['car_to_employ_ratio'] = test['OWN_CAR_AGE'] / test['DAYS_EMPLOYED']
test['children_ratio'] = test['CNT_CHILDREN'] / test['CNT_FAM_MEMBERS']
test['credit_to_annuity_ratio'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']
test['credit_to_goods_ratio'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']
test['credit_to_income_ratio'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
test['days_employed_percentage'] = test['DAYS_EMPLOYED'] / test['DAYS_BIRTH']
test['income_credit_percentage'] = test['AMT_INCOME_TOTAL'] / test['AMT_CREDIT']
test['income_per_child'] = test['AMT_INCOME_TOTAL'] / (1 + test['CNT_CHILDREN'])
test['income_per_person'] = test['AMT_INCOME_TOTAL'] / test['CNT_FAM_MEMBERS']
test['payment_rate'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']
test['phone_to_birth_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_BIRTH']
test['phone_to_employ_ratio'] = test['DAYS_LAST_PHONE_CHANGE'] / test['DAYS_EMPLOYED']
test['cnt_non_child'] = test['CNT_FAM_MEMBERS'] - test['CNT_CHILDREN']
test['child_to_non_child_ratio'] = test['CNT_CHILDREN'] / test['cnt_non_child']
test['income_per_non_child'] = test['AMT_INCOME_TOTAL'] / test['cnt_non_child']
test['credit_per_person'] = test['AMT_CREDIT'] / test['CNT_FAM_MEMBERS']
test['credit_per_child'] = test['AMT_CREDIT'] / (1 + test['CNT_CHILDREN'])
test['credit_per_non_child'] = test['AMT_CREDIT'] / test['cnt_non_child']
test['external_sources_weighted'] = test.EXT_SOURCE_1 * 2 + test.EXT_SOURCE_2 * 3 + application_train.EXT_SOURCE_3 * 4
test['retirement'] = (test.DAYS_BIRTH < -18000).astype(int)
test['long_employment'] = (test.DAYS_EMPLOYED < -4000).astype(int)
for feature_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
test['external_sources_{}'.format(feature_name)] = eval('np.{}'.format(feature_name))(
test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
test.shape
(48744, 149)
相关性
correlations = application_train.corr()['TARGET'].sort_values()
correlations.iloc[:-1][:].plot(kind = 'barh', figsize = (10, 60))
Save data
application_train.to_csv('data/no_select_train.csv',index = False)
test.to_csv('data/no_select_test.csv', index = False)