金融风控入门赛02
上次的分享做了赛题理解,这次来做数据分析。
一、数据总体情况
本次赛事的特征共有47列,训练集80万条,测试集20万条,基本情况如下所示:
# Standard analysis stack: pandas/numpy for tabular work,
# matplotlib/seaborn for plotting, datetime for date arithmetic.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
# Silence library warnings so notebook output stays readable.
warnings.filterwarnings('ignore')
# Load the competition data; paths are relative to the notebook's location.
data_train = pd.read_csv('../data/train.csv')
data_test_a = pd.read_csv('../data/testA.csv')
# Print column dtypes and non-null counts (transcript reproduced below).
data_train.info()
# Data columns (total 47 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 id 800000 non-null int64
# 1 loanAmnt 800000 non-null float64
# 2 term 800000 non-null int64
# 3 interestRate 800000 non-null float64
# 4 installment 800000 non-null float64
# 5 grade 800000 non-null object
# 6 subGrade 800000 non-null object
# 7 employmentTitle 799999 non-null float64
# 8 employmentLength 753201 non-null object
# 9 homeOwnership 800000 non-null int64
# 10 annualIncome 800000 non-null float64
# 11 verificationStatus 800000 non-null int64
# 12 issueDate 800000 non-null object
# 13 isDefault 800000 non-null int64
# 14 purpose 800000 non-null int64
# 15 postCode 799999 non-null float64
# 16 regionCode 800000 non-null int64
# 17 dti 799761 non-null float64
# 18 delinquency_2years 800000 non-null float64
# 19 ficoRangeLow 800000 non-null float64
# 20 ficoRangeHigh 800000 non-null float64
# 21 openAcc 800000 non-null float64
# 22 pubRec 800000 non-null float64
# 23 pubRecBankruptcies 799595 non-null float64
# 24 revolBal 800000 non-null float64
# 25 revolUtil 799469 non-null float64
# 26 totalAcc 800000 non-null float64
# 27 initialListStatus 800000 non-null int64
# 28 applicationType 800000 non-null int64
# 29 earliesCreditLine 800000 non-null object
# 30 title 799999 non-null float64
# 31 policyCode 800000 non-null float64
# 32 n0 759730 non-null float64
# 33 n1 759730 non-null float64
# 34 n2 759730 non-null float64
# 35 n2.1 759730 non-null float64
# 36 n4 766761 non-null float64
# 37 n5 759730 non-null float64
# 38 n6 759730 non-null float64
# 39 n7 759730 non-null float64
# 40 n8 759729 non-null float64
# 41 n9 759730 non-null float64
# 42 n10 766761 non-null float64
# 43 n11 730248 non-null float64
# 44 n12 759730 non-null float64
# 45 n13 759730 non-null float64
# 46 n14 759730 non-null float64
# dtypes: float64(33), int64(9), object(5)
# memory usage: 286.9+ MB
其中22列有缺失,25列没有缺失,数据缺失的列如下图所示:
可以观察到缺失列的缺失比例不是很大,可以考虑用中值填充。
二、数据类型分析
# Columns with at most one distinct value carry no information for modeling.
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]
print(one_value_fea)  # ['policyCode'] -- constant column
# Split features by dtype: non-object columns are numerical,
# the remaining (object dtype) columns are categorical.
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = [col for col in data_train.columns if col not in numerical_fea]
# BUG FIX: the original printed numerical_fea here, but the comment below
# lists the *categorical* columns -- print category_fea to match.
print(category_fea)
# ['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine'] are categorical
# Separate numerical features into continuous vs. low-cardinality discrete ones.
def get_numerical_serial_fea(data,feas):
    """Split *feas* into (continuous, discrete) name lists.

    A column with more than 10 distinct values is treated as continuous
    (serial); otherwise it is treated as a discrete numeric feature.
    Returns the pair (serial, noserial).
    """
    serial, noserial = [], []
    for name in feas:
        # Route each column into the right bucket by its cardinality.
        bucket = serial if data[name].nunique() > 10 else noserial
        bucket.append(name)
    return serial, noserial
# Apply the split to the training set's numerical columns.
numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)
# Bare expression: displays the list in a notebook cell (output below).
numerical_noserial_fea
# ['term',
# 'homeOwnership',
# 'verificationStatus',
# 'isDefault',
# 'initialListStatus',
# 'applicationType',
# 'policyCode',
# 'n11',
# 'n12']
# Visualize the distribution of every continuous numerical feature:
# melt to long format, then draw one distplot per variable in a grid.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# presumably sns.histplot/displot is the modern equivalent; verify before upgrading.
f = pd.melt(data_train, value_vars=numerical_serial_fea)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
# An explicit KDE bandwidth is passed; the author reports errors without it.
g = g.map(sns.distplot, "value",kde_kws={'bw': 0.1})
这里需要注意的是要指定带宽,我的seaborn之前总是报错就是因为没有指定。
缺失数值可视化
# NaN visualization: bar chart of the missing-value ratio per column.
missing = data_train.isnull().sum() / len(data_train)
# Keep only columns that actually have missing values, smallest ratio first.
missing = missing[missing > 0].sort_values()
missing.plot.bar()
# NOTE(review): this whole section is an exact duplicate of the
# get_numerical_serial_fea definition and call earlier in the file;
# it is redundant (harmless to re-run) and could be removed.
# Filter numerical features into continuous vs. discrete categories.
def get_numerical_serial_fea(data,feas):
    # Continuous (serial) vs. discrete (noserial) feature names.
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        # At most 10 distinct values -> treat as discrete.
        if temp <= 10:
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)
    return numerical_serial_fea,numerical_noserial_fea
numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)
print(numerical_noserial_fea)
# ['term',
# 'homeOwnership',
# 'verificationStatus',
# 'isDefault',
# 'initialListStatus',
# 'applicationType',
# 'policyCode',
# 'n11',
# 'n12']
三、时间特征转换
# Convert issueDate to datetime; issueDateDT is the number of days since
# the earliest date in the dataset (2007-06-01).
data_train['issueDate'] = pd.to_datetime(data_train['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data_train['issueDateDT'] = data_train['issueDate'].apply(lambda x: x - startdate).dt.days
# Same conversion for the test set.
# BUG FIX: the original converted data_train['issueDate'] here and assigned
# it into data_test_a, clobbering the test set's own dates (and misaligning
# an 800k-row series into a 200k-row frame); read from data_test_a instead.
data_test_a['issueDate'] = pd.to_datetime(data_test_a['issueDate'], format='%Y-%m-%d')
data_test_a['issueDateDT'] = data_test_a['issueDate'].apply(lambda x: x - startdate).dt.days
plt.hist(data_train['issueDateDT'], label='train')
plt.hist(data_test_a['issueDateDT'], label='test')
plt.legend()
plt.title('Distribution of issueDateDT dates')
# train and test issueDateDT ranges overlap, so a purely time-based
# train/validation split would not be reliable here.