数据探索性分析
导入库和数据
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the training and test sets; the first CSV column becomes the index.
# Forward-slash relative paths are used because they resolve on Windows,
# Linux and macOS alike, whereas a backslash is treated as part of the
# file name on POSIX systems.
train = pd.read_csv('./train.csv', index_col=0)
testA = pd.read_csv('./testA.csv', index_col=0)
# Report the (rows, columns) dimensions of the training set.
train_dims = train.shape
print(train_dims)
(800000, 46)
# Report the (rows, columns) dimensions of the test set.
testA_dims = testA.shape
print(testA_dims)
(200000, 45)
数据探索性分析
查看正负样本占比
# Bar chart of how many rows fall into each value of the target column.
# The column is passed as the keyword `x`: recent seaborn releases accept
# only `data` positionally, so the old positional form collides with
# `data=train` and raises a TypeError.
sns.countplot(x='isDefault', data=train)
# NOTE(review): the labels below say "Fraud" although the plotted column is
# `isDefault` — consider rewording; left unchanged to keep the output stable.
plt.title('Class Distribution \n (0: No Fraud || 1: Fraud)')

# Ratio of rows with isDefault == 0 to rows with isDefault == 1.
n_negative = len(train[train['isDefault'] == 0])
n_positive = len(train[train['isDefault'] == 1])
print("No Fraud: Fraud =", n_negative / n_positive, ": 1")
查看缺失值占比
# Fraction of missing entries in each column of the training set.
null_counts = train.isnull().sum()
missing_value = null_counts / len(train)

# Show only the columns that contain at least one missing entry.
has_missing = missing_value > 0
print(missing_value[has_missing])
employmentTitle 0.000001
employmentLength 0.058499
postCode 0.000001
dti 0.000299
pubRecBankruptcies 0.000506
revolUtil 0.000664
title 0.000001
n0 0.050338
n1 0.050338
n2 0.050338
n3 0.050338
n4 0.041549
n5 0.050338
n6 0.050338
n7 0.050338
n8 0.050339
n9 0.050338
n10 0.041549
n11 0.087190
n12 0.050338
n13 0.050338
n14 0.050338
dtype: float64
# Horizontal bar chart of the non-zero missing ratios, sorted ascending.
nonzero_missing = missing_value[missing_value > 0]
nonzero_missing.sort_values().plot.barh(figsize=(8, 8))
缺失值主要集中于匿名特征(n0–n14),此外 employmentLength 的缺失比率约为 5.85%;从上图来看,值得注意的是多个匿名特征的缺失比率几乎完全相同(约 5.03%),推测这些缺失可能出现在同一批样本上,有待进一步验证。
# 查看测试集testA的缺失值占比
missing_value = testA.isnull().sum() / len<