此步骤是为了初步了解数据,熟悉数据为特征工程做准备,不仅要要统计量来显示数据,将数据可视化更利于观察各个类型变量之间的关系
1. 数据概览了解
导入数据分析及可视化过程需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')
train_sample = pd.read_csv('train.csv',nrows=5) #设置nrows参数,来设置读取文件的前多少行
chunker = pd.read_csv('train.csv',chunksize = 5) #设置chunksize参数,控制每次迭代数据的大小
for item in chunker:
print(type(item))
print(len(item))
train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 800000 non-null int64
1 loanAmnt 800000 non-null float64
2 term 800000 non-null int64
3 interestRate 800000 non-null float64
4 installment 800000 non-null float64
5 grade 800000 non-null object
6 subGrade 800000 non-null object
7 employmentTitle 799999 non-null float64
8 employmentLength 753201 non-null object
9 homeOwnership 800000 non-null int64
10 annualIncome 800000 non-null float64
11 verificationStatus 800000 non-null int64
12 issueDate 800000 non-null object
13 isDefault 800000 non-null int64
14 purpose 800000 non-null int64
15 postCode 799999 non-null float64
16 regionCode 800000 non-null int64
17 dti 799761 non-null float64
18 delinquency_2years 800000 non-null float64
19 ficoRangeLow 800000 non-null float64
20 ficoRangeHigh 800000 non-null float64
21 openAcc 800000 non-null float64
22 pubRec 800000 non-null float64
23 pubRecBankruptcies 799595 non-null float64
24 revolBal 800000 non-null float64
25 revolUtil 799469 non-null float64
26 totalAcc 800000 non-null float64
27 initialListStatus 800000 non-null int64
28 applicationType 800000 non-null int64
29 earliesCreditLine 800000 non-null object
30 title 799999 non-null float64
31 policyCode 800000 non-null float64
32 n0 759730 non-null float64
33 n1 759730 non-null float64
34 n2 759730 non-null float64
35 n3 759730 non-null float64
36 n4 766761 non-null float64
37 n5 759730 non-null float64
38 n6 759730 non-null float64
39 n7 759730 non-null float64
40 n8 759729 non-null float64
41 n9 759730 non-null float64
42 n10 766761 non-null float64
43 n11 730248 non-null float64
44 n12 759730 non-null float64
45 n13 759730 non-null float64
46 n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
train.describe()
2. 缺失值与唯一值
# 查看数据集中特征缺失值,唯一值
print(train.isnull().any().sum()) #有22列特征有缺失值
22
#进一步查看缺失特征中缺失率大于50%的特征
have_null_fea_dict =(train.isnull().sum()/len(train)).to_dict()
fea_null_moreThanHalf = {}
for key, value in have_null_fea_dict.items():
if value > 0.5:
fea_null_moreThanHalf[key] = value
fea_null_moreThanHalf
{}
# 具体查看确实特征及缺失率
# nan可视化
missing = train.isnull().sum()/len(train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
# 查看训练集测试集中特征属性只有一值的特征
one_value_fea = [col for col in train.columns if train[col].nunique() <=1]
one_value_fea_test = [col for col in testA.columns if testA[col].nunique() <=1]
one_value_fea
['policyCode']
# 查看特征的数值类型有哪些,对象类型有哪些
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x:x not in numerical_fea, list(train.columns)))
numerical_fea
['id',
'loanAmnt',
'term',
'interestRate',
'installment',
'employmentTitle',
'homeOwnership',
'annualIncome',
'verificationStatus',
'isDefault',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'initialListStatus',
'applicationType',
'title',
'policyCode',
'n0',
'n1',
'n2',
'n3',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n11',
'n12',
'n13',
'n14']
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
train.grade
0 E
1 D
2 D
3 A
4 C
..
799995 C
799996 A
799997 C
799998 A
799999 B
Name: grade, Length: 800000, dtype: object
3. 深入数据-查看数据类型
# 划分数值型变量中的连续变量和分类变量
# 过滤数值型类型特征
def get_numerical_serial_fea(data, feas):
numerical_serial_fea = []
numerical_noserial_fea = []
for fea in feas:
temp = data[fea].nunique()
if temp <=10:
numerical_noserial_fea.append(fea)
continue
numerical_serial_fea.append(fea)
return numerical_serial_fea, numerical_noserial_fea
numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(train, numerical_fea)
numerical_serial_fea
['id',
'loanAmnt',
'interestRate',
'installment',
'employmentTitle',
'annualIncome',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'title',
'n0',
'n1',
'n2',
'n3',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n13',
'n14']
numerical_noserial_fea
['term',
'homeOwnership',
'verificationStatus',
'isDefault',
'initialListStatus',
'applicationType',
'policyCode',
'n11',
'n12']
# 数值类别型变量分析
train['term'].value_counts() # 离散型变量
3 606902
5 193098
Name: term, dtype: int64
对于其他类别型变量同理
# 查看某一数值型变量的分布,查看变量是否符合正态分布,如果不符合正态分布的变量可以log化后在观察是否符合正态分布
# 如果想统一处理一批数据变标准化,必须把这些之前已经正态化的数据提出
plt.figure(figsize=(16,12))
plt.suptitle('Transaction Values Distribution', fontsize=22)
plt.subplot(221)
sub_plot_1 = sns.distplot(train['loanAmnt'])
sub_plot_1.set_title("loanAmnt Distribution", fontsize=18)
sub_plot_1.set_xlabel("")
sub_plot_1.set_ylabel("Probability", fontsize=15)
plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(train['loanAmnt']))
sub_plot_2.set_title("loanAmnt(Log)Distribution",fontsize=18)
sub_plot_2.set_xlabel("")
sub_plot_2.set_ylabel("Probability", fontsize=15)
# 非数值类别型变量分析
train['grade'].value_counts()
B 233690
C 227118
A 139661
D 119453
E 55661
F 19053
G 5364
Name: grade, dtype: int64
其他非数值型类别型变量同理
# 单一变量分布可视化
plt.figure(figsize=(8,8))
sns.barplot(train["employmentLength"].value_counts(dropna = False)[:20], train["employmentLength"].value_counts(dropna=False).keys()[:20])
plt.show()
# 根据y值不同可视化x某个特征的分布
# 查看类别型变量在不同y值上的分布
train_loan_fr = train.loc[train['isDefault']==1]
train_loan_nofr = train.loc[train['isDefault']==0]
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(15,8))
train_loan_fr.groupby('grade')['grade'].count().plot(kind='barh', ax=ax1, title='Count of grade fraud')
train_loan_nofr.groupby('grade')['grade'].count().plot(kind='barh', ax=ax2, title='Count of grade non-fraud')
train_loan_fr.groupby('employmentLength')['employmentLength'].count().plot(kind='barh', ax=ax3, title='Count of employmentLength fraud')
train_loan_nofr.groupby('employmentLength')['employmentLength'].count().plot(kind='barh', ax=ax4, title='Count of employmentLength non-fraud')
plt.show()
# 其次查看连续型变量在不同Y值上的分布
fig,((ax1,ax2)) = plt.subplots(1,2, figsize=(15,6))
train.loc[train['isDefault']==1]['loanAmnt'].apply(np.log).plot(kind='hist',bins=100,title='Log Loan Amt - Fraud',
color='r',
xlim=(-3, 10),
ax=ax1)
train.loc[train['isDefault']==0]['loanAmnt'].apply(np.log).plot(kind='hist',
bins=100,
title='Log Loan Amt - Not Fraud',
color='b',
xlim=(-3,10),
ax=ax2)