1 数据总体了解:
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
import datetime
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by pandas / seaborn / matplotlib.
warnings. filterwarnings( 'ignore' )
1.1 读取数据集并了解数据集大小,原始特征维度
# Load the training set and the A-round test set from the local data folder.
train = pd.read_csv('./data/train.csv')
testA = pd.read_csv('./data/testA.csv')

# Report (rows, columns) for both frames.
print(train.shape)
print(testA.shape)

# Preview the first and last three rows of each frame.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (original row labels are preserved) on every pandas version.
pd.concat([train.head(3), train.tail(3)])
pd.concat([testA.head(3), testA.tail(3)])

# List the raw feature names.
train.columns
1.2 通过info熟悉数据类型
# Print a summary of the training frame (column dtypes, non-null counts, memory usage).
train. info( )
1.3 粗略查看数据集中各特征的基本统计量
# Descriptive statistics of the training frame (pandas summarises the numeric
# columns by default). As a bare expression this is only displayed in an
# interactive session.
train. describe( )
2 缺失值和唯一值
2.1 查看数据缺失值情况
# Number of columns that contain at least one missing value.
train.isnull().any().sum()

# Missing ratio of every column, keyed by column name.
have_null_fea_dict = (train.isnull().sum() / len(train)).to_dict()

# Keep only the columns whose missing ratio is above 50%.
fea_null = {k: v for k, v in have_null_fea_dict.items() if v > 0.5}
print(fea_null)
2.2 查看缺失特征及缺失率
# Fraction of missing entries in each column of the training frame.
missing = train.isnull().sum() / len(train)

# Restrict to columns that actually have missing data, order them from the
# lowest to the highest missing ratio, and draw them as a bar chart.
miss = missing.loc[missing > 0].sort_values(ascending=True)
miss.plot.bar()
2.3 查看训练集测试集中特征属性只有一值的特征(此处未给出对应代码)
2.4 查看特征的数值类型有哪些,对象类型有哪些
# Columns whose dtype is anything other than `object` are treated as numeric.
numerical_fea = train.select_dtypes(exclude=['object']).columns.tolist()

# Every remaining column (original column order kept) is treated as categorical.
category_fea = [col for col in train.columns if col not in numerical_fea]

print(numerical_fea)
print(category_fea)
2.5 数值型变量分析,数值型包括连续型变量和离散型变量
2.5.1 划分数值型变量中的连续型变量和离散型变量
def get_numerical_serial_fea(data, feas, max_unique=20):
    """Split numeric features into continuous and discrete groups.

    A feature is classified as discrete when ``data[fea].nunique()`` is at
    most ``max_unique``; every other feature is classified as continuous.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) for which
            ``data[fea].nunique()`` is valid for each name in ``feas``.
        feas: Iterable of column names to classify.
        max_unique: Largest distinct-value count that is still considered
            discrete. Defaults to 20, the threshold this function has
            always used.

    Returns:
        A tuple ``(numerical_serial_fea, numerical_noserial_fea)`` holding the
        continuous and the discrete feature names, each in the order they
        appear in ``feas``.
    """
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        if data[fea].nunique() <= max_unique:
            numerical_noserial_fea.append(fea)
        else:
            numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea
# Classify the numeric columns of `train` with get_numerical_serial_fea and
# print the continuous list followed by the discrete list.
numerical_serial_fea, numerical_noserial_fea= get_numerical_serial_fea( train, numerical_fea)
print ( numerical_serial_fea)
print ( numerical_noserial_fea)
2.5.2 离散型变量逐一查看
# Frequency table of each value for the listed columns, one column at a time.
# These are bare expressions: they display output only in an interactive
# session (one per notebook cell / REPL line); wrap them in print() when
# running the file as a script.
train[ 'term' ] . value_counts( )
train[ 'homeOwnership' ] . value_counts( )
train[ 'verificationStatus' ] . value_counts( )
train[ 'purpose' ] . value_counts( )
train[ 'pubRecBankruptcies' ] . value_counts( )
train[ 'initialListStatus' ] . value_counts( )
train[ 'applicationType' ] . value_counts( )
train[ 'policyCode' ] . value_counts( )
train[ 'n11' ] . value_counts( )
train[ 'n12' ] . value_counts( )
2.5.3 数值连续型变量分析 - 每个数值特征的分布可视化
# Reshape the continuous features into long form: one (variable, value) row
# per original cell, so each feature can be drawn in its own facet.
f = pd.melt(train, value_vars=numerical_serial_fea)

# One facet per feature, four facets per row, with independent x and y axes.
# FacetGrid.map returns the grid itself, so `g` is still the FacetGrid.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot are the replacements) - confirm the installed version
# still provides it.
g = sns.FacetGrid(
    f,
    col='variable',
    col_wrap=4,
    sharex=False,
    sharey=False,
).map(sns.distplot, 'value')
2.6 非数值类别变量分析
# Frequency table of each value for the listed columns, one column at a time.
# As bare expressions these only display output in an interactive session.
train[ 'grade' ] . value_counts( )
train[ 'subGrade' ] . value_counts( )
train[ 'employmentLength' ] . value_counts( )
train[ 'issueDate' ] . value_counts( )
train[ 'earliesCreditLine' ] . value_counts( )
3 变量分布可视化
3.1 单一变量分布可视化
plt.figure(figsize=(8, 8))

# Count every employmentLength value once (missing values included) and keep
# the first 20 entries of the frequency table.
employment_counts = train['employmentLength'].value_counts(dropna=False).head(20)

# Horizontal bars: counts on x, category labels on y. x/y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=employment_counts, y=employment_counts.index)
plt.show()
3.2 时间格式数据处理及查看
# Reference date; issue dates are expressed as a number of days after it.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')

# Apply the same conversion to both frames: parse issueDate as datetime, then
# store the whole-day offset from `startdate` in issueDateDT. The subtraction
# is vectorized over the column instead of applied row by row.
for frame in (train, testA):
    frame['issueDate'] = pd.to_datetime(frame['issueDate'])
    frame['issueDateDT'] = (frame['issueDate'] - startdate).dt.days

# Overlay the two histograms to compare the issue-date ranges of train and testA.
plt.hist(train['issueDateDT'], label='train')
plt.hist(testA['issueDateDT'], label='testA')
plt.legend()
3.3 利用透视图了解数据
# Total loanAmnt for every (grade, issueDateDT) combination: grades as rows,
# issue-day offsets as columns. The 'sum' alias is used instead of np.sum,
# which pandas 2.x flags with a FutureWarning when passed as aggfunc.
pd.pivot_table(
    train,
    index=['grade'],
    columns=['issueDateDT'],
    values=['loanAmnt'],
    aggfunc='sum',
)
3.4 用pandas_profiling生成数据报告
# Build a profiling report for the whole training frame and write it to
# ./example.html.
# NOTE(review): the pandas_profiling package appears to have been renamed
# ydata-profiling upstream - confirm which one is installed in this environment.
import pandas_profiling
pfr = pandas_profiling. ProfileReport( train)
pfr. to_file( "./example.html" )