一、EDA-数据探索性分析
1.1 EDA目标
熟悉并了解数据集,对数据集进行验证,以确保该数据集可用于后续的机器学习/深度学习; 了解变量间的相互关系以及变量与预测值之间的关系; 引导后续的数据处理与特征工程; 对数据进行一些图表或文字总结。
1.2 内容介绍
1.载入各种数据科学以及可视化库:
数据科学库:pandas、numpy、scipy; 可视化库:matplotlib、seaborn; 其他:warnings、missingno;
import warnings
warnings. filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
import missingno as msno
2.载入数据:
载入训练集、测试集; 简略观察数据(head()+shape);
import pandas as pd
import numpy as np
# Load the training and test CSV files from `path`; both are parsed with a
# single space as the field separator.
path = './datalab/'
Train_data = pd.read_csv(path + 'used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv(path + 'used_car_testA_20200313.csv', sep=' ')

# Quick look at the training set: its shape, then the head rows followed by
# the tail rows in one frame.
Train_data.shape
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# head-plus-tail preview on old and new pandas alike.
pd.concat([Train_data.head(), Train_data.tail()])
3.数据总览:
通过describe()来熟悉数据的相关统计量; describe()中有每列的统计量:个数count、平均值mean、标准差std、最小值min、分位数25%/50%/75%(其中50%即中位数)、最大值max; 通过info()来熟悉数据类型,了解数据每列的type
# Summary statistics for the columns of the training set.
Train_data. describe( )
# Column dtypes and non-null counts of the training set.
Train_data. info( )
4.判断数据缺失和异常
查看每列NaN存在情况; 主要的目的在于 nan存在的个数是否真的很大,如果很小一般选择填充,如果使用lgb等树模型可以直接空缺,让树自己去优化,但如果nan存在的过多、可以考虑删掉. 异常值检测;
# Per-column NaN counts for the training set.
Train_data.isnull().sum()

# Keep only the columns that contain at least one NaN and chart their counts
# in ascending order.
nan_counts = Train_data.isnull().sum()
missing = nan_counts[nan_counts > 0].sort_values()
missing.plot.bar()

# missingno views of the NaN pattern, each drawn on a random row sample.
msno.matrix(Train_data.sample(250))
msno.bar(Train_data.sample(1000))

Train_data.info()
# Inspect the raw values of notRepairedDamage before cleaning.
Train_data['notRepairedDamage'].value_counts()
# '-' is handled as a missing-value placeholder: replace it with NaN so the
# NaN counts below include it.  The statement must name `Train_data`; spelled
# as `Train- data[...]` it parses as a subtraction on undefined names and
# raises NameError.  Assigning the result back (instead of inplace=True on the
# selected column) updates the frame even when pandas copy-on-write is active.
Train_data['notRepairedDamage'] = Train_data['notRepairedDamage'].replace('-', np.nan)
# Re-check the value distribution and the per-column NaN counts afterwards.
Train_data['notRepairedDamage'].value_counts()
Train_data.isnull().sum()
# Look at the value distribution of the two columns, then remove both from
# the training and the test frame.
# NOTE(review): presumably they are dropped because value_counts() shows them
# to be (nearly) constant - confirm against the data.
Train_data['seller'].value_counts()
Train_data['offerType'].value_counts()
for frame in (Train_data, Test_data):
    for dropped_col in ('seller', 'offerType'):
        del frame[dropped_col]
5.了解预测值的分布
总体分布概况(无界约翰逊分布等); 查看skewness and kurtosis; 查看预测值的具体频数;
# Raw target column and the frequency of each distinct price.
Train_data['price']
Train_data['price'].value_counts()
import scipy.stats as st

# Histogram of price (no KDE) overlaid with a fitted distribution, one
# numbered figure per candidate distribution.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
y = Train_data['price']
candidate_fits = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_no, (label, dist) in enumerate(candidate_fits, start=1):
    plt.figure(fig_no)
    plt.title(label)
    sns.distplot(y, kde=False, fit=dist)
# Distribution of the target, followed by its skewness and kurtosis.
sns.distplot(Train_data['price'])
print("Skewness: %f" % Train_data['price'].skew())
print("Kurtosis: %f" % Train_data['price'].kurt())

# Skewness / kurtosis of every numeric column.  numeric_only=True is required
# on pandas >= 2.0, where the default no longer silently drops non-numeric
# columns and a frame with an object column raises TypeError instead.
column_skew = Train_data.skew(numeric_only=True)
column_kurt = Train_data.kurt(numeric_only=True)
column_skew, column_kurt

# Distribution of the per-column skewness and kurtosis values.
# NOTE(review): 'Kurtness' is probably meant to read 'Kurtosis'; the label is
# left untouched here.
sns.distplot(column_skew, color='blue', axlabel='Skewness')
sns.distplot(column_kurt, color='orange', axlabel='Kurtness')
# Histogram of the raw price, then of its natural logarithm, each shown as
# its own plot with the same styling.
hist_style = dict(orientation='vertical', histtype='bar', color='red')
for price_values in (Train_data['price'], np.log(Train_data['price'])):
    plt.hist(price_values, **hist_style)
    plt.show()
6.特征分为类别特征和数字特征,并对类别特征查看unique分布
# Target column.
Y_train = Train_data['price']

# Columns handled as numeric: power, kilometer and v_0 .. v_14.
numeric_features = ['power', 'kilometer'] + ['v_{}'.format(idx) for idx in range(15)]

# Columns handled as categorical.
categorical_features = [
    'name',
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'notRepairedDamage',
    'regionCode',
]

# For every categorical column print its name, the number of distinct values
# and the frequency of each value.
for cat_fea in categorical_features:
    feature_values = Train_data[cat_fea]
    print(cat_fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(cat_fea, feature_values.nunique()))
    print(feature_values.value_counts())
7.数字特征分析
相关性分析; 查看几个特征的偏度和峰度; 每个数字特征的分布可视化; 数字特征相互之间的关系可视化; 多变量互相回归关系可视化
# Add the target to the numeric feature list so it takes part in the
# correlation matrix.  The guard keeps the step idempotent: appending 'price'
# a second time (e.g. when the cell is re-run) would yield duplicate 'price'
# columns and break correlation['price'].sort_values(...) below.
if 'price' not in numeric_features:
    numeric_features.append('price')
numeric_features
Train_data.head()

# Pairwise correlation of the numeric features (target included), printed as
# each feature's correlation with price in descending order.
price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
print(correlation['price'].sort_values(ascending=False), '\n')

# Heat map of the full correlation matrix; colours saturate at 0.8.
f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)

# Remove the target again from the selected frame.
del price_numeric['price']
# One line per numeric feature: name padded to 15 characters, then its
# skewness and kurtosis in fixed-width format.
for col in numeric_features:
    col_values = Train_data[col]
    skew_text = 'Skewness: {:05.2f}'.format(col_values.skew())
    kurt_text = 'Kurtosis: {:06.2f}'.format(col_values.kurt())
    print('{:15}'.format(col), skew_text, ' ', kurt_text)
# Distribution of every numeric feature: melt to long form (variable, value)
# and draw one panel per variable, two panels per row, with independent axes.
f = pd.melt(Train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

# Pairwise scatter plots (KDE on the diagonal) for the selected columns.
sns.set()
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
# `height` is the name seaborn has used for the panel size since 0.9; the old
# `size` spelling is deprecated.
sns.pairplot(Train_data[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()
Train_data.columns
Y_train

# 5 x 2 grid of regression plots: price against one feature per panel.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(
    nrows=5, ncols=2, figsize=(24, 20))

# Feature drawn on each panel, in row-major order.  A single loop replaces
# ten hand-copied concat/regplot pairs, so a change to the plot settings is
# made in one place.
regression_panels = [
    ('v_12', ax1), ('v_8', ax2),
    ('v_0', ax3), ('power', ax4),
    ('v_5', ax5), ('v_2', ax6),
    ('v_6', ax7), ('v_1', ax8),
    ('v_14', ax9), ('v_13', ax10),
]
for feature, panel in regression_panels:
    # Two-column frame (price, feature) used as the data source of the panel.
    scatter_frame = pd.concat([Y_train, Train_data[feature]], axis=1)
    sns.regplot(x=feature, y='price', data=scatter_frame,
                scatter=True, fit_reg=True, ax=panel)
8.类别特征分析
unique分布; 类别特征箱形图可视化; 类别特征的小提琴图可视化; 类别特征的柱状图可视化; 类别特征的每个类别频数可视化(count_plot)
# Number of distinct values of every categorical column.
for fea in categorical_features:
    distinct_count = Train_data[fea].nunique()
    print(distinct_count)
categorical_features

# Restrict the categorical list to the columns plotted below.
categorical_features = [
    'model', 'brand', 'bodyType',
    'fuelType', 'gearbox', 'notRepairedDamage',
]

# Convert each of these columns to the pandas 'category' dtype; when a column
# contains NaNs, register an explicit 'MISSING' category and fill the NaNs
# with it.
for c in categorical_features:
    Train_data[c] = Train_data[c].astype('category')
    if Train_data[c].isnull().any():
        with_missing_level = Train_data[c].cat.add_categories(['MISSING'])
        Train_data[c] = with_missing_level.fillna('MISSING')
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` on the current axes.

    The x tick labels are rotated by 90 degrees afterwards.  Extra keyword
    arguments are accepted but not used.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)
# Box plot of price per category value: melt the categorical columns to long
# form (price, variable, value) and draw one panel per column, two per row.
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `height` is the name seaborn has used for the panel size since 0.9; the old
# `size` spelling is deprecated.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")
Train_data.columns
# One violin plot of price per categorical column, each shown on its own.
catg_list, target = categorical_features, 'price'
for catg in catg_list:
    sns.violinplot(data=Train_data, x=catg, y=target)
    plt.show()
# Categorical columns used by the bar and count plots.
categorical_features = [
    'model', 'brand', 'bodyType',
    'fuelType', 'gearbox', 'notRepairedDamage',
]
def bar_plot(x, y, **kwargs):
    """Draw a seaborn bar plot of ``y`` grouped by ``x`` on the current axes.

    The x tick labels are rotated by 90 degrees afterwards.  Extra keyword
    arguments are accepted but not used.
    """
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=90)
# Bar plot of price per category value: melt the categorical columns to long
# form (price, variable, value) and draw one panel per column, two per row.
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `height` is the name seaborn has used for the panel size since 0.9; the old
# `size` spelling is deprecated.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")
def count_plot(x, **kwargs):
    """Draw a seaborn count plot of the values in ``x`` on the current axes.

    The x tick labels are rotated by 90 degrees afterwards.  Extra keyword
    arguments are accepted but not used.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)
# Frequency of every category value: melt the categorical columns to long
# form (variable, value) and draw one count plot per column, two per row.
f = pd.melt(Train_data, value_vars=categorical_features)
# `height` is the name seaborn has used for the panel size since 0.9; the old
# `size` spelling is deprecated.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")
9.用pandas_profiling生成数据报告
# Build a pandas_profiling report for the training frame and write it to
# ./example.html.
# NOTE(review): the pandas_profiling package has since been renamed to
# ydata-profiling - confirm which one the target environment provides.
import pandas_profiling
pfr = pandas_profiling. ProfileReport( Train_data)
pfr. to_file( "./example.html" )
附录