【Python数据分析与可视化】Pandas数据载入与预处理-实训
导包
import pandas as pd
import matplotlib. pyplot as plt
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
% matplotlib inline
读取数据
# Load the workbook into a DataFrame, then preview its first rows.
f1 = pd.read_excel('tips_mod.xls')
f1.head()
   total_bill   tip     sex smoker  day    time  size
0       16.99   NaN  Female     No  Sun  Dinner   2.0
1         NaN  1.66    Male     No  Mon   Diner   3.0
2       21.01   NaN    Male     No  Sun    Dier   3.0
3       23.68  3.31    Male     No  Sun   Diner   6.0
4         NaN  3.61  Female     No  Sun  Dinner   4.0
分析数据
1、查看数据的描述和统计信息;
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
f1.describe()
       total_bill         tip        size
count  236.000000  236.000000  240.000000
mean    19.792161    3.018051    2.566667
std      8.898466    1.394868    0.970126
min      3.070000    1.000000    1.000000
25%     13.277500    2.000000    2.000000
50%     17.815000    2.960000    2.000000
75%     24.065000    3.627500    3.000000
max     50.810000   10.000000    6.000000
2、显示聚餐时间段time的不重复值;
# List the distinct values (NaN included) found in the 'time' column.
time_column = f1['time']
unique_time = time_column.unique()
unique_time
array(['Dinner', 'Diner', 'Dier', nan, 'Lunch'], dtype=object)
3、修改拼写错误的字段值;
# Normalise the misspelt meal labels to 'Dinner'. The nested mapping restricts
# the replacement to the 'time' column, so identical strings that might occur
# in any other column are left untouched.
f1 = f1.replace({'time': {'Diner': 'Dinner', 'Dier': 'Dinner'}})
# Re-check the distinct values to confirm the misspellings are gone.
unique_time = f1['time'].unique()
unique_time
array(['Dinner', nan, 'Lunch'], dtype=object)
4、检测数据中的缺失值;
# Build a boolean mask of missing cells, then total it per column.
missing_mask = f1.isnull()
f1_null = missing_mask.sum()
f1_null
total_bill 8
tip 8
sex 5
smoker 3
day 2
time 5
size 4
dtype: int64
5、删除一行内有两个缺失值的数据;
# Collect the index labels of the rows that contain exactly two missing
# values. Summing the null mask along axis=1 gives the per-row count and
# replaces an explicit iterrows() loop.
nan_list = f1.index[f1.isnull().sum(axis=1) == 2].tolist()
print(nan_list)
# drop() returns a new DataFrame and leaves its input unchanged, so the result
# has to be bound back to f1 for the rows to actually be removed.
f1 = f1.drop(index=nan_list)
# Per-column missing-value counts after the removal.
f1.isnull().sum()
[16, 23, 27, 42]
total_bill 8
tip 8
sex 5
smoker 3
day 2
time 5
size 4
dtype: int64
6、删除sex或time为空的行;
# Remove, in place, every row that lacks a value in 'sex' or in 'time'.
required_columns = ['sex', 'time']
f1.dropna(axis=0, how='any', subset=required_columns, inplace=True)
# Per-column missing-value counts after the removal.
f1.isnull().sum()
total_bill 6
tip 8
sex 0
smoker 3
day 2
time 0
size 4
dtype: int64
7、对剩余有空缺的数值型数据(total_bill、tip、size)用各列平均值替换;
# Fill the gaps of each numeric column with that column's own mean.
for column in ('total_bill', 'tip', 'size'):
    column_mean = f1[column].mean()
    f1[column] = f1[column].fillna(column_mean)
# Per-column missing-value counts after the fill.
f1.isnull().sum()
total_bill 0
tip 0
sex 0
smoker 3
day 2
time 0
size 0
dtype: int64