基于 pandas 的数据清洗
- 原始数据中可能存在缺失值（空值）
- 重复值
- 异常值
None
np.nan（NaN）
import numpy as np
# Contrast Python's None with NumPy's NaN as "missing value" markers.

# None is an ordinary Python object whose type is NoneType.
none_type = type(None)

# None does not support arithmetic: `None + 1` raises TypeError. Catch the
# error so the demonstration does not abort everything that follows it.
try:
    none_plus_one = None + 1
except TypeError as exc:
    none_plus_one = exc

# np.nan is a float, so arithmetic is allowed; the result is still NaN.
nan_plus_one = np.nan + 1
常用方法：isnull、notnull、any、all、dropna、fillna
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
# Build an 8x6 frame of random integers in [0, 100) and punch three holes in
# it so the missing-value tools can be demonstrated.
df = DataFrame(data=np.random.randint(0, 100, size=(8, 6)))

# Columns 2, 3 and 4 are about to receive missing values. Integer columns
# cannot hold NaN, so cast them to float explicitly instead of relying on
# pandas' silent int -> float upcasting on assignment, which is deprecated.
df = df.astype({2: float, 3: float, 4: float})

# Both None and np.nan are stored as NaN in a float column.
df.iloc[2, 3] = None
df.iloc[4, 4] = np.nan
df.iloc[5, 2] = None
df
0 1 2 3 4 5 0 19 32 76.0 22.0 49.0 22 1 63 21 85.0 86.0 12.0 79 2 55 76 93.0 NaN 78.0 50 3 24 40 36.0 63.0 30.0 92 4 38 66 93.0 42.0 NaN 89 5 81 59 NaN 70.0 83.0 14 6 42 92 19.0 9.0 30.0 74 7 12 42 91.0 55.0 87.0 15
# Boolean Series, one entry per row: True when the row holds at least one
# missing value.
rows_with_null = df.isnull().any(axis=1)
rows_with_null

# Index labels of the rows flagged above.
drop_index = df[rows_with_null].index

# drop() returns a new frame without those rows; df itself is not modified
# because the result is not assigned back.
df.drop(index=drop_index)
0 1 2 3 4 5 0 19 32 76.0 22.0 49.0 22 1 63 21 85.0 86.0 12.0 79 3 24 40 36.0 63.0 30.0 92 6 42 92 19.0 9.0 30.0 74 7 12 42 91.0 55.0 87.0 15
# Show df again: the drop() result above was not assigned back, so the
# original frame still contains the rows with missing values.
df
0 1 2 3 4 5 0 19 32 76.0 22.0 49.0 22 1 63 21 85.0 86.0 12.0 79 2 55 76 93.0 NaN 78.0 50 3 24 40 36.0 63.0 30.0 92 4 38 66 93.0 42.0 NaN 89 5 81 59 NaN 70.0 83.0 14 6 42 92 19.0 9.0 30.0 74 7 12 42 91.0 55.0 87.0 15
# Backward fill along axis=1: each missing cell takes the next valid value to
# its right within the same row. DataFrame.bfill replaces the deprecated
# fillna(method='bfill') spelling. The result is a new frame; df is unchanged.
df.bfill(axis=1)
C:\Users\LENOVO\AppData\Local\Temp\ipykernel_15040\198194620.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
df.fillna(method = 'bfill',axis=1) # 向前填充 使用水平方向的向前填充去补充空值
0 1 2 3 4 5 0 19.0 32.0 76.0 22.0 49.0 22.0 1 63.0 21.0 85.0 86.0 12.0 79.0 2 55.0 76.0 93.0 78.0 78.0 50.0 3 24.0 40.0 36.0 63.0 30.0 92.0 4 38.0 66.0 93.0 42.0 89.0 89.0 5 81.0 59.0 70.0 70.0 83.0 14.0 6 42.0 92.0 19.0 9.0 30.0 74.0 7 12.0 42.0 91.0 55.0 87.0 15.0
# Build an 8x4 frame of random integers in [0, 100), then overwrite rows
# 2, 4 and 5 with the same all-zero record so the frame contains duplicates.
df = DataFrame(data=np.random.randint(0, 100, size=(8, 4)))
for row_pos in (2, 4, 5):
    df.iloc[row_pos] = [0, 0, 0, 0]
df
0 1 2 3 0 90 54 2 16 1 27 68 44 98 2 0 0 0 0 3 31 90 76 48 4 0 0 0 0 5 0 0 0 0 6 80 99 85 93 7 66 48 34 23
# Remove repeated rows, keeping the first occurrence of each (keep='first').
# Returns a new frame; df itself is not modified.
df. drop_duplicates( keep= 'first' )
0 1 2 3 0 90 54 2 16 1 27 68 44 98 2 0 0 0 0 3 31 90 76 48 6 80 99 85 93 7 66 48 34 23
# 1000x3 frame of uniform random floats in [0, 1) with columns A, B, C.
df = DataFrame(np.random.random(size=(1000, 3)), columns=list("ABC"))

# Threshold used by this exercise: twice the standard deviation of column C.
twice_std = 2 * df["C"].std()

# Rows whose C value exceeds the threshold are treated as outliers.
is_outlier = df["C"] > twice_std
is_outlier

# Keep only the rows that are not flagged.
df.loc[~is_outlier]
A B C 0 0.857403 0.925467 0.414313 2 0.796859 0.042008 0.473506 3 0.142472 0.324235 0.191976 4 0.468402 0.044693 0.383667 5 0.333189 0.449965 0.149515 ... ... ... ... 983 0.095379 0.965317 0.104301 988 0.053357 0.864976 0.320073 991 0.505450 0.055545 0.405785 995 0.503850 0.310566 0.167394 998 0.583521 0.128341 0.417372
559 rows × 3 columns