pandas数据清洗(1、缺失值处理)

处理缺失值

import pandas as pd
import numpy as np
#创建含有缺失值的DataFrame
df=pd.DataFrame(np.random.randn(10,6))
df.iloc[:4,1]=None
df.iloc[:2,4:6]=None
df.iloc[6,3:5]=None
df.iloc[8,0:2]=None
print(df)

          0         1         2         3         4         5
0 -0.338736       NaN -0.265401 -0.335392       NaN       NaN
1  0.735121       NaN  0.039484  0.684837       NaN       NaN
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790       NaN       NaN -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794
判断缺失值
#元素级别的判断,把对应的所有元素的位置都列出来,元素为空或者NA就显示True,否则就是False
result=df.isnull()
print(result)

       0      1      2      3      4      5
0  False   True  False  False   True   True
1  False   True  False  False   True   True
2  False   True  False  False  False  False
3  False   True  False  False  False  False
4  False  False  False  False  False  False
5  False  False  False  False  False  False
6  False  False  False   True   True  False
7  False  False  False  False  False  False
8   True   True  False  False  False  False
9  False  False  False  False  False  False

#列级别的判断,只要该列有为空或者NA的元素,就为True,否则Flase
result=df.isnull().any()
print(result)

0     True
1     True
2    False
3     True
4     True
5     True
dtype: bool

#只显示存在缺失值的行列,清楚的确定缺失值的位置
#用drop_duplicates()去掉重复的行
result=df[df.isnull().values==True].drop_duplicates()
print(result)

          0        1         2         3         4         5
0 -0.338736      NaN -0.265401 -0.335392       NaN       NaN
1  0.735121      NaN  0.039484  0.684837       NaN       NaN
2 -0.774130      NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982      NaN  0.635784  0.476728 -1.615527  0.640051
6 -0.332347  0.18186 -0.166790       NaN       NaN -0.420950
8       NaN      NaN -1.206436 -0.041814  1.106454 -0.036821

#获得为空或者NA的列索引
result=df.columns[df.isnull().any()==True]
print(result)

Int64Index([0, 1, 3, 4, 5], dtype='int64')

#获得每列为空的数据的个数
num=df.isnull().sum()
print(num)

0    1
1    5
2    0
3    1
4    3
5    2
dtype: int64

#获得每行为空的数据的个数
num=df.isnull().sum(axis=1)
print(num)

0    3
1    3
2    1
3    1
4    0
5    0
6    2
7    0
8    2
9    0
dtype: int64
滤除缺失值
#使用默认参数,删除行,只要有空值就会删除,不改变源数据
print(df.dropna())
print('源数据不变')
print(df)

          0         1         2         3         4         5
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794
源数据不变
          0         1         2         3         4         5
0 -0.338736       NaN -0.265401 -0.335392       NaN       NaN
1  0.735121       NaN  0.039484  0.684837       NaN       NaN
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790       NaN       NaN -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#使用axis删除有空值的列
print(df.dropna(axis=1))

          2
0 -0.265401
1  0.039484
2 -0.421404
3  0.635784
4 -1.038595
5 -0.283258
6 -0.166790
7 -0.361992
8 -1.206436
9  0.402418

#所有值为缺失值才删除
print(df.dropna(how='all'))

          0         1         2         3         4         5
0 -0.338736       NaN -0.265401 -0.335392       NaN       NaN
1  0.735121       NaN  0.039484  0.684837       NaN       NaN
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790       NaN       NaN -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#至少有四个非空值才保留
print(df.dropna(thresh=4))

          0         1         2         3         4         5
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790       NaN       NaN -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#删除这个subset中含有缺失值的行
print(df.dropna(subset=[2,4]))

          0         1         2         3         4         5
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#删除这个subset中含有缺失值的列
print(df.dropna(subset=[2,4],axis=1))
          0         2         3         4         5
0 -0.338736 -0.265401 -0.335392       NaN       NaN
1  0.735121  0.039484  0.684837       NaN       NaN
2 -0.774130 -0.421404  1.472310  1.207911 -0.237753
3 -0.152982  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347 -0.166790       NaN       NaN -0.420950
7  0.553072 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.402418  0.814163  0.244678  0.072794
填充缺失值
#将缺失值替换为常量0
print(df.fillna(0))

          0         1         2         3         4         5
0 -0.338736  0.000000 -0.265401 -0.335392  0.000000  0.000000
1  0.735121  0.000000  0.039484  0.684837  0.000000  0.000000
2 -0.774130  0.000000 -0.421404  1.472310  1.207911 -0.237753
3 -0.152982  0.000000  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790  0.000000  0.000000 -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8  0.000000  0.000000 -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#横向用缺失值前面的值替代缺失值
print(df.fillna(axis=1,method='ffill'))

          0         1         2         3         4         5
0 -0.338736 -0.338736 -0.265401 -0.335392 -0.335392 -0.335392
1  0.735121  0.735121  0.039484  0.684837  0.684837  0.684837
2 -0.774130 -0.774130 -0.421404  1.472310  1.207911 -0.237753
3 -0.152982 -0.152982  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790 -0.166790 -0.166790 -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8       NaN       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#纵向用缺失值上面的值替代缺失值
print(df.fillna(axis=0,method='ffill'))

          0         1         2         3         4         5
0 -0.338736       NaN -0.265401 -0.335392       NaN       NaN
1  0.735121       NaN  0.039484  0.684837       NaN       NaN
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790 -1.969033 -0.856556 -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8  0.553072 -1.324093 -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#不同的列用不同的值填充
info={0:1,1:2,2:3,3:4,4:5,5:6}
print(df.fillna(value=info))

          0         1         2         3         4         5
0 -0.338736  2.000000 -0.265401 -0.335392  5.000000  6.000000
1  0.735121  2.000000  0.039484  0.684837  5.000000  6.000000
2 -0.774130  2.000000 -0.421404  1.472310  1.207911 -0.237753
3 -0.152982  2.000000  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790  4.000000  5.000000 -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8  1.000000  2.000000 -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794

#对每列出现的替换值有次数限制
print(df.fillna(value=info,limit=1))

          0         1         2         3         4         5
0 -0.338736  2.000000 -0.265401 -0.335392  5.000000  6.000000
1  0.735121       NaN  0.039484  0.684837       NaN       NaN
2 -0.774130       NaN -0.421404  1.472310  1.207911 -0.237753
3 -0.152982       NaN  0.635784  0.476728 -1.615527  0.640051
4  0.132076 -0.986871 -1.038595 -0.517508 -0.497453 -0.650124
5 -1.756492  1.740063 -0.283258 -1.969033 -0.856556 -0.421336
6 -0.332347  0.181860 -0.166790  4.000000       NaN -0.420950
7  0.553072 -1.324093 -0.361992 -0.710529 -0.080789 -1.034787
8  1.000000       NaN -1.206436 -0.041814  1.106454 -0.036821
9 -0.702848  0.141501  0.402418  0.814163  0.244678  0.072794
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值