数据分析-数据处理-pandas处理缺失值

缺失值处理

找出缺失值

# 处理缺失值
import pandas as pd
import numpy as np

# Create a 10x6 DataFrame of random normal values, then blank out a few
# regions so the following examples have missing values to work with.
df = pd.DataFrame(np.random.randn(10, 6))

# (row selector, column selector) pairs whose cells are overwritten with None;
# these cells show up as NaN when the frame is printed.
missing_regions = (
    (slice(None, 4), 1),
    (slice(None, 2), slice(4, 6)),
    (6, slice(3, 5)),
    (8, slice(0, 2)),
)
for row_sel, col_sel in missing_regions:
    df.iloc[row_sel, col_sel] = None

print(df)
          0         1         2         3         4         5
0  1.905276       NaN -0.978137  1.682227       NaN       NaN
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709       NaN       NaN -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Detect missing values.
# Element-level check: the result has one boolean per cell of df, True where
# the cell is missing (None/NaN) and False otherwise.
res = df.isna()

res
       0      1      2      3      4      5
0  False   True  False  False   True   True
1  False   True  False  False   True   True
2  False   True  False  False  False  False
3  False   True  False  False  False  False
4  False  False  False  False  False  False
5  False  False  False  False  False  False
6  False  False  False   True   True  False
7  False  False  False  False  False  False
8   True   True  False  False  False  False
9  False  False  False  False  False  False
# Column-level check: a column maps to True as soon as it contains at least
# one missing value, and to False when it has none.
res = df.isna().any(axis=0)

res
0     True
1     True
2    False
3     True
4     True
5     True
dtype: bool
# Show only the rows that contain at least one missing value, to locate them.
# A 1-D row mask (any missing value across the columns of a row) selects each
# such row exactly once. Indexing with the 2-D boolean array instead repeats a
# row once per missing cell and needs drop_duplicates() afterwards, which
# would also merge distinct rows that happen to hold identical values.
rows_with_missing = df.isnull().any(axis=1)
result = df[rows_with_missing]
result
          0         1         2         3         4         5
0  1.905276       NaN -0.978137  1.682227       NaN       NaN
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
6 -0.842605  1.192915 -0.831709       NaN       NaN -0.073455
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
# Get the labels of the columns that contain at least one missing value.
# The per-column boolean Series is used directly as the mask on df.columns.
has_missing = df.isna().any(axis=0)
res = df.columns[has_missing]
res
Int64Index([0, 1, 3, 4, 5], dtype='int64')
# Count the missing values in each column (sum the boolean mask down the rows).
num = df.isna().sum(axis=0)
num
0    1
1    5
2    0
3    1
4    3
5    2
dtype: int64
# Count the missing values in each row; axis=1 ("columns") sums across the
# columns of every row.
num = df.isna().sum(axis="columns")
num
0    3
1    3
2    1
3    1
4    0
5    0
6    2
7    0
8    2
9    0
dtype: int64

删除缺失值所在的行列

# Drop every row that contains a missing value. The result is a new frame;
# df itself is left unchanged.
df.dropna(axis=0, how="any")
          0         1         2         3         4         5
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Drop every column that contains a missing value.
df.dropna(axis="columns")
          2
0 -0.978137
1 -1.373336
2  2.294221
3 -0.519011
4 -0.454329
5  0.726568
6 -0.831709
7 -1.222986
8 -1.024819
9 -1.535297
# Drop a row only when every value in it is missing.
df.dropna(axis=0, how="all")
          0         1         2         3         4         5
0  1.905276       NaN -0.978137  1.682227       NaN       NaN
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709       NaN       NaN -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Keep only the rows that have at least four non-missing values.
df.dropna(axis=0, thresh=4)
          0         1         2         3         4         5
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709       NaN       NaN -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Drop the rows that have a missing value in any of the subset columns
# (here columns 2 and 4); missing values in other columns are ignored.
df.dropna(axis=0, subset=[2, 4])
          0         1         2         3         4         5
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549

缺失值填充

# Replace every missing value with 0.
df.fillna(value=0)
          0         1         2         3         4         5
0  1.905276  0.000000 -0.978137  1.682227  0.000000  0.000000
1  0.164089  0.000000 -1.373336  1.388321  0.000000  0.000000
2  0.599135  0.000000  2.294221  0.737271  0.238057  0.526875
3 -0.481358  0.000000 -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709  0.000000  0.000000 -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8  0.000000  0.000000 -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Horizontal forward fill: within each row, replace a missing value with the
# nearest non-missing value to its left. A missing value with nothing valid
# before it in the row stays NaN.
# DataFrame.ffill is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill(axis=1)
          0         1         2         3         4         5
0  1.905276  1.905276 -0.978137  1.682227  1.682227  1.682227
1  0.164089  0.164089 -1.373336  1.388321  1.388321  1.388321
2  0.599135  0.599135  2.294221  0.737271  0.238057  0.526875
3 -0.481358 -0.481358 -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709 -0.831709 -0.831709 -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8       NaN       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Vertical forward fill: within each column, replace a missing value with the
# nearest non-missing value above it. A missing value with nothing valid
# above it in the column stays NaN.
# DataFrame.ffill is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.ffill(axis=0)
          0         1         2         3         4         5
0  1.905276       NaN -0.978137  1.682227       NaN       NaN
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709  1.432684 -1.403756 -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8  2.223354 -1.164356 -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Fill each column with its own replacement value: the dict maps a column
# label to the value used for that column (here column k is filled with k).
info = {col: col for col in range(6)}
df.fillna(info)
          0         1         2         3         4         5
0  1.905276  1.000000 -0.978137  1.682227  4.000000  5.000000
1  0.164089  1.000000 -1.373336  1.388321  4.000000  5.000000
2  0.599135  1.000000  2.294221  0.737271  0.238057  0.526875
3 -0.481358  1.000000 -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709  3.000000  4.000000 -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8  0.000000  1.000000 -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
# Cap the number of replacements per column: with limit=1 only one missing
# value in each column is filled and the remaining ones stay NaN.
# `info` is the column -> fill value mapping defined in the previous example.
df.fillna(info, limit=1)
          0         1         2         3         4         5
0  1.905276  1.000000 -0.978137  1.682227  4.000000  5.000000
1  0.164089       NaN -1.373336  1.388321       NaN       NaN
2  0.599135       NaN  2.294221  0.737271  0.238057  0.526875
3 -0.481358       NaN -0.519011  0.214852  0.040489  0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602  0.726568  1.432684 -1.403756  0.252173
6 -0.842605  1.192915 -0.831709  3.000000       NaN -0.073455
7  2.223354 -1.164356 -1.222986  0.009452  1.687198  0.344141
8  0.000000       NaN -1.024819  0.689602  1.210335 -0.714473
9 -0.397766  0.739464 -1.535297 -1.868259  0.724042 -1.714549
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值