处理dataframe的缺失值

import numpy as np
import pandas as pd
#在pandas或者numpy中要特别注意:NaN与NaN比较并不相等(np.nan == np.nan 为 False)
#而None == None为True;不过pandas在处理缺失值时会把None和np.nan都视为缺失值
None == None  # noqa: E711
True
np.nan == np.nan #还要注意的一点是NaN是float
False
#Filling missing values: fillna
df = pd.DataFrame(np.random.randn(5, 3),index=["a", "c", "e", "f", "h"],columns=["one", "two", "three"],)
df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
df2
        one       two     three
a  0.523109  1.171167  0.083875
b       NaN       NaN       NaN
c  1.144898  0.023587 -1.429626
d       NaN       NaN       NaN
e  0.850141 -1.096858 -0.877410
f -2.938262 -1.470641 -0.760278
g       NaN       NaN       NaN
h -0.358836 -1.072762  0.015806
df2.fillna(0)
        one       two     three
a  0.523109  1.171167  0.083875
b  0.000000  0.000000  0.000000
c  1.144898  0.023587 -1.429626
d  0.000000  0.000000  0.000000
e  0.850141 -1.096858 -0.877410
f -2.938262 -1.470641 -0.760278
g  0.000000  0.000000  0.000000
h -0.358836 -1.072762  0.015806
df2["one"].fillna("missing")
a    0.523109
b     missing
c      1.1449
d     missing
e    0.850141
f    -2.93826
g     missing
h   -0.358836
Name: one, dtype: object
df2.fillna(method="ffill") #使用上一行对应列的数值填充NA
        one       two     three
a  0.523109  1.171167  0.083875
b  0.523109  1.171167  0.083875
c  1.144898  0.023587 -1.429626
d  1.144898  0.023587 -1.429626
e  0.850141 -1.096858 -0.877410
f -2.938262 -1.470641 -0.760278
g -2.938262 -1.470641 -0.760278
h -0.358836 -1.072762  0.015806
df2.fillna(method="ffill", limit=1) #limit参数限制连续填充的NaN个数;不设limit时连续的缺失会被同一个值一直填充下去(这里每段缺失只有一行,所以结果与不设limit时相同)
        one       two     three
a  0.523109  1.171167  0.083875
b  0.523109  1.171167  0.083875
c  1.144898  0.023587 -1.429626
d  1.144898  0.023587 -1.429626
e  0.850141 -1.096858 -0.877410
f -2.938262 -1.470641 -0.760278
g -2.938262 -1.470641 -0.760278
h -0.358836 -1.072762  0.015806
df2.fillna(method="backfill") #df2.fillna(method="backfill",limit=1)
        one       two     three
a  0.523109  1.171167  0.083875
b  1.144898  0.023587 -1.429626
c  1.144898  0.023587 -1.429626
d  0.850141 -1.096858 -0.877410
e  0.850141 -1.096858 -0.877410
f -2.938262 -1.470641 -0.760278
g -0.358836 -1.072762  0.015806
h -0.358836 -1.072762  0.015806
dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC"))
dff.iloc[3:5, 0] = np.nan
dff.iloc[4:6, 1] = np.nan
dff.iloc[5:8, 2] = np.nan
dff
          A         B         C
0  0.214955 -0.857945 -1.136325
1 -0.213562 -0.719626  0.431266
2 -1.053291  1.812349  0.491484
3       NaN  0.016753 -0.218812
4       NaN       NaN  2.033018
5 -0.642261       NaN       NaN
6 -0.881779 -0.995053       NaN
7  1.454017  0.591962       NaN
8  2.000574 -0.420521 -0.245732
9 -0.893512 -0.472257  0.181229
dff.fillna(dff.mean())
          A         B         C
0  0.214955 -0.857945 -1.136325
1 -0.213562 -0.719626  0.431266
2 -1.053291  1.812349  0.491484
3 -0.001857  0.016753 -0.218812
4 -0.001857 -0.130542  2.033018
5 -0.642261 -0.130542  0.219447
6 -0.881779 -0.995053  0.219447
7  1.454017  0.591962  0.219447
8  2.000574 -0.420521 -0.245732
9 -0.893512 -0.472257  0.181229
dff.fillna(dff.mean()["B":"C"])
          A         B         C
0  0.214955 -0.857945 -1.136325
1 -0.213562 -0.719626  0.431266
2 -1.053291  1.812349  0.491484
3       NaN  0.016753 -0.218812
4       NaN -0.130542  2.033018
5 -0.642261 -0.130542  0.219447
6 -0.881779 -0.995053  0.219447
7  1.454017  0.591962  0.219447
8  2.000574 -0.420521 -0.245732
9 -0.893512 -0.472257  0.181229
dff.dropna(axis=0)
          A         B         C
0  0.214955 -0.857945 -1.136325
1 -0.213562 -0.719626  0.431266
2 -1.053291  1.812349  0.491484
8  2.000574 -0.420521 -0.245732
9 -0.893512 -0.472257  0.181229
dff.dropna(axis=1)
0
1
2
3
4
5
6
7
8
9
dff["A"].dropna()
0    0.214955
1   -0.213562
2   -1.053291
5   -0.642261
6   -0.881779
7    1.454017
8    2.000574
9   -0.893512
Name: A, dtype: float64
#缺失值填充方法:df.interpolate(method="") #暂不叙述
#值的替换
ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])
ser.replace(0, 5) #参数依次为to_replace(被替换的值)和value(替换后的值)
0    5.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
0    4.0
1    3.0
2    2.0
3    1.0
4    0.0
dtype: float64
df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]})
df
   a  b
0  0  5
1  1  6
2  2  7
3  3  8
4  4  9
df.replace({"a": 0, "b": 5}, 100)
     a    b
0  100  100
1    1    6
2    2    7
3    3    8
4    4    9
#String/regular expression replacement
d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
df = pd.DataFrame(d)
df
   a  b    c
0  0  a    a
1  1  b    b
2  2  .  NaN
3  3  .    d
df.replace(".", np.nan)
   a    b    c
0  0    a    a
1  1    b    b
2  2  NaN  NaN
3  3  NaN    d
df.replace(r"\s*\.\s*", np.nan, regex=True)
   a    b    c
0  0    a    a
1  1    b    b
2  2  NaN  NaN
3  3  NaN    d
df.replace(["a", "."], ["b", np.nan])
   a    b    c
0  0    b    b
1  1    b    b
2  2  NaN  NaN
3  3  NaN    d
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值