缺失值标志:NaN(Not a Number)
缺失值的处理:1-删除dropna 2-填充fillna
1-删除
from pandas import Series
import numpy as np
# Build a string Series whose third entry (index 2) is a missing value (NaN).
values = ['a', 'b', np.nan, 'd', 'e']
strSer = Series(values)
print(strSer)
输出结果:
0 a
1 b
2 NaN
3 d
4 e
dtype: object
# isnull: boolean mask that is True wherever the value is missing
null_mask = strSer.isnull()
print(null_mask)
输出结果:
0 False
1 False
2 True
3 False
4 False
dtype: bool
# notnull: boolean mask that is True wherever a value is present
present_mask = strSer.notnull()
print(present_mask)
输出结果:
0 True
1 True
2 False
3 True
4 True
dtype: bool
# Keep only the entries that are not missing, using boolean-mask indexing
keep = strSer.notnull()
r = strSer[keep]
print(r)
输出结果:
0 a
1 b
3 d
4 e
dtype: object
# dropna -- drop the missing values; the result is bound to r and strSer
# itself is not reassigned
r = strSer.dropna()
print(r)
输出结果:
0 a
1 b
3 d
4 e
dtype: object
****************************************************************************************
****************************************************************************************
from pandas import DataFrame
import numpy as np
# Sample frame: row 'a' is missing column 'two', row 'c' is missing both columns.
rows = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = DataFrame(
    data=rows,
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)
输出结果:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
# Drop every row that contains at least one missing value
# (how='any' is the default, spelled out here for clarity)
r = df.dropna(how='any')
print(r)
输出结果:
one two
b 7.10 -4.5
d 0.75 -1.3
# Drop only the rows in which every value is missing
r = df.dropna(axis=0, how='all')
print(r)
输出结果:
one two
a 1.40 NaN
b 7.10 -4.5
d 0.75 -1.3
2-填充
from pandas import DataFrame
import numpy as np
# Sample frame: row 'a' is missing column 'two', row 'c' is missing both columns.
rows = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = DataFrame(
    data=rows,
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)
输出结果:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
# fillna: replace every missing value with a single constant (here 0)
fill_value = 0
r = df.fillna(fill_value)
print(r)
输出结果:
one two
a 1.40 0.0
b 7.10 -4.5
c 0.00 0.0
d 0.75 -1.3
# Per-column fill values: 0 for column 'one', 1 for column 'two'
column_fills = {'one': 0, 'two': 1}
r = df.fillna(column_fills)
print(r)
输出结果:
one two
a 1.40 1.0
b 7.10 -4.5
c 0.00 1.0
d 0.75 -1.3
# Filling missing values with the mean: first compute the mean of each column
# (missing values are skipped by default when averaging)
column_means = df.mean()
print(column_means)
输出结果:
one 3.083333
two -2.900000
dtype: float64
# Fill each column's missing values with that column's own mean
r = df.fillna(value=df.mean())
print(r)
输出结果:
one two
a 1.400000 -2.9
b 7.100000 -4.5
c 3.083333 -2.9
d 0.750000 -1.3