pandas处理数据表中缺失数据的方法
import numpy as np
import pandas as pd
# 定义DataFrame对象
>>> df = pd.read_csv('test.csv')
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
1. df.backfill(inplace=True)
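使用缺失值之后的第一个有效值来填充该缺失值(向后填充),同df.bfill(inplace=True)。inplace=True时原地修改df并返回None;inplace=False时返回填充后的新对象,df本身不变。
注意:backfill自pandas 2.0起已被弃用,建议改用bfill。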
>>> df.backfill(inplace=False)
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
>>> df.backfill(inplace=True)
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo three 0.296236 0.538160
3 bar three -1.355619 -1.167204
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one -1.608615 -0.407458
7 foo three -1.608615 -1.025847
2. DataFrame.dropna(axis, how, inplace)
丢弃缺失值。
参数:
axis: {0 or 'index', 1 or 'columns'}, default 0
0: 丢弃有空值的行;
1: 丢弃有空值的列;
how: {'any', 'all'}, default 'any'
'any': 有一个空值即丢弃;
'all': 全为空值则丢弃;
inplace: bool, default False
是否直接更改DataFrame对象的值;为True时原地修改并返回None。
注意:下面的示例基于重新读取的原始数据(df = pd.read_csv('test.csv')),而不是上一节原地填充之后的df。
>>> df.dropna(axis=0)
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
7 foo three -1.608615 -1.025847
>>> df.dropna(axis=1)
A
0 foo
1 bar
2 foo
3 bar
4 foo
5 bar
6 foo
7 foo
3. DataFrame.fillna(value, method, axis, inplace, limit)
填充缺失值。
参数:
value: scalar, dict, Series, or DataFrame
用于填充空值的值;
method: {'backfill', 'bfill', 'pad', 'ffill', None}, default None
填充空值的方法;
axis: {0 or 'index', 1 or 'columns'}
沿哪个轴填充缺失值:
0: 沿行索引方向填充(在每一列内上下填充);
1: 沿列方向填充(在每一行内左右填充);
inplace: bool, default False
是否原地更改对象;
limit: int, default None
最多填充的个数(指定method时,为连续缺失值的最大填充个数);
>>> df = pd.read_csv('test.csv')
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
>>> df.fillna(value=0)
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo 0 0.296236 0.538160
3 bar three -1.355619 0.000000
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one 0.000000 -0.407458
7 foo three -1.608615 -1.025847
>>> df.fillna(value={'B': 'zero', 'C': 1, 'D': 0})
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo zero 0.296236 0.538160
3 bar three -1.355619 0.000000
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one 1.000000 -0.407458
7 foo three -1.608615 -1.025847
4. DataFrame.interpolate(method, axis, limit, inplace, limit_direction, limit_area)
使用插值方法填充缺失值。
参数:
method: str, default 'linear'
- 'linear': 线性填充;
- 'pad': 使用现有值填充;
- 'polynomial': 多项式填充(需同时指定order参数);
- ···
axis: {0 or 'index', 1 or 'columns', None}, default None
操作行或者列;
limit: int, optional
最多连续填充的缺失值个数;
inplace: bool, default False
是否原地修改;
limit_direction: {'forward', 'backward', 'both'}, optional
连续缺失值的填充方向;
limit_area: {None, 'inside', 'outside'}, default None
限制可填充的区域:None表示不限制;'inside'只填充被有效值包围的缺失值;'outside'只填充有效值范围之外的缺失值;
>>> df = pd.read_csv('test.csv')
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
>>> df.interpolate(method='linear')
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 -0.314522
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one -0.893458 -0.407458
7 foo three -1.608615 -1.025847
>>> df.interpolate(method='pad')
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo one 0.296236 0.538160
3 bar three -1.355619 0.538160
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one -0.178302 -0.407458
7 foo three -1.608615 -1.025847
>>> df.interpolate(method='polynomial', order=2)
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 -0.202757
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one -0.617035 -0.407458
7 foo three -1.608615 -1.025847
5. DataFrame.isna()
逐元素检查是否为空值,返回与原对象形状相同的布尔型DataFrame,同DataFrame.isnull()。
>>> df = pd.read_csv('test.csv')
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
>>> df.isna()
A B C D
0 False False False False
1 False False False False
2 False True False False
3 False False False True
4 False False False False
5 False False False False
6 False False True False
7 False False False False
6. DataFrame.notna()
逐元素检查是否不为空值,返回与原对象形状相同的布尔型DataFrame,同DataFrame.notnull()。
>>> df.notna()
A B C D
0 True True True True
1 True True True True
2 True False True True
3 True True True False
4 True True True True
5 True True True True
6 True True False True
7 True True True True
7. DataFrame.replace(to_replace, value, inplace, limit, method)
替换值。
参数:
to_replace: str, regex, list, dict, Series, int, float, or None
被替换的值;
value: scalar, dict, list, str, regex, default None
被替换为的值;
inplace: bool, default False
是否原地修改;
limit: int, default None
最多替换的数量;
method: {'pad', 'ffill', 'bfill', None}
当to_replace为标量、列表或元组且value为None时所使用的填充方法;
>>> df = pd.read_csv('test.csv')
>>> df
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one NaN -0.407458
7 foo three -1.608615 -1.025847
>>> df.replace(np.NaN, 0)
A B C D
0 foo one -1.437858 0.155025
1 bar one 1.150565 -0.614996
2 foo 0 0.296236 0.538160
3 bar three -1.355619 0.000000
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo one 0.000000 -0.407458
7 foo three -1.608615 -1.025847
>>> df.replace({'B': 'one'}, 'ONE')
A B C D
0 foo ONE -1.437858 0.155025
1 bar ONE 1.150565 -0.614996
2 foo NaN 0.296236 0.538160
3 bar three -1.355619 NaN
4 foo two -0.411405 -1.167204
5 bar two -0.178302 -0.451726
6 foo ONE NaN -0.407458
7 foo three -1.608615 -1.025847