缺失值处理
找出缺失值
import pandas as pd
import numpy as np
# Build a 10x6 frame of standard-normal samples, then blank out a handful of
# cells so the rest of the walkthrough has missing values to work with.
df = pd.DataFrame(np.random.randn(10, 6))
df.iloc[0:4, 1] = np.nan      # rows 0-3 of column 1
df.iloc[0:2, 4:6] = np.nan    # rows 0-1 of columns 4 and 5
df.iloc[6, 3:5] = np.nan      # row 6, columns 3 and 4
df.iloc[8, 0:2] = np.nan      # row 8, columns 0 and 1
print(df)
0 1 2 3 4 5
0 1.905276 NaN -0.978137 1.682227 NaN NaN
1 0.164089 NaN -1.373336 1.388321 NaN NaN
2 0.599135 NaN 2.294221 0.737271 0.238057 0.526875
3 -0.481358 NaN -0.519011 0.214852 0.040489 0.599064
4 -0.358184 -0.794557 -0.454329 -1.024130 -0.090786 -0.018543
5 -0.679481 -0.126602 0.726568 1.432684 -1.403756 0.252173
6 -0.842605 1.192915 -0.831709 NaN NaN -0.073455
7 2.223354 -1.164356 -1.222986 0.009452 1.687198 0.344141
8 NaN NaN -1.024819 0.689602 1.210335 -0.714473
9 -0.397766 0.739464 -1.535297 -1.868259 0.724042 -1.714549
# Element-wise mask with the same shape as df: True marks a missing cell.
res = df.isna()
res
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | False | True | False | False | True | True |
| 1 | False | True | False | False | True | True |
| 2 | False | True | False | False | False | False |
| 3 | False | True | False | False | False | False |
| 4 | False | False | False | False | False | False |
| 5 | False | False | False | False | False | False |
| 6 | False | False | False | True | True | False |
| 7 | False | False | False | False | False | False |
| 8 | True | True | False | False | False | False |
| 9 | False | False | False | False | False | False |
# Reduce the mask down each column: True means the column holds at least
# one missing value.
res = df.isna().any(axis=0)
res
0 True
1 True
2 False
3 True
4 True
5 True
dtype: bool
# Select every row that holds at least one missing value. The row-wise
# any() yields exactly one boolean per row, so each matching row is picked
# once and distinct rows that happen to share identical values are all kept.
result = df[df.isnull().any(axis=1)]
result
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | 1.905276 | NaN | -0.978137 | 1.682227 | NaN | NaN |
| 1 | 0.164089 | NaN | -1.373336 | 1.388321 | NaN | NaN |
| 2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
| 3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
| 6 | -0.842605 | 1.192915 | -0.831709 | NaN | NaN | -0.073455 |
| 8 | NaN | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
# Labels of the columns that contain at least one missing value. The
# per-column any() is already a boolean Series, so it indexes df.columns
# directly without a comparison against True.
res = df.columns[df.isnull().any()]
res
Int64Index([0, 1, 3, 4, 5], dtype='int64')
# Count the missing cells in every column (each True in the mask sums as 1).
num = df.isna().sum(axis=0)
num
0 1
1 5
2 0
3 1
4 3
5 2
dtype: int64
# Count the missing cells in every row by summing the mask across columns.
num = df.isna().sum(axis="columns")
num
0 3
1 3
2 1
3 1
4 0
5 0
6 2
7 0
8 2
9 0
dtype: int64
删除缺失值所在的行列
# Drop every row that has at least one missing value (defaults spelled out).
df.dropna(axis=0, how="any")
|   | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
| 5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
| 7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
| 9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
# Drop every column that has at least one missing value.
df.dropna(axis="columns", how="any")
| 2 |
---|
0 | -0.978137 |
---|
1 | -1.373336 |
---|
2 | 2.294221 |
---|
3 | -0.519011 |
---|
4 | -0.454329 |
---|
5 | 0.726568 |
---|
6 | -0.831709 |
---|
7 | -1.222986 |
---|
8 | -1.024819 |
---|
9 | -1.535297 |
---|
# Drop only the rows in which every value is missing; no row of df is
# entirely empty, so all ten rows are retained.
df.dropna(axis=0, how="all")
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | NaN | -0.978137 | 1.682227 | NaN | NaN |
---|
1 | 0.164089 | NaN | -1.373336 | 1.388321 | NaN | NaN |
---|
2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | NaN | NaN | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | NaN | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Keep only the rows that have at least 4 non-missing values.
df.dropna(axis=0, thresh=4)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | NaN | NaN | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | NaN | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Drop rows with a missing value in column 2 or column 4; missing values in
# the other columns are ignored.
df.dropna(axis=0, how="any", subset=[2, 4])
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | NaN | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
缺失值填充
# Replace every missing cell with the constant 0.
df.fillna(value=0)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | 0.000000 | -0.978137 | 1.682227 | 0.000000 | 0.000000 |
---|
1 | 0.164089 | 0.000000 | -1.373336 | 1.388321 | 0.000000 | 0.000000 |
---|
2 | 0.599135 | 0.000000 | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | 0.000000 | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | 0.000000 | 0.000000 | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | 0.000000 | 0.000000 | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Forward-fill along each row: a missing cell takes the nearest non-missing
# value to its left. Cells with no value to their left (row 8, columns 0-1)
# stay NaN. DataFrame.ffill is used because the `method` keyword of fillna
# is deprecated in recent pandas releases.
df.ffill(axis=1)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | 1.905276 | -0.978137 | 1.682227 | 1.682227 | 1.682227 |
---|
1 | 0.164089 | 0.164089 | -1.373336 | 1.388321 | 1.388321 | 1.388321 |
---|
2 | 0.599135 | 0.599135 | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | -0.481358 | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | -0.831709 | -0.831709 | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | NaN | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Forward-fill down each column: a missing cell takes the nearest
# non-missing value above it. Leading gaps (rows 0-3 of column 1, rows 0-1
# of columns 4-5) have nothing above them and stay NaN. DataFrame.ffill is
# used because the `method` keyword of fillna is deprecated in recent pandas
# releases.
df.ffill(axis=0)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | NaN | -0.978137 | 1.682227 | NaN | NaN |
---|
1 | 0.164089 | NaN | -1.373336 | 1.388321 | NaN | NaN |
---|
2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | 1.432684 | -1.403756 | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | 2.223354 | -1.164356 | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Per-column fill values keyed by column label: column k is filled with k.
info = {label: label for label in range(6)}
df.fillna(info)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | 1.000000 | -0.978137 | 1.682227 | 4.000000 | 5.000000 |
---|
1 | 0.164089 | 1.000000 | -1.373336 | 1.388321 | 4.000000 | 5.000000 |
---|
2 | 0.599135 | 1.000000 | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | 1.000000 | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | 3.000000 | 4.000000 | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | 0.000000 | 1.000000 | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|
# Same per-column fill values, capped by limit=1: as the output below shows,
# only the first missing cell of each column is filled.
df.fillna(info, limit=1)
| 0 | 1 | 2 | 3 | 4 | 5 |
---|
0 | 1.905276 | 1.000000 | -0.978137 | 1.682227 | 4.000000 | 5.000000 |
---|
1 | 0.164089 | NaN | -1.373336 | 1.388321 | NaN | NaN |
---|
2 | 0.599135 | NaN | 2.294221 | 0.737271 | 0.238057 | 0.526875 |
---|
3 | -0.481358 | NaN | -0.519011 | 0.214852 | 0.040489 | 0.599064 |
---|
4 | -0.358184 | -0.794557 | -0.454329 | -1.024130 | -0.090786 | -0.018543 |
---|
5 | -0.679481 | -0.126602 | 0.726568 | 1.432684 | -1.403756 | 0.252173 |
---|
6 | -0.842605 | 1.192915 | -0.831709 | 3.000000 | NaN | -0.073455 |
---|
7 | 2.223354 | -1.164356 | -1.222986 | 0.009452 | 1.687198 | 0.344141 |
---|
8 | 0.000000 | NaN | -1.024819 | 0.689602 | 1.210335 | -0.714473 |
---|
9 | -0.397766 | 0.739464 | -1.535297 | -1.868259 | 0.724042 | -1.714549 |
---|