包含全部示例的代码仓库见GitHub
1 导入库
import pandas as pd
import numpy as np
2 重复值处理
data = pd.DataFrame({'k1':['one']*3 + ['two']*2, 'k2':[1,1,2,3,3]})
data
# output
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
(data.duplicated()).sum()
# output
2
data.drop_duplicates()
# output
k1 k2
0 one 1
2 one 2
3 two 3
根据k1进行去重
data.drop_duplicates('k1') # 根据k1进行去重
# output
k1 k2
0 one 1
3 two 3
根据k1,k3进行去重
data['k3'] = 1
data
# output
k1 k2 k3
0 one 1 1
1 one 1 1
2 one 2 1
3 two 3 1
4 two 3 1
data.drop_duplicates(['k1', 'k3']) # 根据k1,k3进行去重
# output
k1 k2 k3
0 one 1 1
3 two 3 1
3 数值替换
data
# output
k1 k2 k3
0 one 1 1
1 one 1 1
2 one 2 1
3 two 3 1
4 two 3 1
data.k1.replace('two','three') # k1列,two替换成three(对Series)
# output
0 one
1 one
2 one
3 three
4 three
Name: k1, dtype: object
data
# output
k1 k2 k3
0 one 1 1
1 one 1 1
2 one 2 1
3 two 3 1
4 two 3 1
对整个DataFrame替换
data.replace(1, 100) # 对DataFrame
# output
k1 k2 k3
0 one 100 100
1 one 100 100
2 one 2 100
3 two 3 100
4 two 3 100
原地替换(inplace=True,直接修改原数据,不返回新对象)
data.replace(1, 100, inplace=True)
data
# output
k1 k2 k3
0 one 100 100
1 one 100 100
2 one 2 100
3 two 3 100
4 two 3 100
4 过滤缺失值
4.1 示例1
data = pd.Series([2, np.nan, 4, np.nan, 8.5])
data
# output
0 2.0
1 NaN
2 4.0
3 NaN
4 8.5
dtype: float64
(data.isnull()).sum()
# output
2
data.notnull().sum() # 不加括号也可以
# output
3
data[data.notnull()]
# output
0 2.0
2 4.0
4 8.5
dtype: float64
data.dropna(inplace=True) # 抛弃nan值
data
# output
0 2.0
2 4.0
4 8.5
dtype: float64
4.2 示例2
data = pd.DataFrame(np.random.randn(4,3),index=list('abcd'),columns=['aa','bb','cc'])
data.iloc[1:,:2] = np.nan
data.iloc[1,2] = np.nan
data
# output
aa bb cc
a -0.474847 -0.398699 -0.432810
b NaN NaN NaN
c NaN NaN 2.835789
d NaN NaN -0.557477
data.isna()
# output
aa bb cc
a False False False
b True True True
c True True False
d True True False
data.dropna()
# output
aa bb cc
a -0.474847 -0.398699 -0.43281
data.dropna(how='all') # 全部为nan,才会被抛弃
# output
aa bb cc
a -0.474847 -0.398699 -0.432810
c NaN NaN 2.835789
d NaN NaN -0.557477
按行或列删除缺失值
data
# output
aa bb cc
a -0.474847 -0.398699 -0.432810
b NaN NaN NaN
c NaN NaN 2.835789
d NaN NaN -0.557477
data.iloc[0, 0] = np.nan
data
# output
aa bb cc
a NaN -0.398699 -0.432810
b NaN NaN NaN
c NaN NaN 2.835789
d NaN NaN -0.557477
data.dropna(axis=0, how='all') # axis=0,根据行判断,全部为nan,才会被抛弃
# output
aa bb cc
a NaN -0.398699 -0.432810
c NaN NaN 2.835789
d NaN NaN -0.557477
data.dropna(axis=1, how='all') # axis=1,根据列判断,全部为nan,才会被抛弃
# output
bb cc
a -0.398699 -0.432810
b NaN NaN
c NaN 2.835789
d NaN -0.557477
5 填充缺失值
data
# output
aa bb cc
a NaN -0.398699 -0.432810
b NaN NaN NaN
c NaN NaN 2.835789
d NaN NaN -0.557477
data.fillna(0) # nan值填充为0
# output
aa bb cc
a 0.0 -0.398699 -0.432810
b 0.0 0.000000 0.000000
c 0.0 0.000000 2.835789
d 0.0 0.000000 -0.557477
data.fillna({'aa':1, 'bb':100}) #aa填充1,bb填充100,cc不填充
# output
aa bb cc
a 1.0 -0.398699 -0.432810
b 1.0 100.000000 NaN
c 1.0 100.000000 2.835789
d 1.0 100.000000 -0.557477
使用均值填充
data = pd.Series([2, np.nan, 4, np.nan, 8.5])
data
# output
0 2.0
1 NaN
2 4.0
3 NaN
4 8.5
dtype: float64
data.fillna(data.mean()) # 使用均值填充
# output
0 2.000000
1 4.833333
2 4.000000
3 4.833333
4 8.500000
dtype: float64