import pandas as pd
import numpy as np
空值处理
data = pd.Series(['a','b','c',np.nan,'d'],['e','f',np.nan,'g','h'])
data
e a
f b
NaN c
g NaN
h d
dtype: object
查看数据缺失值
data.isnull()
data.notnull()
e True
f True
NaN True
g False
h True
dtype: bool
过滤缺失值
from numpy import nan as NA
data.dropna()
补全缺失值
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:3,0] = NA
df.iloc[:4,2] = NA
df
| 0 | 1 | 2 |
---|
0 | NaN | -1.748768 | NaN |
1 | NaN | 1.808884 | NaN |
2 | NaN | 0.839127 | NaN |
3 | 1.403433 | 1.097943 | NaN |
4 | 0.689405 | 1.464497 | 0.088136 |
5 | -0.860885 | -1.982234 | 0.864731 |
6 | -0.428423 | 1.416755 | -0.830213 |
df.fillna(0)
| 0 | 1 | 2 |
---|
0 | 0.000000 | -1.748768 | 0.000000 |
1 | 0.000000 | 1.808884 | 0.000000 |
2 | 0.000000 | 0.839127 | 0.000000 |
3 | 1.403433 | 1.097943 | 0.000000 |
4 | 0.689405 | 1.464497 | 0.088136 |
5 | -0.860885 | -1.982234 | 0.864731 |
6 | -0.428423 | 1.416755 | -0.830213 |
df.fillna({0:99,2:11})
| 0 | 1 | 2 |
---|
0 | 99.000000 | -1.748768 | 11.000000 |
1 | 99.000000 | 1.808884 | 11.000000 |
2 | 99.000000 | 0.839127 | 11.000000 |
3 | 1.403433 | 1.097943 | 11.000000 |
4 | 0.689405 | 1.464497 | 0.088136 |
5 | -0.860885 | -1.982234 | 0.864731 |
6 | -0.428423 | 1.416755 | -0.830213 |
df.fillna(method='bfill')
| 0 | 1 | 2 |
---|
0 | 1.403433 | -1.748768 | 0.088136 |
1 | 1.403433 | 1.808884 | 0.088136 |
2 | 1.403433 | 0.839127 | 0.088136 |
3 | 1.403433 | 1.097943 | 0.088136 |
4 | 0.689405 | 1.464497 | 0.088136 |
5 | -0.860885 | -1.982234 | 0.864731 |
6 | -0.428423 | 1.416755 | -0.830213 |
df.fillna(value='空')
| 0 | 1 | 2 |
---|
0 | 空 | -1.748768 | 空 |
1 | 空 | 1.808884 | 空 |
2 | 空 | 0.839127 | 空 |
3 | 1.40343 | 1.097943 | 空 |
4 | 0.689405 | 1.464497 | 0.0881356 |
5 | -0.860885 | -1.982234 | 0.864731 |
6 | -0.428423 | 1.416755 | -0.830213 |
数据转换
删除重复值
data = pd.DataFrame({
'k1':['one','two'] * 3 + ['two'],
'k2':[1,1,2,3,3,4,4]
})
data
| k1 | k2 |
---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
6 | two | 4 |
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data.drop_duplicates()
| k1 | k2 |
---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
data['v1'] = range(7)
data
| k1 | k2 | v1 |
---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
5 | two | 4 | 5 |
6 | two | 4 | 6 |
data.drop_duplicates(['k1'])
大小写转换
data_dic = {
'Xiaoming':'Yiban',
'xiaohong':'erban',
'xiaohua':'SANBAN',
'XIAOLI':'siban'
}
data = pd.Series(data_dic)
data
Xiaoming Yiban
xiaohong erban
xiaohua SANBAN
XIAOLI siban
dtype: object
data.str.lower()
Xiaoming yiban
xiaohong erban
xiaohua sanban
XIAOLI siban
dtype: object
data.str.upper()
Xiaoming YIBAN
xiaohong ERBAN
xiaohua SANBAN
XIAOLI SIBAN
dtype: object
替换数据
data
Xiaoming Yiban
xiaohong erban
xiaohua SANBAN
XIAOLI siban
dtype: object
data.replace(['erban'],'二班')
Xiaoming Yiban
xiaohong 二班
xiaohua SANBAN
XIAOLI siban
dtype: object
离散和分箱
ages = [20,21,25,27,26,30,54,37,35,45,49,15,55,33,41]
bins = [10,20,30,40,50,60]
cats = pd.cut(ages,bins)
cats
[(10, 20], (20, 30], (20, 30], (20, 30], (20, 30], ..., (40, 50], (10, 20], (50, 60], (30, 40], (40, 50]]
Length: 15
Categories (5, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60]]
cats.codes
array([0, 1, 1, 1, 1, 1, 4, 2, 2, 3, 3, 0, 4, 2, 3], dtype=int8)
cats.categories
IntervalIndex([(10, 20], (20, 30], (30, 40], (40, 50], (50, 60]],
closed='right',
dtype='interval[int64]')
groups = ['少年','青年','壮年','中年','老年']
pd.cut(ages,bins,labels=groups)
[少年, 青年, 青年, 青年, 青年, ..., 中年, 少年, 老年, 壮年, 中年]
Length: 15
Categories (5, object): [少年 < 青年 < 壮年 < 中年 < 老年]
pd.cut(ages,4)
[(14.96, 25.0], (14.96, 25.0], (14.96, 25.0], (25.0, 35.0], (25.0, 35.0], ..., (45.0, 55.0], (14.96, 25.0], (45.0, 55.0], (25.0, 35.0], (35.0, 45.0]]
Length: 15
Categories (4, interval[float64]): [(14.96, 25.0] < (25.0, 35.0] < (35.0, 45.0] < (45.0, 55.0]]
cats = pd.qcut(ages,4)
cats
[(14.999, 25.5], (14.999, 25.5], (14.999, 25.5], (25.5, 33.0], (25.5, 33.0], ..., (43.0, 55.0], (14.999, 25.5], (43.0, 55.0], (25.5, 33.0], (33.0, 43.0]]
Length: 15
Categories (4, interval[float64]): [(14.999, 25.5] < (25.5, 33.0] < (33.0, 43.0] < (43.0, 55.0]]
pd.value_counts(cats)
(43.0, 55.0] 4
(25.5, 33.0] 4
(14.999, 25.5] 4
(33.0, 43.0] 3
dtype: int64
pd.qcut(ages,[0,0.1,0.3,0.5,0.7,1])
[(14.999, 20.4], (20.4, 26.2], (20.4, 26.2], (26.2, 33.0], (20.4, 26.2], ..., (40.2, 55.0], (14.999, 20.4], (40.2, 55.0], (26.2, 33.0], (40.2, 55.0]]
Length: 15
Categories (5, interval[float64]): [(14.999, 20.4] < (20.4, 26.2] < (26.2, 33.0] < (33.0, 40.2] < (40.2, 55.0]]
data = {
'小明':'xiaoming@qq.com',
'小华':'xiaohua@qq.com',
'小丽':np.nan,
'小张':'xiaozhang@gmail.com'
}
data = pd.Series(data)
data
小明 xiaoming@qq.com
小华 xiaohua@qq.com
小丽 NaN
小张 xiaozhang@gmail.com
dtype: object
data.str.contains('qq')
小明 True
小华 True
小丽 NaN
小张 False
dtype: object