数据清理常用操作

import pandas as pd
import numpy as np

空值处理

data = pd.Series(['a','b','c',np.nan,'d'],['e','f',np.nan,'g','h'])
data
e        a
f        b
NaN      c
g      NaN
h        d
dtype: object

查看数据缺失值

data.isnull()
data.notnull()
e       True
f       True
NaN     True
g      False
h       True
dtype: bool

过滤缺失值

from numpy import nan as NA
# 默认删除包含缺失值的所有行即how='all' 删除列设置axis=1 
data.dropna()
0
0a
1b
2c
4d

补全缺失值

df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:3,0] = NA
df.iloc[:4,2] = NA
df
012
0NaN-1.748768NaN
1NaN1.808884NaN
2NaN0.839127NaN
31.4034331.097943NaN
40.6894051.4644970.088136
5-0.860885-1.9822340.864731
6-0.4284231.416755-0.830213
df.fillna(0)
012
00.000000-1.7487680.000000
10.0000001.8088840.000000
20.0000000.8391270.000000
31.4034331.0979430.000000
40.6894051.4644970.088136
5-0.860885-1.9822340.864731
6-0.4284231.416755-0.830213
df.fillna({0:99,2:11})
012
099.000000-1.74876811.000000
199.0000001.80888411.000000
299.0000000.83912711.000000
31.4034331.09794311.000000
40.6894051.4644970.088136
5-0.860885-1.9822340.864731
6-0.4284231.416755-0.830213
#用前一个数填充空值
df.fillna(method='bfill')
012
01.403433-1.7487680.088136
11.4034331.8088840.088136
21.4034330.8391270.088136
31.4034331.0979430.088136
40.6894051.4644970.088136
5-0.860885-1.9822340.864731
6-0.4284231.416755-0.830213
#使用特定值填充空值
df.fillna(value='空')
012
0-1.748768
11.808884
20.839127
31.403431.097943
40.6894051.4644970.0881356
5-0.860885-1.9822340.864731
6-0.4284231.416755-0.830213

数据转换

删除重复值

data = pd.DataFrame({
    'k1':['one','two'] * 3 + ['two'],
    'k2':[1,1,2,3,3,4,4]
})
data
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
6two4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data.drop_duplicates()
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
#去除某一列的重复值
data['v1'] = range(7)
data
k1k2v1
0one10
1two11
2one22
3two33
4one34
5two45
6two46
data.drop_duplicates(['k1'])
k1k2v1
0one10
1two11

大小写转换

data_dic = {
    'Xiaoming':'Yiban',
    'xiaohong':'erban',
    'xiaohua':'SANBAN',
    'XIAOLI':'siban'
}
data = pd.Series(data_dic)
data
Xiaoming     Yiban
xiaohong     erban
xiaohua     SANBAN
XIAOLI       siban
dtype: object
data.str.lower()
Xiaoming     yiban
xiaohong     erban
xiaohua     sanban
XIAOLI       siban
dtype: object
data.str.upper()
Xiaoming     YIBAN
xiaohong     ERBAN
xiaohua     SANBAN
XIAOLI       SIBAN
dtype: object

替换数据

data
Xiaoming     Yiban
xiaohong     erban
xiaohua     SANBAN
XIAOLI       siban
dtype: object
data.replace(['erban'],'二班')
Xiaoming     Yiban
xiaohong        二班
xiaohua     SANBAN
XIAOLI       siban
dtype: object

离散和分箱

ages = [20,21,25,27,26,30,54,37,35,45,49,15,55,33,41]
bins = [10,20,30,40,50,60]
cats = pd.cut(ages,bins)
cats
[(10, 20], (20, 30], (20, 30], (20, 30], (20, 30], ..., (40, 50], (10, 20], (50, 60], (30, 40], (40, 50]]
Length: 15
Categories (5, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60]]
cats.codes
array([0, 1, 1, 1, 1, 1, 4, 2, 2, 3, 3, 0, 4, 2, 3], dtype=int8)
cats.categories
IntervalIndex([(10, 20], (20, 30], (30, 40], (40, 50], (50, 60]],
              closed='right',
              dtype='interval[int64]')
#为分箱命名
groups = ['少年','青年','壮年','中年','老年']
pd.cut(ages,bins,labels=groups)
[少年, 青年, 青年, 青年, 青年, ..., 中年, 少年, 老年, 壮年, 中年]
Length: 15
Categories (5, object): [少年 < 青年 < 壮年 < 中年 < 老年]
#自动分箱 例分4个箱子 
pd.cut(ages,4)
[(14.96, 25.0], (14.96, 25.0], (14.96, 25.0], (25.0, 35.0], (25.0, 35.0], ..., (45.0, 55.0], (14.96, 25.0], (45.0, 55.0], (25.0, 35.0], (35.0, 45.0]]
Length: 15
Categories (4, interval[float64]): [(14.96, 25.0] < (25.0, 35.0] < (35.0, 45.0] < (45.0, 55.0]]
#使用qcut获得等长的分箱
cats = pd.qcut(ages,4)
cats
[(14.999, 25.5], (14.999, 25.5], (14.999, 25.5], (25.5, 33.0], (25.5, 33.0], ..., (43.0, 55.0], (14.999, 25.5], (43.0, 55.0], (25.5, 33.0], (33.0, 43.0]]
Length: 15
Categories (4, interval[float64]): [(14.999, 25.5] < (25.5, 33.0] < (33.0, 43.0] < (43.0, 55.0]]
#查看每个箱子的个数
pd.value_counts(cats)
(43.0, 55.0]      4
(25.5, 33.0]      4
(14.999, 25.5]    4
(33.0, 43.0]      3
dtype: int64
#自定义分箱百分比
pd.qcut(ages,[0,0.1,0.3,0.5,0.7,1])
[(14.999, 20.4], (20.4, 26.2], (20.4, 26.2], (26.2, 33.0], (20.4, 26.2], ..., (40.2, 55.0], (14.999, 20.4], (40.2, 55.0], (26.2, 33.0], (40.2, 55.0]]
Length: 15
Categories (5, interval[float64]): [(14.999, 20.4] < (20.4, 26.2] < (26.2, 33.0] < (33.0, 40.2] < (40.2, 55.0]]
#查看字符串是否包含某元素
data = {
    '小明':'xiaoming@qq.com',
    '小华':'xiaohua@qq.com',
    '小丽':np.nan,
    '小张':'xiaozhang@gmail.com'
}
data = pd.Series(data)
data
小明        xiaoming@qq.com
小华         xiaohua@qq.com
小丽                    NaN
小张    xiaozhang@gmail.com
dtype: object
#str.contains()方法会跳过空值
data.str.contains('qq')
小明     True
小华     True
小丽      NaN
小张    False
dtype: object
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值