lect02_codes02_pandas

数据清洗


1.处理缺失数据

import pandas as pd

log_data = pd.read_csv('log.csv')
log_data
timeuservideoplayback positionpausedvolume
01469974424cherylintro.html5False10.0
11469974454cherylintro.html6NaNNaN
21469974544cherylintro.html9NaNNaN
31469974574cherylintro.html10NaNNaN
41469977514bobintro.html1NaNNaN
51469977544bobintro.html1NaNNaN
61469977574bobintro.html1NaNNaN
71469977604bobintro.html1NaNNaN
81469974604cherylintro.html11NaNNaN
91469974694cherylintro.html14NaNNaN
101469974724cherylintro.html15NaNNaN
111469974454sueadvanced.html24NaNNaN
121469974524sueadvanced.html25NaNNaN
131469974424sueadvanced.html23False10.0
141469974554sueadvanced.html26NaNNaN
151469974624sueadvanced.html27NaNNaN
161469974654sueadvanced.html28NaN5.0
171469974724sueadvanced.html29NaNNaN
181469974484cherylintro.html7NaNNaN
191469974514cherylintro.html8NaNNaN
201469974754sueadvanced.html30NaNNaN
211469974824sueadvanced.html31NaNNaN
221469974854sueadvanced.html32NaNNaN
231469974924sueadvanced.html33NaNNaN
241469977424bobintro.html1True10.0
251469977454bobintro.html1NaNNaN
261469977484bobintro.html1NaNNaN
271469977634bobintro.html1NaNNaN
281469977664bobintro.html1NaNNaN
291469974634cherylintro.html12NaNNaN
301469974664cherylintro.html13NaNNaN
311469977694bobintro.html1NaNNaN
321469977724bobintro.html1NaNNaN
  • 1.1判断数据缺失
log_data.isnull()
timeuservideoplayback positionpausedvolume
0FalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseTrueTrue
2FalseFalseFalseFalseTrueTrue
3FalseFalseFalseFalseTrueTrue
4FalseFalseFalseFalseTrueTrue
5FalseFalseFalseFalseTrueTrue
6FalseFalseFalseFalseTrueTrue
7FalseFalseFalseFalseTrueTrue
8FalseFalseFalseFalseTrueTrue
9FalseFalseFalseFalseTrueTrue
10FalseFalseFalseFalseTrueTrue
11FalseFalseFalseFalseTrueTrue
12FalseFalseFalseFalseTrueTrue
13FalseFalseFalseFalseFalseFalse
14FalseFalseFalseFalseTrueTrue
15FalseFalseFalseFalseTrueTrue
16FalseFalseFalseFalseTrueFalse
17FalseFalseFalseFalseTrueTrue
18FalseFalseFalseFalseTrueTrue
19FalseFalseFalseFalseTrueTrue
20FalseFalseFalseFalseTrueTrue
21FalseFalseFalseFalseTrueTrue
22FalseFalseFalseFalseTrueTrue
23FalseFalseFalseFalseTrueTrue
24FalseFalseFalseFalseFalseFalse
25FalseFalseFalseFalseTrueTrue
26FalseFalseFalseFalseTrueTrue
27FalseFalseFalseFalseTrueTrue
28FalseFalseFalseFalseTrueTrue
29FalseFalseFalseFalseTrueTrue
30FalseFalseFalseFalseTrueTrue
31FalseFalseFalseFalseTrueTrue
32FalseFalseFalseFalseTrueTrue
log_data['paused'].isnull()
0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13    False
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24    False
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
Name: paused, dtype: bool
# 取出volume不为空的数据
log_data[log_data['volume'].notnull()]
timeuservideoplayback positionpausedvolume
01469974424cherylintro.html5False10.0
131469974424sueadvanced.html23False10.0
161469974654sueadvanced.html28NaN5.0
241469977424bobintro.html1True10.0
log_data.set_index(['time', 'user'], inplace=True)
log_data.sort_index(inplace=True)
log_data
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974454cherylintro.html6NaNNaN
sueadvanced.html24NaNNaN
1469974484cherylintro.html7NaNNaN
1469974514cherylintro.html8NaNNaN
1469974524sueadvanced.html25NaNNaN
1469974544cherylintro.html9NaNNaN
1469974554sueadvanced.html26NaNNaN
1469974574cherylintro.html10NaNNaN
1469974604cherylintro.html11NaNNaN
1469974624sueadvanced.html27NaNNaN
1469974634cherylintro.html12NaNNaN
1469974654sueadvanced.html28NaN5.0
1469974664cherylintro.html13NaNNaN
1469974694cherylintro.html14NaNNaN
1469974724cherylintro.html15NaNNaN
sueadvanced.html29NaNNaN
1469974754sueadvanced.html30NaNNaN
1469974824sueadvanced.html31NaNNaN
1469974854sueadvanced.html32NaNNaN
1469974924sueadvanced.html33NaNNaN
1469977424bobintro.html1True10.0
1469977454bobintro.html1NaNNaN
1469977484bobintro.html1NaNNaN
1469977514bobintro.html1NaNNaN
1469977544bobintro.html1NaNNaN
1469977574bobintro.html1NaNNaN
1469977604bobintro.html1NaNNaN
1469977634bobintro.html1NaNNaN
1469977664bobintro.html1NaNNaN
1469977694bobintro.html1NaNNaN
1469977724bobintro.html1NaNNaN
log_data.fillna(0)
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974454cherylintro.html600.0
sueadvanced.html2400.0
1469974484cherylintro.html700.0
1469974514cherylintro.html800.0
1469974524sueadvanced.html2500.0
1469974544cherylintro.html900.0
1469974554sueadvanced.html2600.0
1469974574cherylintro.html1000.0
1469974604cherylintro.html1100.0
1469974624sueadvanced.html2700.0
1469974634cherylintro.html1200.0
1469974654sueadvanced.html2805.0
1469974664cherylintro.html1300.0
1469974694cherylintro.html1400.0
1469974724cherylintro.html1500.0
sueadvanced.html2900.0
1469974754sueadvanced.html3000.0
1469974824sueadvanced.html3100.0
1469974854sueadvanced.html3200.0
1469974924sueadvanced.html3300.0
1469977424bobintro.html1True10.0
1469977454bobintro.html100.0
1469977484bobintro.html100.0
1469977514bobintro.html100.0
1469977544bobintro.html100.0
1469977574bobintro.html100.0
1469977604bobintro.html100.0
1469977634bobintro.html100.0
1469977664bobintro.html100.0
1469977694bobintro.html100.0
1469977724bobintro.html100.0
log_data.dropna(subset=['volume'])
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974654sueadvanced.html28NaN5.0
1469977424bobintro.html1True10.0
log_data
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974454cherylintro.html6NaNNaN
sueadvanced.html24NaNNaN
1469974484cherylintro.html7NaNNaN
1469974514cherylintro.html8NaNNaN
1469974524sueadvanced.html25NaNNaN
1469974544cherylintro.html9NaNNaN
1469974554sueadvanced.html26NaNNaN
1469974574cherylintro.html10NaNNaN
1469974604cherylintro.html11NaNNaN
1469974624sueadvanced.html27NaNNaN
1469974634cherylintro.html12NaNNaN
1469974654sueadvanced.html28NaN5.0
1469974664cherylintro.html13NaNNaN
1469974694cherylintro.html14NaNNaN
1469974724cherylintro.html15NaNNaN
sueadvanced.html29NaNNaN
1469974754sueadvanced.html30NaNNaN
1469974824sueadvanced.html31NaNNaN
1469974854sueadvanced.html32NaNNaN
1469974924sueadvanced.html33NaNNaN
1469977424bobintro.html1True10.0
1469977454bobintro.html1NaNNaN
1469977484bobintro.html1NaNNaN
1469977514bobintro.html1NaNNaN
1469977544bobintro.html1NaNNaN
1469977574bobintro.html1NaNNaN
1469977604bobintro.html1NaNNaN
1469977634bobintro.html1NaNNaN
1469977664bobintro.html1NaNNaN
1469977694bobintro.html1NaNNaN
1469977724bobintro.html1NaNNaN
log_data.ffill()
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974454cherylintro.html6False10.0
sueadvanced.html24False10.0
1469974484cherylintro.html7False10.0
1469974514cherylintro.html8False10.0
1469974524sueadvanced.html25False10.0
1469974544cherylintro.html9False10.0
1469974554sueadvanced.html26False10.0
1469974574cherylintro.html10False10.0
1469974604cherylintro.html11False10.0
1469974624sueadvanced.html27False10.0
1469974634cherylintro.html12False10.0
1469974654sueadvanced.html28False5.0
1469974664cherylintro.html13False5.0
1469974694cherylintro.html14False5.0
1469974724cherylintro.html15False5.0
sueadvanced.html29False5.0
1469974754sueadvanced.html30False5.0
1469974824sueadvanced.html31False5.0
1469974854sueadvanced.html32False5.0
1469974924sueadvanced.html33False5.0
1469977424bobintro.html1True10.0
1469977454bobintro.html1True10.0
1469977484bobintro.html1True10.0
1469977514bobintro.html1True10.0
1469977544bobintro.html1True10.0
1469977574bobintro.html1True10.0
1469977604bobintro.html1True10.0
1469977634bobintro.html1True10.0
1469977664bobintro.html1True10.0
1469977694bobintro.html1True10.0
1469977724bobintro.html1True10.0
log_data.bfill()
videoplayback positionpausedvolume
timeuser
1469974424cherylintro.html5False10.0
sueadvanced.html23False10.0
1469974454cherylintro.html6True5.0
sueadvanced.html24True5.0
1469974484cherylintro.html7True5.0
1469974514cherylintro.html8True5.0
1469974524sueadvanced.html25True5.0
1469974544cherylintro.html9True5.0
1469974554sueadvanced.html26True5.0
1469974574cherylintro.html10True5.0
1469974604cherylintro.html11True5.0
1469974624sueadvanced.html27True5.0
1469974634cherylintro.html12True5.0
1469974654sueadvanced.html28True5.0
1469974664cherylintro.html13True10.0
1469974694cherylintro.html14True10.0
1469974724cherylintro.html15True10.0
sueadvanced.html29True10.0
1469974754sueadvanced.html30True10.0
1469974824sueadvanced.html31True10.0
1469974854sueadvanced.html32True10.0
1469974924sueadvanced.html33True10.0
1469977424bobintro.html1True10.0
1469977454bobintro.html1NaNNaN
1469977484bobintro.html1NaNNaN
1469977514bobintro.html1NaNNaN
1469977544bobintro.html1NaNNaN
1469977574bobintro.html1NaNNaN
1469977604bobintro.html1NaNNaN
1469977634bobintro.html1NaNNaN
1469977664bobintro.html1NaNNaN
1469977694bobintro.html1NaNNaN
1469977724bobintro.html1NaNNaN

2.数据变形

  • 2.1处理重复数据
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
6two4
# 判断数据是否重复
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
# 去除重复数据
data.drop_duplicates()
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
data['v1'] = range(7)
data
k1k2v1
0one10
1two11
2one22
3two33
4one34
5two45
6two46
# 去除指定列的重复数据
data.drop_duplicates(['k1'])
k1k2v1
0one10
1two11
data.drop_duplicates(['k1', 'k2'], keep='last')
k1k2v1
0one10
1two11
2one22
3two33
4one34
6two46
  • 使用函数或map转化数据
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
foodounces
0bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0
# 添加一列,用于指定食物的来源
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# 使用map()
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data
foodouncesanimal
0bacon4.0pig
1pulled pork3.0pig
2bacon12.0pig
3Pastrami6.0cow
4corned beef7.5cow
5Bacon8.0pig
6pastrami3.0cow
7honey ham5.0pig
8nova lox6.0salmon
# 使用方法
data['animal2'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
foodouncesanimalanimal2
0bacon4.0pigpig
1pulled pork3.0pigpig
2bacon12.0pigpig
3Pastrami6.0cowcow
4corned beef7.5cowcow
5Bacon8.0pigpig
6pastrami3.0cowcow
7honey ham5.0pigpig
8nova lox6.0salmonsalmon
  • 2.2替换值
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
import numpy as np

# 将-999替换为空值
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# 将-999,-1000都替换为空值
data.replace([-999, -1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# 将-999,-1000分别替换为空值和0
data.replace([-999, -1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
  • 离散化和分箱操作
import pandas as pd
# 年龄数据
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# 分箱的边界
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(type(cats))
<class 'pandas.core.arrays.categorical.Categorical'>
# Categorical对象
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# 获取分箱编码
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# 返回分箱边界索引
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')
# 统计箱中元素的个数
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# 带标签的分箱
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
cats.get_values()
array(['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', 'Youth',
       'MiddleAged', 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged',
       'YoungAdult'], dtype=object)
  • 2.3哑变量操作
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 
                   'data1': range(6)})
df
keydata1
0b0
1b1
2a2
3c3
4a4
5b5
pd.get_dummies(df['key'])
abc
0010
1010
2100
3001
4100
5010
  • 2.4向量化字符串操作
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object
split_df = data.str.split('@', expand=True)
split_df
01
Davedavegoogle.com
Stevestevegmail.com
Robrobgmail.com
WesNaNNaN
split_df[0].str.cat(split_df[1], sep='@')
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
Name: 0, dtype: object

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值