数据清洗
1.处理缺失数据
import pandas as pd
log_data = pd. read_csv( 'log.csv' )
log_data
time user video playback position paused volume 0 1469974424 cheryl intro.html 5 False 10.0 1 1469974454 cheryl intro.html 6 NaN NaN 2 1469974544 cheryl intro.html 9 NaN NaN 3 1469974574 cheryl intro.html 10 NaN NaN 4 1469977514 bob intro.html 1 NaN NaN 5 1469977544 bob intro.html 1 NaN NaN 6 1469977574 bob intro.html 1 NaN NaN 7 1469977604 bob intro.html 1 NaN NaN 8 1469974604 cheryl intro.html 11 NaN NaN 9 1469974694 cheryl intro.html 14 NaN NaN 10 1469974724 cheryl intro.html 15 NaN NaN 11 1469974454 sue advanced.html 24 NaN NaN 12 1469974524 sue advanced.html 25 NaN NaN 13 1469974424 sue advanced.html 23 False 10.0 14 1469974554 sue advanced.html 26 NaN NaN 15 1469974624 sue advanced.html 27 NaN NaN 16 1469974654 sue advanced.html 28 NaN 5.0 17 1469974724 sue advanced.html 29 NaN NaN 18 1469974484 cheryl intro.html 7 NaN NaN 19 1469974514 cheryl intro.html 8 NaN NaN 20 1469974754 sue advanced.html 30 NaN NaN 21 1469974824 sue advanced.html 31 NaN NaN 22 1469974854 sue advanced.html 32 NaN NaN 23 1469974924 sue advanced.html 33 NaN NaN 24 1469977424 bob intro.html 1 True 10.0 25 1469977454 bob intro.html 1 NaN NaN 26 1469977484 bob intro.html 1 NaN NaN 27 1469977634 bob intro.html 1 NaN NaN 28 1469977664 bob intro.html 1 NaN NaN 29 1469974634 cheryl intro.html 12 NaN NaN 30 1469974664 cheryl intro.html 13 NaN NaN 31 1469977694 bob intro.html 1 NaN NaN 32 1469977724 bob intro.html 1 NaN NaN
log_data. isnull( )
time user video playback position paused volume 0 False False False False False False 1 False False False False True True 2 False False False False True True 3 False False False False True True 4 False False False False True True 5 False False False False True True 6 False False False False True True 7 False False False False True True 8 False False False False True True 9 False False False False True True 10 False False False False True True 11 False False False False True True 12 False False False False True True 13 False False False False False False 14 False False False False True True 15 False False False False True True 16 False False False False True False 17 False False False False True True 18 False False False False True True 19 False False False False True True 20 False False False False True True 21 False False False False True True 22 False False False False True True 23 False False False False True True 24 False False False False False False 25 False False False False True True 26 False False False False True True 27 False False False False True True 28 False False False False True True 29 False False False False True True 30 False False False False True True 31 False False False False True True 32 False False False False True True
log_data[ 'paused' ] . isnull( )
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 False
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
Name: paused, dtype: bool
log_data[ log_data[ 'volume' ] . notnull( ) ]
time user video playback position paused volume 0 1469974424 cheryl intro.html 5 False 10.0 13 1469974424 sue advanced.html 23 False 10.0 16 1469974654 sue advanced.html 28 NaN 5.0 24 1469977424 bob intro.html 1 True 10.0
log_data. set_index( [ 'time' , 'user' ] , inplace= True )
log_data. sort_index( inplace= True )
log_data
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974454 cheryl intro.html 6 NaN NaN sue advanced.html 24 NaN NaN 1469974484 cheryl intro.html 7 NaN NaN 1469974514 cheryl intro.html 8 NaN NaN 1469974524 sue advanced.html 25 NaN NaN 1469974544 cheryl intro.html 9 NaN NaN 1469974554 sue advanced.html 26 NaN NaN 1469974574 cheryl intro.html 10 NaN NaN 1469974604 cheryl intro.html 11 NaN NaN 1469974624 sue advanced.html 27 NaN NaN 1469974634 cheryl intro.html 12 NaN NaN 1469974654 sue advanced.html 28 NaN 5.0 1469974664 cheryl intro.html 13 NaN NaN 1469974694 cheryl intro.html 14 NaN NaN 1469974724 cheryl intro.html 15 NaN NaN sue advanced.html 29 NaN NaN 1469974754 sue advanced.html 30 NaN NaN 1469974824 sue advanced.html 31 NaN NaN 1469974854 sue advanced.html 32 NaN NaN 1469974924 sue advanced.html 33 NaN NaN 1469977424 bob intro.html 1 True 10.0 1469977454 bob intro.html 1 NaN NaN 1469977484 bob intro.html 1 NaN NaN 1469977514 bob intro.html 1 NaN NaN 1469977544 bob intro.html 1 NaN NaN 1469977574 bob intro.html 1 NaN NaN 1469977604 bob intro.html 1 NaN NaN 1469977634 bob intro.html 1 NaN NaN 1469977664 bob intro.html 1 NaN NaN 1469977694 bob intro.html 1 NaN NaN 1469977724 bob intro.html 1 NaN NaN
log_data. fillna( 0 )
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974454 cheryl intro.html 6 0 0.0 sue advanced.html 24 0 0.0 1469974484 cheryl intro.html 7 0 0.0 1469974514 cheryl intro.html 8 0 0.0 1469974524 sue advanced.html 25 0 0.0 1469974544 cheryl intro.html 9 0 0.0 1469974554 sue advanced.html 26 0 0.0 1469974574 cheryl intro.html 10 0 0.0 1469974604 cheryl intro.html 11 0 0.0 1469974624 sue advanced.html 27 0 0.0 1469974634 cheryl intro.html 12 0 0.0 1469974654 sue advanced.html 28 0 5.0 1469974664 cheryl intro.html 13 0 0.0 1469974694 cheryl intro.html 14 0 0.0 1469974724 cheryl intro.html 15 0 0.0 sue advanced.html 29 0 0.0 1469974754 sue advanced.html 30 0 0.0 1469974824 sue advanced.html 31 0 0.0 1469974854 sue advanced.html 32 0 0.0 1469974924 sue advanced.html 33 0 0.0 1469977424 bob intro.html 1 True 10.0 1469977454 bob intro.html 1 0 0.0 1469977484 bob intro.html 1 0 0.0 1469977514 bob intro.html 1 0 0.0 1469977544 bob intro.html 1 0 0.0 1469977574 bob intro.html 1 0 0.0 1469977604 bob intro.html 1 0 0.0 1469977634 bob intro.html 1 0 0.0 1469977664 bob intro.html 1 0 0.0 1469977694 bob intro.html 1 0 0.0 1469977724 bob intro.html 1 0 0.0
log_data. dropna( subset= [ 'volume' ] )
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974654 sue advanced.html 28 NaN 5.0 1469977424 bob intro.html 1 True 10.0
log_data
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974454 cheryl intro.html 6 NaN NaN sue advanced.html 24 NaN NaN 1469974484 cheryl intro.html 7 NaN NaN 1469974514 cheryl intro.html 8 NaN NaN 1469974524 sue advanced.html 25 NaN NaN 1469974544 cheryl intro.html 9 NaN NaN 1469974554 sue advanced.html 26 NaN NaN 1469974574 cheryl intro.html 10 NaN NaN 1469974604 cheryl intro.html 11 NaN NaN 1469974624 sue advanced.html 27 NaN NaN 1469974634 cheryl intro.html 12 NaN NaN 1469974654 sue advanced.html 28 NaN 5.0 1469974664 cheryl intro.html 13 NaN NaN 1469974694 cheryl intro.html 14 NaN NaN 1469974724 cheryl intro.html 15 NaN NaN sue advanced.html 29 NaN NaN 1469974754 sue advanced.html 30 NaN NaN 1469974824 sue advanced.html 31 NaN NaN 1469974854 sue advanced.html 32 NaN NaN 1469974924 sue advanced.html 33 NaN NaN 1469977424 bob intro.html 1 True 10.0 1469977454 bob intro.html 1 NaN NaN 1469977484 bob intro.html 1 NaN NaN 1469977514 bob intro.html 1 NaN NaN 1469977544 bob intro.html 1 NaN NaN 1469977574 bob intro.html 1 NaN NaN 1469977604 bob intro.html 1 NaN NaN 1469977634 bob intro.html 1 NaN NaN 1469977664 bob intro.html 1 NaN NaN 1469977694 bob intro.html 1 NaN NaN 1469977724 bob intro.html 1 NaN NaN
log_data. ffill( )
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974454 cheryl intro.html 6 False 10.0 sue advanced.html 24 False 10.0 1469974484 cheryl intro.html 7 False 10.0 1469974514 cheryl intro.html 8 False 10.0 1469974524 sue advanced.html 25 False 10.0 1469974544 cheryl intro.html 9 False 10.0 1469974554 sue advanced.html 26 False 10.0 1469974574 cheryl intro.html 10 False 10.0 1469974604 cheryl intro.html 11 False 10.0 1469974624 sue advanced.html 27 False 10.0 1469974634 cheryl intro.html 12 False 10.0 1469974654 sue advanced.html 28 False 5.0 1469974664 cheryl intro.html 13 False 5.0 1469974694 cheryl intro.html 14 False 5.0 1469974724 cheryl intro.html 15 False 5.0 sue advanced.html 29 False 5.0 1469974754 sue advanced.html 30 False 5.0 1469974824 sue advanced.html 31 False 5.0 1469974854 sue advanced.html 32 False 5.0 1469974924 sue advanced.html 33 False 5.0 1469977424 bob intro.html 1 True 10.0 1469977454 bob intro.html 1 True 10.0 1469977484 bob intro.html 1 True 10.0 1469977514 bob intro.html 1 True 10.0 1469977544 bob intro.html 1 True 10.0 1469977574 bob intro.html 1 True 10.0 1469977604 bob intro.html 1 True 10.0 1469977634 bob intro.html 1 True 10.0 1469977664 bob intro.html 1 True 10.0 1469977694 bob intro.html 1 True 10.0 1469977724 bob intro.html 1 True 10.0
log_data. bfill( )
video playback position paused volume time user 1469974424 cheryl intro.html 5 False 10.0 sue advanced.html 23 False 10.0 1469974454 cheryl intro.html 6 True 5.0 sue advanced.html 24 True 5.0 1469974484 cheryl intro.html 7 True 5.0 1469974514 cheryl intro.html 8 True 5.0 1469974524 sue advanced.html 25 True 5.0 1469974544 cheryl intro.html 9 True 5.0 1469974554 sue advanced.html 26 True 5.0 1469974574 cheryl intro.html 10 True 5.0 1469974604 cheryl intro.html 11 True 5.0 1469974624 sue advanced.html 27 True 5.0 1469974634 cheryl intro.html 12 True 5.0 1469974654 sue advanced.html 28 True 5.0 1469974664 cheryl intro.html 13 True 10.0 1469974694 cheryl intro.html 14 True 10.0 1469974724 cheryl intro.html 15 True 10.0 sue advanced.html 29 True 10.0 1469974754 sue advanced.html 30 True 10.0 1469974824 sue advanced.html 31 True 10.0 1469974854 sue advanced.html 32 True 10.0 1469974924 sue advanced.html 33 True 10.0 1469977424 bob intro.html 1 True 10.0 1469977454 bob intro.html 1 NaN NaN 1469977484 bob intro.html 1 NaN NaN 1469977514 bob intro.html 1 NaN NaN 1469977544 bob intro.html 1 NaN NaN 1469977574 bob intro.html 1 NaN NaN 1469977604 bob intro.html 1 NaN NaN 1469977634 bob intro.html 1 NaN NaN 1469977664 bob intro.html 1 NaN NaN 1469977694 bob intro.html 1 NaN NaN 1469977724 bob intro.html 1 NaN NaN
2.数据变形
data = pd. DataFrame( { 'k1' : [ 'one' , 'two' ] * 3 + [ 'two' ] ,
'k2' : [ 1 , 1 , 2 , 3 , 3 , 4 , 4 ] } )
data
k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4 6 two 4
data. duplicated( )
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data. drop_duplicates( )
k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4
data[ 'v1' ] = range ( 7 )
data
k1 k2 v1 0 one 1 0 1 two 1 1 2 one 2 2 3 two 3 3 4 one 3 4 5 two 4 5 6 two 4 6
data. drop_duplicates( [ 'k1' ] )
data. drop_duplicates( [ 'k1' , 'k2' ] , keep= 'last' )
k1 k2 v1 0 one 1 0 1 two 1 1 2 one 2 2 3 two 3 3 4 one 3 4 6 two 4 6
data = pd. DataFrame( { 'food' : [ 'bacon' , 'pulled pork' , 'bacon' , 'Pastrami' , 'corned beef' , 'Bacon' , 'pastrami' , 'honey ham' , 'nova lox' ] ,
'ounces' : [ 4 , 3 , 12 , 6 , 7.5 , 8 , 3 , 5 , 6 ] } )
data
food ounces 0 bacon 4.0 1 pulled pork 3.0 2 bacon 12.0 3 Pastrami 6.0 4 corned beef 7.5 5 Bacon 8.0 6 pastrami 3.0 7 honey ham 5.0 8 nova lox 6.0
meat_to_animal = {
'bacon' : 'pig' ,
'pulled pork' : 'pig' ,
'pastrami' : 'cow' ,
'corned beef' : 'cow' ,
'honey ham' : 'pig' ,
'nova lox' : 'salmon'
}
lowercased = data[ 'food' ] . str . lower( )
data[ 'animal' ] = lowercased. map ( meat_to_animal)
data
food ounces animal 0 bacon 4.0 pig 1 pulled pork 3.0 pig 2 bacon 12.0 pig 3 Pastrami 6.0 cow 4 corned beef 7.5 cow 5 Bacon 8.0 pig 6 pastrami 3.0 cow 7 honey ham 5.0 pig 8 nova lox 6.0 salmon
data[ 'animal2' ] = data[ 'food' ] . map ( lambda x: meat_to_animal[ x. lower( ) ] )
data
food ounces animal animal2 0 bacon 4.0 pig pig 1 pulled pork 3.0 pig pig 2 bacon 12.0 pig pig 3 Pastrami 6.0 cow cow 4 corned beef 7.5 cow cow 5 Bacon 8.0 pig pig 6 pastrami 3.0 cow cow 7 honey ham 5.0 pig pig 8 nova lox 6.0 salmon salmon
data = pd. Series( [ 1 . , - 999 . , 2 . , - 999 . , - 1000 . , 3 . ] )
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
import numpy as np
data. replace( - 999 , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data. replace( [ - 999 , - 1000 ] , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
data. replace( [ - 999 , - 1000 ] , [ np. nan, 0 ] )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data. replace( { - 999 : np. nan, - 1000 : 0 } )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
import pandas as pd
ages = [ 20 , 22 , 25 , 27 , 21 , 23 , 37 , 31 , 61 , 45 , 41 , 32 ]
bins = [ 18 , 25 , 35 , 60 , 100 ]
cats = pd. cut( ages, bins)
print ( type ( cats) )
<class 'pandas.core.arrays.categorical.Categorical'>
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats. codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats. categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
pd. value_counts( cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
group_names = [ 'Youth' , 'YoungAdult' , 'MiddleAged' , 'Senior' ]
cats = pd. cut( ages, bins, labels= group_names)
cats. get_values( )
array(['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', 'Youth',
'MiddleAged', 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged',
'YoungAdult'], dtype=object)
df = pd. DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] ,
'data1' : range ( 6 ) } )
df
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 b 5
pd. get_dummies( df[ 'key' ] )
a b c 0 0 1 0 1 0 1 0 2 1 0 0 3 0 0 1 4 1 0 0 5 0 1 0
data = { 'Dave' : 'dave@google.com' , 'Steve' : 'steve@gmail.com' , 'Rob' : 'rob@gmail.com' , 'Wes' : np. nan}
data = pd. Series( data)
data
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
dtype: object
data. str . contains( 'gmail' )
Dave False
Steve True
Rob True
Wes NaN
dtype: object
data. str [ : 5 ]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
split_df = data. str . split( '@' , expand= True )
split_df
0 1 Dave dave google.com Steve steve gmail.com Rob rob gmail.com Wes NaN NaN
split_df[ 0 ] . str . cat( split_df[ 1 ] , sep= '@' )
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
Name: 0, dtype: object