数据规整化:清理、转换、合并、重塑
合并数据集
pandas.merge可根据一个或多个键将不同DataFrame中的行连接起来。 pandas.concat可以沿着一条轴将多个对象堆叠到一起。 实例方法combine_first可以将重复数据拼接在一起,用一个对象中的值填充另一个对象中的缺失值。
数据库风格的DataFrame合并
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Two small frames sharing a 'key' column, used below to demonstrate merges.
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
df1
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 a 5 6 b 6
df2
pd. merge( df1, df2)
key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0
pd. merge( df1, df2, on= 'key' )
key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0
df3 = DataFrame( { 'lkey' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ] ,
'data1' : range ( 7 ) } )
df4 = DataFrame( { 'rkey' : [ 'a' , 'b' , 'd' ] ,
'data2' : range ( 3 ) } )
pd. merge( df3, df4, left_on= 'lkey' , right_on= 'rkey' )
lkey data1 rkey data2 0 b 0 b 1 1 b 1 b 1 2 b 6 b 1 3 a 2 a 0 4 a 4 a 0 5 a 5 a 0
pd. merge( df1, df2, how= 'outer' )
key data1 data2 0 b 0.0 1.0 1 b 1.0 1.0 2 b 6.0 1.0 3 a 2.0 0.0 4 a 4.0 0.0 5 a 5.0 0.0 6 c 3.0 NaN 7 d NaN 2.0
df1 = DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] ,
'data1' : range ( 6 ) } )
df2 = DataFrame( { 'key' : [ 'a' , 'b' , 'a' , 'b' , 'd' ] ,
'data2' : range ( 5 ) } )
df1
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 b 5
df2
pd. merge( df1, df2, on= 'key' , how= 'left' )
key data1 data2 0 b 0 1.0 1 b 0 3.0 2 b 1 1.0 3 b 1 3.0 4 a 2 0.0 5 a 2 2.0 6 c 3 NaN 7 a 4 0.0 8 a 4 2.0 9 b 5 1.0 10 b 5 3.0
pd. merge( df1, df2, how= 'inner' )
key data1 data2 0 b 0 1 1 b 0 3 2 b 1 1 3 b 1 3 4 b 5 1 5 b 5 3 6 a 2 0 7 a 2 2 8 a 4 0 9 a 4 2
left = DataFrame( { 'key1' : [ 'foo' , 'foo' , 'bar' ] ,
'key2' : [ 'one' , 'two' , 'one' ] ,
'lval' : [ 1 , 2 , 3 ] } )
right = DataFrame( { 'key1' : [ 'foo' , 'foo' , 'bar' , 'bar' ] ,
'key2' : [ 'one' , 'one' , 'one' , 'two' ] ,
'rval' : [ 4 , 5 , 6 , 7 ] } )
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'outer' )
key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0
pd. merge( left, right, on= 'key1' )
key1 key2_x lval key2_y rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
pd. merge( left, right, on= 'key1' , suffixes= ( '_left' , '_right' ) )
key1 key2_left lval key2_right rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
索引上的合并
left1 = DataFrame( { 'key' : [ 'a' , 'b' , 'a' , 'a' , 'b' , 'c' ] ,
'value' : range ( 6 ) } )
right1 = DataFrame( { 'group_val' : [ 3.5 , 7 ] } , index= [ 'a' , 'b' ] )
left1
key value 0 a 0 1 b 1 2 a 2 3 a 3 4 b 4 5 c 5
right1
pd. merge( left1, right1, left_on= 'key' , right_index= True )
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0
pd. merge( left1, right1, left_on= 'key' , right_index= True , how= 'outer' )
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0 5 c 5 NaN
lefth = DataFrame( { 'key1' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] ,
'key2' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] ,
'data' : np. arange( 5 . ) } )
righth = DataFrame( np. arange( 12 ) . reshape( ( 6 , 2 ) ) ,
index= [ [ 'Nevada' , 'Nevada' , 'Ohio' , 'Ohio' , 'Ohio' , 'Ohio' ] ,
[ 2001 , 2000 , 2000 , 2000 , 2001 , 2002 ] ] ,
columns= [ 'event1' , 'event2' ] )
lefth
key1 key2 data 0 Ohio 2000 0.0 1 Ohio 2001 1.0 2 Ohio 2002 2.0 3 Nevada 2001 3.0 4 Nevada 2002 4.0
righth
event1 event2 Nevada 2001 0 1 2000 2 3 Ohio 2000 4 5 2000 6 7 2001 8 9 2002 10 11
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] , right_index= True )
key1 key2 data event1 event2 0 Ohio 2000 0.0 4 5 0 Ohio 2000 0.0 6 7 1 Ohio 2001 1.0 8 9 2 Ohio 2002 2.0 10 11 3 Nevada 2001 3.0 0 1
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] ,
right_index= True , how= 'outer' )
key1 key2 data event1 event2 0 Ohio 2000 0.0 4.0 5.0 0 Ohio 2000 0.0 6.0 7.0 1 Ohio 2001 1.0 8.0 9.0 2 Ohio 2002 2.0 10.0 11.0 3 Nevada 2001 3.0 0.0 1.0 4 Nevada 2002 4.0 NaN NaN 4 Nevada 2000 NaN 2.0 3.0
left2 = DataFrame( [ [ 1 . , 2 . ] , [ 3 . , 4 . ] , [ 5 . , 6 . ] ] , index= [ 'a' , 'c' , 'e' ] ,
columns= [ 'Ohio' , 'Nevada' ] )
right2 = DataFrame( [ [ 7 . , 8 . ] , [ 9 . , 10 . ] , [ 11 . , 12 . ] , [ 13 . , 14 . ] ] ,
index= [ 'b' , 'c' , 'd' , 'e' ] , columns= [ 'Missouri' , 'Alabama' ] )
left2
Ohio Nevada a 1.0 2.0 c 3.0 4.0 e 5.0 6.0
right2
Missouri Alabama b 7.0 8.0 c 9.0 10.0 d 11.0 12.0 e 13.0 14.0
pd. merge( left2, right2, how= 'outer' , left_index= True , right_index= True )
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left2. join( right2, how= 'outer' )
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left1. join( right1, on= 'key' )
key value group_val 0 a 0 3.5 1 b 1 7.0 2 a 2 3.5 3 a 3 3.5 4 b 4 7.0 5 c 5 NaN
another = DataFrame( [ [ 7 . , 8 . ] , [ 9 . , 10 . ] , [ 11 . , 12 . ] , [ 16 . , 17 ] ] ,
index= [ 'a' , 'c' , 'e' , 'f' ] , columns= [ 'New York' , 'Oregon' ] )
left2. join( [ right2, another] )
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 9.0 10.0 e 5.0 6.0 13.0 14.0 11.0 12.0
left2. join( [ right2, another] , how= 'outer' )
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 9.0 10.0 e 5.0 6.0 13.0 14.0 11.0 12.0 b NaN NaN 7.0 8.0 NaN NaN d NaN NaN 11.0 12.0 NaN NaN f NaN NaN NaN NaN 16.0 17.0
轴向连接
arr = np. arange( 12 ) . reshape( ( 3 , 4 ) )
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
np. concatenate( [ arr, arr] , axis= 1 )
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
s1 = Series( [ 0 , 1 ] , index= [ 'a' , 'b' ] )
s2 = Series( [ 2 , 3 , 4 ] , index= [ 'c' , 'd' , 'e' ] )
s3 = Series( [ 5 , 6 ] , index= [ 'f' , 'g' ] )
pd. concat( [ s1, s2, s3] )
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
pd. concat( [ s1, s2, s3] , axis= 1 )
0 1 2 a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
s4 = pd. concat( [ s1* 5 , s3] )
s4
a 0
b 5
f 5
g 6
dtype: int64
pd. concat( [ s1, s4] , axis= 1 )
pd. concat( [ s1, s4] , axis= 1 , join= 'inner' )
# The join_axes argument was removed in pandas 1.0 (the call below used to
# raise TypeError). Reindex the concatenated result instead to pick an
# explicit row index.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])
result = pd. concat( [ s1, s1, s3] , keys= [ 'one' , 'two' , 'three' ] )
result
one a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64
result. unstack( )
a b f g one 0.0 1.0 NaN NaN two 0.0 1.0 NaN NaN three NaN NaN 5.0 6.0
pd. concat( [ s1, s2, s3] , axis= 1 , keys= [ 'one' , 'two' , 'three' ] )
one two three a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
df1 = DataFrame( np. arange( 6 ) . reshape( 3 , 2 ) , index= [ 'a' , 'b' , 'c' ] ,
columns= [ 'one' , 'two' ] )
df2 = DataFrame( 5 + np. arange( 4 ) . reshape( 2 , 2 ) , index= [ 'a' , 'c' ] ,
columns= [ 'three' , 'four' ] )
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] )
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] )
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] ,
names= [ 'upper' , 'lower' ] )
upper level1 level2 lower one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
df1 = DataFrame( np. random. randn( 3 , 4 ) , columns= [ 'a' , 'b' , 'c' , 'd' ] )
df2 = DataFrame( np. random. randn( 2 , 3 ) , columns= [ 'b' , 'd' , 'a' ] )
df1
a b c d 0 0.909729 -1.166310 -0.988668 -0.398099 1 1.226715 1.429447 -0.353615 1.096639 2 0.420715 1.319891 -2.096798 -0.125654
df2
b d a 0 -0.077881 0.031641 -0.839706 1 -0.231105 1.227890 0.422392
pd. concat( [ df1, df2] , ignore_index= True )
a b c d 0 0.909729 -1.166310 -0.988668 -0.398099 1 1.226715 1.429447 -0.353615 1.096639 2 0.420715 1.319891 -2.096798 -0.125654 3 -0.839706 -0.077881 NaN 0.031641 4 0.422392 -0.231105 NaN 1.227890
合并重叠数据
# Two aligned Series with partially-missing values, used to demonstrate
# combine_first below.
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
# Use positional indexing explicitly: the integer-key fallback on a label
# index (b[-1]) was deprecated and removed in pandas 2.0.
b.iloc[-1] = np.nan
a
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
b
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a NaN
dtype: float64
np. where( pd. isnull( a) , b, a)
array([0. , 2.5, 2. , 3.5, 4.5, nan])
b[ : - 2 ] . combine_first( a[ 2 : ] )
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
df1 = DataFrame( { 'a' : [ 1 . , np. nan, 5 . , np. nan] ,
'b' : [ np. nan, 2 . , np. nan, 6 . ] ,
'c' : range ( 2 , 18 , 4 ) } )
df2 = DataFrame( { 'a' : [ 5 . , 4 . , np. nan, 3 . , 7 . ] ,
'b' : [ np. nan, 3 . , 4 . , 6 . , 8 . ] } )
df1. combine_first( df2)
a b c 0 1.0 NaN 2.0 1 4.0 2.0 6.0 2 5.0 4.0 10.0 3 3.0 6.0 14.0 4 7.0 8.0 NaN
a b c 0 1.0 NaN 2 1 NaN 2.0 6 2 5.0 NaN 10 3 NaN 6.0 14
df2
a b 0 5.0 NaN 1 4.0 3.0 2 NaN 4.0 3 3.0 6.0 4 7.0 8.0
重塑和轴向旋转
重塑层次化索引
stack:将数据的列旋转为行 unstack:将数据的行旋转为列
data = DataFrame( np. arange( 6 ) . reshape( ( 2 , 3 ) ) , index= pd. Index( [ 'Ohio' , 'Colorado' ] , name= 'state' ) ,
columns= pd. Index( [ 'one' , 'two' , 'three' ] , name= 'number' ) )
data
number one two three state Ohio 0 1 2 Colorado 3 4 5
result = data. stack( )
result
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
result. unstack( )
number one two three state Ohio 0 1 2 Colorado 3 4 5
result. unstack( 0 )
state Ohio Colorado number one 0 3 two 1 4 three 2 5
result. unstack( 'state' )
state Ohio Colorado number one 0 3 two 1 4 three 2 5
s1 = Series( [ 0 , 1 , 2 , 3 ] , index= [ 'a' , 'b' , 'c' , 'd' ] )
s2 = Series( [ 4 , 5 , 6 ] , index= [ 'c' , 'd' , 'e' ] )
data2 = pd. concat( [ s1, s2] , keys= [ 'one' , 'two' ] )
data2. unstack( )
a b c d e one 0.0 1.0 2.0 3.0 NaN two NaN NaN 4.0 5.0 6.0
data2. unstack( ) . stack( )
one a 0.0
b 1.0
c 2.0
d 3.0
two c 4.0
d 5.0
e 6.0
dtype: float64
data2. unstack( ) . stack( dropna= False )
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
df = DataFrame( { 'left' : result, 'right' : result+ 5 } , columns= pd. Index( [ 'left' , 'right' ] , name= 'side' ) )
df
side left right state number Ohio one 0 5 two 1 6 three 2 7 Colorado one 3 8 two 4 9 three 5 10
df. unstack( 'state' )
side left right state Ohio Colorado Ohio Colorado number one 0 3 5 8 two 1 4 6 9 three 2 5 7 10
df. unstack( 'state' ) . stack( 'side' )
state Colorado Ohio number side one left 3 0 right 8 5 two left 4 1 right 9 6 three left 5 2 right 10 7
将“长格式”旋转为“宽格式”
ldata = DataFrame( { 'date' : [ '1959-03-31' , '1959-03-31' , '1959-03-31' ,
'1959-06-30' , '1959-06-30' , '1959-06-30' ,
'1959-09-30' , '1959-09-30' , '1959-09-30' ] ,
'item' : [ 'realgdp' , 'infl' , 'unemp' ] * 3 ,
'value' : [ 2710.349 , 0.0 , 5.8 ] * 3 } )
print ( ldata)
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2710.349
4 1959-06-30 infl 0.000
5 1959-06-30 unemp 5.800
6 1959-09-30 realgdp 2710.349
7 1959-09-30 infl 0.000
8 1959-09-30 unemp 5.800
# DataFrame.pivot requires keyword arguments since pandas 2.0 (positional
# index/columns/values were removed).
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted. head( )
item infl realgdp unemp date 1959-03-31 0.0 2710.349 5.8 1959-06-30 0.0 2710.349 5.8 1959-09-30 0.0 2710.349 5.8
ldata[ 'value2' ] = np. random. randn( len ( ldata) )
ldata
date item value value2 0 1959-03-31 realgdp 2710.349 0.576713 1 1959-03-31 infl 0.000 0.164654 2 1959-03-31 unemp 5.800 -0.309467 3 1959-06-30 realgdp 2710.349 1.321802 4 1959-06-30 infl 0.000 0.964655 5 1959-06-30 unemp 5.800 -0.355550 6 1959-09-30 realgdp 2710.349 -0.303973 7 1959-09-30 infl 0.000 0.791143 8 1959-09-30 unemp 5.800 0.985764
# DataFrame.pivot requires keyword arguments since pandas 2.0; omitting
# values= pivots every remaining column (value and value2 here).
pivoted = ldata.pivot(index='date', columns='item')
pivoted
value value2 item infl realgdp unemp infl realgdp unemp date 1959-03-31 0.0 2710.349 5.8 0.164654 0.576713 -0.309467 1959-06-30 0.0 2710.349 5.8 0.964655 1.321802 -0.355550 1959-09-30 0.0 2710.349 5.8 0.791143 -0.303973 0.985764
pivoted[ 'value' ]
item infl realgdp unemp date 1959-03-31 0.0 2710.349 5.8 1959-06-30 0.0 2710.349 5.8 1959-09-30 0.0 2710.349 5.8
unstacked = ldata. set_index( [ 'date' , 'item' ] ) . unstack( 'item' )
unstacked
value value2 item infl realgdp unemp infl realgdp unemp date 1959-03-31 0.0 2710.349 5.8 0.164654 0.576713 -0.309467 1959-06-30 0.0 2710.349 5.8 0.964655 1.321802 -0.355550 1959-09-30 0.0 2710.349 5.8 0.791143 -0.303973 0.985764
数据转换
移除重复数据
data = DataFrame( { 'k1' : [ 'one' ] * 3 + [ 'two' ] * 4 ,
'k2' : [ 1 , 1 , 2 , 3 , 3 , 4 , 4 ] } )
data
k1 k2 0 one 1 1 one 1 2 one 2 3 two 3 4 two 3 5 two 4 6 two 4
data. duplicated( )
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
data. drop_duplicates( )
data[ 'v1' ] = range ( 7 )
data. drop_duplicates( [ 'k1' ] )
data. drop_duplicates( [ 'k1' , 'k2' ] , keep= 'last' )
k1 k2 v1 1 one 1 1 2 one 2 2 4 two 3 4 6 two 4 6
利用函数或映射进行数据转换
data = DataFrame( { 'food' : [ 'bacon' , 'pulled pork' , 'bacon' , 'Pastrami' , 'corned beef' , 'Bacon' , 'pastrami' , 'honey ham' , 'nova lox' ] ,
'ounces' : [ 4 , 3 , 12 , 6 , 7.5 , 8 , 3 , 5 , 6 ] } )
data
food ounces 0 bacon 4.0 1 pulled pork 3.0 2 bacon 12.0 3 Pastrami 6.0 4 corned beef 7.5 5 Bacon 8.0 6 pastrami 3.0 7 honey ham 5.0 8 nova lox 6.0
meat_to_animal = {
'bacon' : 'pig' ,
'pulled pork' : 'pig' ,
'pastrami' : 'cow' ,
'corned beef' : 'cow' ,
'honey ham' : 'pig' ,
'nova lox' : 'salmon'
}
data[ 'animal' ] = data[ 'food' ] . map ( str . lower) . map ( meat_to_animal)
data
food ounces animal 0 bacon 4.0 pig 1 pulled pork 3.0 pig 2 bacon 12.0 pig 3 Pastrami 6.0 cow 4 corned beef 7.5 cow 5 Bacon 8.0 pig 6 pastrami 3.0 cow 7 honey ham 5.0 pig 8 nova lox 6.0 salmon
data[ 'food' ] . map ( lambda x: meat_to_animal[ x. lower( ) ] )
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
替换值
data = Series( [ 1 . , - 999 . , 2 . , - 999 . , - 1000 . , 3 . ] )
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data. replace( - 999 , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data. replace( [ - 999 , - 1000 ] , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
data. replace( [ - 999 , - 1000 ] , [ np. nan, 0 ] )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data. replace( { - 999 : np. nan, - 1000 : 0 } )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
重命名轴索引
data = DataFrame( np. arange( 12 ) . reshape( ( 3 , 4 ) ) ,
index= [ 'Ohio' , 'Colorado' , 'New York' ] ,
columns= [ 'one' , 'two' , 'three' , 'four' ] )
data. index. map ( str . upper)
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
data. index = data. index. map ( str . upper)
data
one two three four OHIO 0 1 2 3 COLORADO 4 5 6 7 NEW YORK 8 9 10 11
data. rename( index= str . title, columns= str . upper)
ONE TWO THREE FOUR Ohio 0 1 2 3 Colorado 4 5 6 7 New York 8 9 10 11
data. rename( index= { 'OHIO' : 'INDIANA' } ,
columns= { 'three' : 'peekaboo' } )
one two peekaboo four INDIANA 0 1 2 3 COLORADO 4 5 6 7 NEW YORK 8 9 10 11
_ = data. rename( index= { 'OHIO' : 'INDIANA' } , inplace= True )
data
one two three four INDIANA 0 1 2 3 COLORADO 4 5 6 7 NEW YORK 8 9 10 11
离散化和面元划分
ages = [ 20 , 22 , 25 , 27 , 21 , 23 , 37 , 31 , 61 , 45 , 41 , 32 ]
bins = [ 18 , 25 , 35 , 60 , 100 ]
cats = pd. cut( ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats. codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats. categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
# pd.value_counts is deprecated (pandas 2.1+); call the method on the
# Categorical instead.
cats.value_counts()
(18, 25] 5
(25, 35] 3
(35, 60] 3
(60, 100] 1
dtype: int64
pd. cut( ages, [ 18 , 26 , 36 , 61 , 100 ] , right= False )
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = [ 'Youth' , 'YoungAdult' , 'MiddleAged' , 'Senior' ]
pd. cut( ages, bins, labels= group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np. random. rand( 20 )
pd. cut( data, 4 , precision= 2 )
[(0.031, 0.27], (0.75, 1.0], (0.27, 0.51], (0.031, 0.27], (0.031, 0.27], ..., (0.27, 0.51], (0.031, 0.27], (0.75, 1.0], (0.75, 1.0], (0.031, 0.27]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
data = np. random. randn( 1000 )
cats = pd. qcut( data, 4 )
cats
[(0.699, 2.801], (0.699, 2.801], (0.699, 2.801], (0.0346, 0.699], (-0.703, 0.0346], ..., (-2.582, -0.703], (-0.703, 0.0346], (-2.582, -0.703], (0.699, 2.801], (0.699, 2.801]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -0.703] < (-0.703, 0.0346] < (0.0346, 0.699] < (0.699, 2.801]]
# pd.value_counts is deprecated (pandas 2.1+); call the method on the
# Categorical instead.
cats.value_counts()
(-2.582, -0.703] 250
(-0.703, 0.0346] 250
(0.0346, 0.699] 250
(0.699, 2.801] 250
dtype: int64
pd. qcut( data, [ 0 , 0.1 , 0.5 , 0.9 , 1 . ] )
[(1.263, 2.801], (0.0346, 1.263], (1.263, 2.801], (0.0346, 1.263], (-1.274, 0.0346], ..., (-2.582, -1.274], (-1.274, 0.0346], (-1.274, 0.0346], (0.0346, 1.263], (0.0346, 1.263]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -1.274] < (-1.274, 0.0346] < (0.0346, 1.263] < (1.263, 2.801]]
检测和过滤异常值
np. random. seed( 12345 )
data = DataFrame( np. random. randn( 1000 , 4 ) )
data. describe( )
0 1 2 3 count 1000.000000 1000.000000 1000.000000 1000.000000 mean -0.067684 0.067924 0.025598 -0.002298 std 0.998035 0.992106 1.006835 0.996794 min -3.428254 -3.548824 -3.184377 -3.745356 25% -0.774890 -0.591841 -0.641675 -0.644144 50% -0.116401 0.101143 0.002073 -0.013611 75% 0.616366 0.780282 0.680391 0.654328 max 3.366626 2.653656 3.260383 3.927528
col = data[ 3 ]
col[ np. abs ( col) > 3 ]
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64
# Select rows with at least one |value| > 3; pass axis by keyword — the
# positional axis argument (.any(1)) is deprecated in pandas.
data[(np.abs(data) > 3).any(axis=1)]
0 1 2 3 5 -0.539741 0.476985 3.248944 -1.021228 97 -0.774363 0.552936 0.106061 3.927528 102 -0.655054 -0.565230 3.176873 0.959533 305 -2.315555 0.457246 -0.025907 -3.399312 324 0.050188 1.951312 3.260383 0.963301 400 0.146326 0.508391 -0.196713 -3.745356 499 -0.293333 -0.242459 -3.056990 1.918403 523 -3.428254 -0.296336 -0.439938 -0.867165 586 0.275144 1.179227 -3.184377 1.369891 808 -0.362528 -3.548824 1.553205 -2.186301 900 3.366626 -2.372214 0.851010 1.332846
data[ np. abs ( data) > 3 ] = np. sign( data) * 3
data. describe( )
0 1 2 3 count 1000.000000 1000.000000 1000.000000 1000.000000 mean -0.067623 0.068473 0.025153 -0.002081 std 0.995485 0.990253 1.003977 0.989736 min -3.000000 -3.000000 -3.000000 -3.000000 25% -0.774890 -0.591841 -0.641675 -0.644144 50% -0.116401 0.101143 0.002073 -0.013611 75% 0.616366 0.780282 0.680391 0.654328 max 3.000000 2.653656 3.000000 3.000000
排列和随机采样
df = DataFrame( np. arange( 5 * 4 ) . reshape( 5 , 4 ) )
sampler = np. random. permutation( 5 )
sampler
array([1, 0, 2, 3, 4])
df
0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 3 12 13 14 15 4 16 17 18 19
df. take( sampler)
0 1 2 3 1 4 5 6 7 0 0 1 2 3 2 8 9 10 11 3 12 13 14 15 4 16 17 18 19
df. take( np. random. permutation( len ( df) ) [ : 3 ] )
0 1 2 3 1 4 5 6 7 3 12 13 14 15 4 16 17 18 19
bag = np. array( [ 5 , 7 , - 1 , 6 , 4 ] )
sampler = np. random. randint( 0 , len ( bag) , size= 10 )
sampler
array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])
draws = bag. take( sampler)
draws
array([ 4, 4, -1, -1, -1, 5, 6, 5, 4, 7])
计算指标/哑变量
如果DataFrame的某一列中含有k个不同的值,则可以派生出一个k列矩阵或DataFrame(其值全为1和0)。 pandas有一个get_dummies函数可以实现该功能
df = DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] ,
'data1' : range ( 6 ) } )
pd. get_dummies( df[ 'key' ] )
a b c 0 0 1 0 1 0 1 0 2 1 0 0 3 0 0 1 4 1 0 0 5 0 1 0
dummies = pd. get_dummies( df[ 'key' ] , prefix= 'key' )
df_with_dummy = df[ [ 'data1' ] ] . join( dummies)
df_with_dummy
data1 key_a key_b key_c 0 0 0 1 0 1 1 0 1 0 2 2 1 0 0 3 3 0 0 1 4 4 1 0 0 5 5 0 1 0
mnames = ['movie_id', 'title', 'genres']
# Use a raw string for the Windows path ('\p', '\d' etc. are invalid escape
# sequences in a plain literal) and pass engine='python' explicitly: the C
# engine cannot handle multi-character separators like '::', which is what
# triggered the ParserWarning seen previously.
movies = pd.read_table(r"E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",
                       sep='::', header=None, names=mnames, engine='python')
movies[:10]
movie_id title genres 0 1 Toy Story (1995) Animation|Children's|Comedy 1 2 Jumanji (1995) Adventure|Children's|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance 3 4 Waiting to Exhale (1995) Comedy|Drama 4 5 Father of the Bride Part II (1995) Comedy 5 6 Heat (1995) Action|Crime|Thriller 6 7 Sabrina (1995) Comedy|Romance 7 8 Tom and Huck (1995) Adventure|Children's 8 9 Sudden Death (1995) Action 9 10 GoldenEye (1995) Action|Adventure|Thriller
genre_iter = ( set ( x. split( '|' ) ) for x in movies. genres)
genres = sorted ( set . union( * genre_iter) )
dummies = DataFrame( np. zeros( ( len ( movies) , len ( genres) ) ) , columns= genres)
for i, gen in enumerate ( movies. genres) :
dummies. loc[ i, gen. split( '|' ) ] = 1
movies_windic = movies. join( dummies. add_prefix( 'Genre_' ) )
movies_windic. loc[ 0 ]
movie_id 1
title Toy Story (1995)
genres Animation|Children's|Comedy
Genre_Action 0.0
Genre_Adventure 0.0
Genre_Animation 1.0
Genre_Children's 1.0
Genre_Comedy 1.0
Genre_Crime 0.0
Genre_Documentary 0.0
Genre_Drama 0.0
Genre_Fantasy 0.0
Genre_Film-Noir 0.0
Genre_Horror 0.0
Genre_Musical 0.0
Genre_Mystery 0.0
Genre_Romance 0.0
Genre_Sci-Fi 0.0
Genre_Thriller 0.0
Genre_War 0.0
Genre_Western 0.0
Name: 0, dtype: object
values = np. random. rand( 10 )
values
array([0.75603383, 0.90830844, 0.96588737, 0.17373658, 0.87592824,
0.75415641, 0.163486 , 0.23784062, 0.85564381, 0.58743194])
bins = [ 0 , 0.2 , 0.4 , 0.6 , 0.8 , 1 ]
pd. get_dummies( pd. cut( values, bins) )
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0] 0 0 0 0 1 0 1 0 0 0 0 1 2 0 0 0 0 1 3 1 0 0 0 0 4 0 0 0 0 1 5 0 0 0 1 0 6 1 0 0 0 0 7 0 1 0 0 0 8 0 0 0 0 1 9 0 0 1 0 0
字符串操作
字符串对象方法
val = 'a,b, guido'
val. split( ',' )
['a', 'b', ' guido']
pieces = [ x. strip( ) for x in val. split( ',' ) ]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::' . join( pieces)
'a::b::guido'
'guido' in val
True
val. index( ',' )
1
val. find( ':' )
-1
val. index( ':' )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
C:\windows\Temp/ipykernel_13684/2927268062.py in <module>
----> 1 val.index(':')
ValueError: substring not found
val. count( ',' )
2
val. replace( ',' , '::' )
'a::b:: guido'
val. replace( ',' , '' )
'ab guido'
正则表达式
正则表达式提供了一种灵活的在文本中搜索或匹配字符串模式的方式。 re模块的函数可以分为三个大类:模式匹配、替换以及拆分。
import re
text = "foo bar\t baz \tqux"
# Raw string for the regex: '\s' in a plain literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning in newer Pythons).
re.split(r'\s+', text)
['foo', 'bar', 'baz', 'qux']
# Raw string avoids the invalid '\s' escape warning in a plain literal.
regex = re.compile(r'\s+')
regex. split( text)
['foo', 'bar', 'baz', 'qux']
regex. findall( text)
[' ', '\t ', ' \t']
findall返回的是字符串中所有的匹配项,而search则只返回第一个匹配项,match只匹配字符串的首部。
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re. compile ( pattern, flags= re. IGNORECASE)
regex. findall( text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com']
m = regex. search( text)
m
<re.Match object; span=(5, 20), match='dave@google.com'>
text[ m. start( ) : m. end( ) ]
'dave@google.com'
print ( regex. match( text) )
None
print ( regex. sub( 'REDACTED' , text) )
Dave REDACTED
Steve REDACTED
Rob REDACTED
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re. compile ( pattern, flags= re. IGNORECASE)
m = regex. match( 'wesm@bright.net' )
m. groups( )
('wesm', 'bright', 'net')
regex. findall( text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com')]
print ( regex. sub( r'Username: \1, Domain: \2, Suffix: \3' , text) )
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
regex = re. compile ( r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""" , flags= re. IGNORECASE| re. VERBOSE)
m = regex. match( 'wesm@bright.net' )
m. groupdict( )
{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}
pandas中矢量化的字符串函数
data = { 'Dave' : 'dave@google.com' , 'Steve' : 'steve@gmail.com' ,
'Rob' : 'rob@gmail.com' , 'Wes' : np. nan}
data = Series( data)
data
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Wes NaN
dtype: object
data. isnull( )
Dave False
Steve False
Rob False
Wes True
dtype: bool
data. str . contains( 'gmail' )
Dave False
Steve True
Rob True
Wes NaN
dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
x = data. str . findall( pattern, flags= re. IGNORECASE)
x
Dave [(dave, google, com)]
Steve [(steve, gmail, com)]
Rob [(rob, gmail, com)]
Wes NaN
dtype: object
matches = data. str . match( pattern, flags= re. IGNORECASE)
matches
Dave True
Steve True
Rob True
Wes NaN
dtype: object
x. str . get( 1 )
Dave NaN
Steve NaN
Rob NaN
Wes NaN
dtype: float64
x. str [ 0 ]
Dave (dave, google, com)
Steve (steve, gmail, com)
Rob (rob, gmail, com)
Wes NaN
dtype: object
data. str [ : 5 ]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
示例:USDA食品数据库
import json
db = json. load( open ( "E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\usda_food\\database.json" ) )
len ( db)
6636
db[ 0 ] . keys( )
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[ 0 ] [ 'nutrients' ] [ 0 ]
{'value': 25.18,
'units': 'g',
'description': 'Protein',
'group': 'Composition'}
nutrients = DataFrame( db[ 0 ] [ 'nutrients' ] )
nutrients[ : 7 ]
value units description group 0 25.18 g Protein Composition 1 29.20 g Total lipid (fat) Composition 2 3.06 g Carbohydrate, by difference Composition 3 3.28 g Ash Other 4 376.00 kcal Energy Energy 5 39.28 g Water Composition 6 1573.00 kJ Energy Energy
info_keys = [ 'description' , 'group' , 'id' , 'manufacturer' ]
info = DataFrame( db, columns= info_keys)
info[ : 5 ]
description group id manufacturer 0 Cheese, caraway Dairy and Egg Products 1008 1 Cheese, cheddar Dairy and Egg Products 1009 2 Cheese, edam Dairy and Egg Products 1018 3 Cheese, feta Dairy and Egg Products 1019 4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
info
description group id manufacturer 0 Cheese, caraway Dairy and Egg Products 1008 1 Cheese, cheddar Dairy and Egg Products 1009 2 Cheese, edam Dairy and Egg Products 1018 3 Cheese, feta Dairy and Egg Products 1019 4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028 ... ... ... ... ... 6631 Bologna, beef, low fat Sausages and Luncheon Meats 42161 6632 Turkey and pork sausage, fresh, bulk, patty or... Sausages and Luncheon Meats 42173 6633 Babyfood, juice, pear Baby Foods 43408 None 6634 Babyfood, dessert, banana yogurt, strained Baby Foods 43539 None 6635 Babyfood, banana no tapioca, strained Baby Foods 43546 None
6636 rows × 4 columns
# pd.value_counts is deprecated (pandas 2.1+); use the Series method.
info.group.value_counts()[:10]
Vegetables and Vegetable Products 812
Beef Products 618
Baked Products 496
Breakfast Cereals 403
Legumes and Legume Products 365
Fast Foods 365
Lamb, Veal, and Game Products 345
Sweets 341
Fruits and Fruit Juices 328
Pork Products 328
Name: group, dtype: int64
nutrients = [ ]
for rec in db:
fnuts = DataFrame( rec[ 'nutrients' ] )
fnuts[ 'id' ] = rec[ 'id' ]
nutrients. append( fnuts)
nutrients = pd. concat( nutrients, ignore_index= True )
nutrients
value units description group id 0 25.180 g Protein Composition 1008 1 29.200 g Total lipid (fat) Composition 1008 2 3.060 g Carbohydrate, by difference Composition 1008 3 3.280 g Ash Other 1008 4 376.000 kcal Energy Energy 1008 ... ... ... ... ... ... 389350 0.000 mcg Vitamin B-12, added Vitamins 43546 389351 0.000 mg Cholesterol Other 43546 389352 0.072 g Fatty acids, total saturated Other 43546 389353 0.028 g Fatty acids, total monounsaturated Other 43546 389354 0.041 g Fatty acids, total polyunsaturated Other 43546
389355 rows × 5 columns
nutrients. duplicated( ) . sum ( )
14179
nutrients = nutrients. drop_duplicates( )
col_mapping = { 'description' : 'food' ,
'group' : 'fgroup' }
info = info. rename( columns= col_mapping, copy= False )
info
food fgroup id manufacturer 0 Cheese, caraway Dairy and Egg Products 1008 1 Cheese, cheddar Dairy and Egg Products 1009 2 Cheese, edam Dairy and Egg Products 1018 3 Cheese, feta Dairy and Egg Products 1019 4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028 ... ... ... ... ... 6631 Bologna, beef, low fat Sausages and Luncheon Meats 42161 6632 Turkey and pork sausage, fresh, bulk, patty or... Sausages and Luncheon Meats 42173 6633 Babyfood, juice, pear Baby Foods 43408 None 6634 Babyfood, dessert, banana yogurt, strained Baby Foods 43539 None 6635 Babyfood, banana no tapioca, strained Baby Foods 43546 None
6636 rows × 4 columns
col_mapping = { 'description' : 'nutrient' ,
'group' : 'nutgroup' }
nutrients = nutrients. rename( columns= col_mapping, copy= False )
nutrients
value units nutrient nutgroup id 0 25.180 g Protein Composition 1008 1 29.200 g Total lipid (fat) Composition 1008 2 3.060 g Carbohydrate, by difference Composition 1008 3 3.280 g Ash Other 1008 4 376.000 kcal Energy Energy 1008 ... ... ... ... ... ... 389350 0.000 mcg Vitamin B-12, added Vitamins 43546 389351 0.000 mg Cholesterol Other 43546 389352 0.072 g Fatty acids, total saturated Other 43546 389353 0.028 g Fatty acids, total monounsaturated Other 43546 389354 0.041 g Fatty acids, total polyunsaturated Other 43546
375176 rows × 5 columns
ndata = pd. merge( nutrients, info, on= 'id' , how= 'outer' )
ndata
value units nutrient nutgroup id food fgroup manufacturer 0 25.180 g Protein Composition 1008 Cheese, caraway Dairy and Egg Products 1 29.200 g Total lipid (fat) Composition 1008 Cheese, caraway Dairy and Egg Products 2 3.060 g Carbohydrate, by difference Composition 1008 Cheese, caraway Dairy and Egg Products 3 3.280 g Ash Other 1008 Cheese, caraway Dairy and Egg Products 4 376.000 kcal Energy Energy 1008 Cheese, caraway Dairy and Egg Products ... ... ... ... ... ... ... ... ... 375171 0.000 mcg Vitamin B-12, added Vitamins 43546 Babyfood, banana no tapioca, strained Baby Foods None 375172 0.000 mg Cholesterol Other 43546 Babyfood, banana no tapioca, strained Baby Foods None 375173 0.072 g Fatty acids, total saturated Other 43546 Babyfood, banana no tapioca, strained Baby Foods None 375174 0.028 g Fatty acids, total monounsaturated Other 43546 Babyfood, banana no tapioca, strained Baby Foods None 375175 0.041 g Fatty acids, total polyunsaturated Other 43546 Babyfood, banana no tapioca, strained Baby Foods None
375176 rows × 8 columns
result = ndata. groupby( [ 'nutrient' , 'fgroup' ] ) [ 'value' ] . quantile( 0.5 )
result[ 'Zinc, Zn' ] . sort_values( ) . plot( kind= 'barh' )
by_nutrient = ndata. groupby( [ 'nutgroup' , 'nutrient' ] )
get_maximum = lambda x: x. xs( x. value. idxmax( ) )
get_minimum = lambda x: x. xs( x. value. idxmin( ) )
max_foods = by_nutrient. apply ( get_maximum) [ [ 'value' , 'food' ] ]
max_foods. food = max_foods. food. str [ : 50 ]
max_foods. loc[ 'Amino Acids' ] [ 'food' ]
nutrient
Alanine Gelatins, dry powder, unsweetened
Arginine Seeds, sesame flour, low-fat
Aspartic acid Soy protein isolate
Cystine Seeds, cottonseed flour, low fat (glandless)
Glutamic acid Soy protein isolate
Glycine Gelatins, dry powder, unsweetened
Histidine Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine Fish, cod, Atlantic, dried and salted
Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline Gelatins, dry powder, unsweetened
Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object