自我学习学习笔记:pandas索引拼接重塑 1、分层索引 2、联合与合并数据集 3、重塑和透视
1分层索引
import pandas as pd
import numpy as np
生成数据
data = pd. Series( np. random. randn( 9 ) , index= [ [ 'a' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' , 'd' , 'd' ] , [ 1 , 2 , 3 , 1 , 3 , 1 , 2 , 2 , 3 ] ] )
data
a 1 0.269045
2 -0.885298
3 0.158806
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
d 2 -0.088124
3 -1.746533
dtype: float64
data. index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
分层索引
data[ 'b' ]
1 0.378847
3 0.301012
dtype: float64
data[ 'b' : 'c' ]
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
dtype: float64
data. loc[ [ 'b' , 'c' ] ]
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
dtype: float64
在内部层级中进行选择
data. loc[ : , 2 ]
a -0.885298
c -1.453824
d -0.088124
dtype: float64
使用unstack将数据重新排列
data. unstack( )
1 2 3 a 0.269045 -0.885298 0.158806 b 0.378847 NaN 0.301012 c -1.255018 -1.453824 NaN d NaN -0.088124 -1.746533
unstack的反操作为stack
data. unstack( ) . stack( )
a 1 0.269045
2 -0.885298
3 0.158806
b 1 0.378847
3 0.301012
c 1 -1.255018
2 -1.453824
d 2 -0.088124
3 -1.746533
dtype: float64
分层索引
frame = pd. DataFrame( np. arange( 12 ) . reshape( ( 4 , 3 ) ) , index= [ [ 'a' , 'a' , 'b' , 'b' ] , [ 1 , 2 , 1 , 2 ] ] , columns= [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Gerrn' ] ] )
frame
Ohio Colorado Green Red Gerrn a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
添加索引名称
frame. index. names = [ 'key1' , 'key2' ]
frame. columns. names = [ 'state' , 'color' ]
frame
state Ohio Colorado color Green Red Gerrn key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
列索引
frame[ 'Ohio' ]
color Green Red key1 key2 a 1 0 1 2 3 4 b 1 6 7 2 9 10
from pandas import MultiIndex
使用MultiIndex创造带有层级的列
MultiIndex. from_arrays( [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Gerrn' ] ] , names= [ 'state' , 'color' ] )
MultiIndex([( 'Ohio', 'Green'),
( 'Ohio', 'Red'),
('Colorado', 'Gerrn')],
names=['state', 'color'])
1.1重排序和层级排序
frame. swaplevel( 'key1' , 'key2' )
state Ohio Colorado color Green Red Gerrn key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11
frame. sort_index( level= 1 )
state Ohio Colorado color Green Red Gerrn key1 key2 a 1 0 1 2 b 1 6 7 8 a 2 3 4 5 b 2 9 10 11
frame. swaplevel( 0 , 1 ) . sort_index( level= 0 )
state Ohio Colorado color Green Red Gerrn key2 key1 1 a 0 1 2 b 6 7 8 2 a 3 4 5 b 9 10 11
1.2按层级进行汇总统计
frame. sum ( level= 'key2' )
state Ohio Colorado color Green Red Gerrn key2 1 6 8 10 2 12 14 16
frame. sum ( level= 'color' , axis= 1 )
color Green Red Gerrn key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
1.3使用列进行索引
frame = pd. DataFrame( { 'a' : range ( 7 ) , 'b' : range ( 7 , 0 , - 1 ) , 'c' : [ 'one' , 'one' , 'one' , 'two' , 'two' , 'two' , 'two' ] , 'd' : [ 0 , 1 , 2 , 0 , 1 , 2 , 3 ] } )
frame
a b c d 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 3 3 4 two 0 4 4 3 two 1 5 5 2 two 2 6 6 1 two 3
frame2 = frame. set_index( [ 'c' , 'd' ] )
frame2
a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1
frame. set_index( [ 'c' , 'd' ] , drop= False )
a b c d c d one 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 two 0 3 4 two 0 1 4 3 two 1 2 5 2 two 2 3 6 1 two 3
frame2. reset_index( )
c d a b 0 one 0 0 7 1 one 1 1 6 2 one 2 2 5 3 two 0 3 4 4 two 1 4 3 5 two 2 5 2 6 two 3 6 1
2联合与合并数据集
2.1数据库分格的DateFrame连接
df1 = pd. DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ] , 'data1' : range ( 7 ) } )
df1
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 a 5 6 b 6
df2 = pd. DataFrame( { 'key' : [ 'a' , 'b' , 'd' ] , 'data2' : range ( 3 ) } )
df2
pd. merge( df1, df2)
key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0
pd. merge( df1, df2, on= 'key' )
key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0
df3 = pd. DataFrame( { 'lkey' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ] , 'data1' : range ( 7 ) } )
df3
lkey data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 a 5 6 b 6
df4 = pd. DataFrame( { 'rkey' : [ 'a' , 'b' , 'd' ] , 'data2' : range ( 3 ) } )
df4
pd. merge( df3, df4, left_on= 'lkey' , right_on= 'rkey' )
lkey data1 rkey data2 0 b 0 b 1 1 b 1 b 1 2 b 6 b 1 3 a 2 a 0 4 a 4 a 0 5 a 5 a 0
pd. merge( df1, df2, how= 'outer' )
key data1 data2 0 b 0.0 1.0 1 b 1.0 1.0 2 b 6.0 1.0 3 a 2.0 0.0 4 a 4.0 0.0 5 a 5.0 0.0 6 c 3.0 NaN 7 d NaN 2.0
df1 = pd. DataFrame( { 'key' : [ 'b' , 'b' , 'a' , 'c' , 'a' , 'b' ] , 'data1' : range ( 6 ) } )
df1
key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 b 5
df2 = pd. DataFrame( { 'key' : [ 'a' , 'b' , 'a' , 'b' , 'd' ] , 'data2' : range ( 5 ) } )
df2
pd. merge( df1, df2, on= 'key' , how= 'left' )
key data1 data2 0 b 0 1.0 1 b 0 3.0 2 b 1 1.0 3 b 1 3.0 4 a 2 0.0 5 a 2 2.0 6 c 3 NaN 7 a 4 0.0 8 a 4 2.0 9 b 5 1.0 10 b 5 3.0
pd. merge( df1, df2, how= 'inner' )
key data1 data2 0 b 0 1 1 b 0 3 2 b 1 1 3 b 1 3 4 b 5 1 5 b 5 3 6 a 2 0 7 a 2 2 8 a 4 0 9 a 4 2
left = pd. DataFrame( { 'key1' : [ 'foo' , 'foo' , 'bar' ] , 'key2' : [ 'one' , 'two' , 'one' ] , 'lval' : [ 1 , 2 , 3 ] } )
left
key1 key2 lval 0 foo one 1 1 foo two 2 2 bar one 3
right = pd. DataFrame( { 'key1' : [ 'foo' , 'foo' , 'bar' , 'bar' ] , 'key2' : [ 'one' , 'one' , 'one' , 'two' ] , 'rval' : [ 4 , 5 , 6 , 7 ] } )
right
key1 key2 rval 0 foo one 4 1 foo one 5 2 bar one 6 3 bar two 7
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'outer' )
key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0
pd. merge( left, right, on= 'key1' )
key1 key2_x lval key2_y rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
pd. merge( left, right, on= 'key1' , suffixes= ( '_left' , '_right' ) )
key1 key2_left lval key2_right rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
2.2根据索引合并
left1 = pd. DataFrame( { 'key' : [ 'a' , 'b' , 'a' , 'a' , 'b' , 'c' ] , 'value' : range ( 6 ) } )
left1
key value 0 a 0 1 b 1 2 a 2 3 a 3 4 b 4 5 c 5
right1 = pd. DataFrame( { 'group_val' : [ 3.5 , 7 ] } , index= [ 'a' , 'b' ] )
right1
pd. merge( left1, right1, left_on= 'key' , right_index= True )
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0
pd. merge( left1, right1, left_on= 'key' , right_index= True , how= 'outer' )
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0 5 c 5 NaN
lefth = pd. DataFrame( { 'key1' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] , 'key2' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] , 'data' : np. arange( 5. ) } )
lefth
key1 key2 data 0 Ohio 2000 0.0 1 Ohio 2001 1.0 2 Ohio 2002 2.0 3 Nevada 2001 3.0 4 Nevada 2002 4.0
righth = pd. DataFrame( np. arange( 12 ) . reshape( ( 6 , 2 ) ) , index= [ [ 'Nevada' , 'Nevada' , 'Ohio' , 'Ohio' , 'Ohio' , 'Ohio' ] , [ 2001 , 2000 , 2000 , 2000 , 2001 , 2002 ] ] , columns= [ 'event1' , 'event2' ] )
righth
event1 event2 Nevada 2001 0 1 2000 2 3 Ohio 2000 4 5 2000 6 7 2001 8 9 2002 10 11
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] , right_index= True )
key1 key2 data event1 event2 0 Ohio 2000 0.0 4 5 0 Ohio 2000 0.0 6 7 1 Ohio 2001 1.0 8 9 2 Ohio 2002 2.0 10 11 3 Nevada 2001 3.0 0 1
pd. merge( lefth, righth, left_on= [ 'key1' , 'key2' ] , right_index= True , how= 'outer' )
key1 key2 data event1 event2 0 Ohio 2000 0.0 4.0 5.0 0 Ohio 2000 0.0 6.0 7.0 1 Ohio 2001 1.0 8.0 9.0 2 Ohio 2002 2.0 10.0 11.0 3 Nevada 2001 3.0 0.0 1.0 4 Nevada 2002 4.0 NaN NaN 4 Nevada 2000 NaN 2.0 3.0
使用两边的索引进行合并
left2 = pd. DataFrame( [ [ 1. , 2. ] , [ 3. , 4. ] , [ 5. , 6. ] ] , index= [ 'a' , 'c' , 'e' ] , columns= [ 'Ohio' , 'Nevada' ] )
left2
Ohio Nevada a 1.0 2.0 c 3.0 4.0 e 5.0 6.0
right2 = pd. DataFrame( [ [ 7. , 8. ] , [ 9. , 10. ] , [ 11. , 12. ] , [ 13. , 14. ] ] , index= [ 'b' , 'c' , 'd' , 'e' ] , columns= [ 'Missouri' , 'Alabama' ] )
right2
Missouri Alabama b 7.0 8.0 c 9.0 10.0 d 11.0 12.0 e 13.0 14.0
pd. merge( left2, right2, how= 'outer' , left_index= True , right_index= True )
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left2. join( right2, how= 'outer' )
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left1. join( right1, on= 'key' )
key value group_val 0 a 0 3.5 1 b 1 7.0 2 a 2 3.5 3 a 3 3.5 4 b 4 7.0 5 c 5 NaN
another = pd. DataFrame( [ [ 7. , 8. ] , [ 9. , 10. ] , [ 11. , 12. ] , [ 16. , 17. ] ] , index= [ 'a' , 'c' , 'e' , 'f' ] , columns= [ 'New York' , 'Oregon' ] )
another
New York Oregon a 7.0 8.0 c 9.0 10.0 e 11.0 12.0 f 16.0 17.0
left2. join( [ right2, another] )
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 9.0 10.0 e 5.0 6.0 13.0 14.0 11.0 12.0
left2. join( [ right2, another] , how= 'outer' )
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 9.0 10.0 e 5.0 6.0 13.0 14.0 11.0 12.0 b NaN NaN 7.0 8.0 NaN NaN d NaN NaN 11.0 12.0 NaN NaN f NaN NaN NaN NaN 16.0 17.0
2.3沿轴向连接
arr = np. arange( 12 ) . reshape( ( 3 , 4 ) )
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
np. concatenate( [ arr, arr] , axis= 1 )
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
s1 = pd. Series( [ 0 , 1 ] , index= [ 'a' , 'b' ] )
s1
a 0
b 1
dtype: int64
s2 = pd. Series( [ 2 , 3 , 4 ] , index= [ 'c' , 'd' , 'e' ] )
s2
c 2
d 3
e 4
dtype: int64
s3 = pd. Series( [ 5 , 6 ] , index= [ 'f' , 'g' ] )
s3
f 5
g 6
dtype: int64
pd. concat( [ s1, s2, s3] )
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
pd. concat( [ s1, s2, s3] , axis= 1 )
0 1 2 a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
s4 = pd. concat( [ s1, s3] )
s4
a 0
b 1
f 5
g 6
dtype: int64
pd. concat( [ s1, s4] , axis= 1 )
pd. concat( [ s1, s4] , axis= 1 , join= 'inner' )
result = pd. concat( [ s1, s1, s3] , keys= [ 'one' , 'two' , 'three' ] )
result
one a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64
result. unstack( )
a b f g one 0.0 1.0 NaN NaN two 0.0 1.0 NaN NaN three NaN NaN 5.0 6.0
pd. concat( [ s1, s2, s3] , axis= 1 , keys= [ 'one' , 'two' , 'three' ] )
one two three a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
df1 = pd. DataFrame( np. arange( 6 ) . reshape( 3 , 2 ) , index= [ 'a' , 'b' , 'c' ] , columns= [ 'one' , 'two' ] )
df1
df2 = pd. DataFrame( 5 + np. arange( 4 ) . reshape( 2 , 2 ) , index= [ 'a' , 'c' ] , columns= [ 'three' , 'four' ] )
df2
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] )
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
pd. concat( { 'level1' : df1, 'level2' : df2} , axis= 1 )
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
pd. concat( [ df1, df2] , axis= 1 , keys= [ 'level1' , 'level2' ] , names= [ 'upper' , 'lower' ] )
upper level1 level2 lower one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
df1 = pd. DataFrame( np. random. randn( 3 , 4 ) , columns= [ 'a' , 'b' , 'c' , 'd' ] )
df1
a b c d 0 0.030306 -0.665932 -0.180251 -0.562281 1 -0.562836 -1.362955 -1.928334 -0.991793 2 -1.382119 0.695608 0.638588 0.812288
df2 = pd. DataFrame( np. random. randn( 2 , 3 ) , columns= [ 'b' , 'd' , 'a' ] )
df2
b d a 0 0.323228 -1.229234 2.685688 1 -1.968068 -1.996343 -1.134667
pd. concat( [ df1, df2] , ignore_index= True )
a b c d 0 0.030306 -0.665932 -0.180251 -0.562281 1 -0.562836 -1.362955 -1.928334 -0.991793 2 -1.382119 0.695608 0.638588 0.812288 3 2.685688 0.323228 NaN -1.229234 4 -1.134667 -1.968068 NaN -1.996343
2.4联合重叠数据
a = pd. Series( [ np. nan, 2.5 , 0.0 , 3.5 , 4.5 , np. nan] , index= [ 'f' , 'e' , 'd' , 'c' , 'b' , 'a' ] )
a
f NaN
e 2.5
d 0.0
c 3.5
b 4.5
a NaN
dtype: float64
b = pd. Series( [ 0. , np. nan, 2. , np. nan, np. nan, 5. ] , index= [ 'a' , 'b' , 'c' , 'd' , 'e' , 'f' ] )
b
a 0.0
b NaN
c 2.0
d NaN
e NaN
f 5.0
dtype: float64
np. where( pd. isnull( a) , b, a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
b. combine_first( a)
a 0.0
b 4.5
c 2.0
d 0.0
e 2.5
f 5.0
dtype: float64
df1 = pd. DataFrame( { 'a' : [ 1. , np. nan, 5. , np. nan] , 'b' : [ np. nan, 2. , np. nan, 6. ] , 'c' : range ( 2 , 18 , 4 ) } )
df1
a b c 0 1.0 NaN 2 1 NaN 2.0 6 2 5.0 NaN 10 3 NaN 6.0 14
df2 = pd. DataFrame( { 'a' : [ 5. , 4. , np. nan, 3. , 7. ] , 'b' : [ np. nan, 3. , 4. , 6. , 8. ] } )
df2
a b 0 5.0 NaN 1 4.0 3.0 2 NaN 4.0 3 3.0 6.0 4 7.0 8.0
df1. combine_first( df2)
a b c 0 1.0 NaN 2.0 1 4.0 2.0 6.0 2 5.0 4.0 10.0 3 3.0 6.0 14.0 4 7.0 8.0 NaN
3.重塑和透视
3.1使用多层索引进行重塑
data = pd. DataFrame( np. arange( 6 ) . reshape( ( 2 , 3 ) ) , index= pd. Index( [ 'Ohio' , 'Colorado' ] , name= 'state' ) , columns= pd. Index( [ 'one' , 'two' , 'three' ] , name= 'number' ) )
data
number one two three state Ohio 0 1 2 Colorado 3 4 5
stack将列透视到行
result = data. stack( )
result
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
result. unstack( )
number one two three state Ohio 0 1 2 Colorado 3 4 5
result. unstack( 0 )
state Ohio Colorado number one 0 3 two 1 4 three 2 5
result. unstack( 'state' )
state Ohio Colorado number one 0 3 two 1 4 three 2 5
s1 = pd. Series( [ 0 , 1 , 2 , 3 ] , index= [ 'a' , 'b' , 'c' , 'd' ] )
s1
a 0
b 1
c 2
d 3
dtype: int64
s2 = pd. Series( [ 4 , 5 , 6 ] , index= [ 'c' , 'd' , 'e' ] )
s2
c 4
d 5
e 6
dtype: int64
data2 = pd. concat( [ s1, s2] , keys= [ 'one' , 'two' ] )
data2
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
data2. unstack( )
a b c d e one 0.0 1.0 2.0 3.0 NaN two NaN NaN 4.0 5.0 6.0
data2. unstack( ) . stack( )
one a 0.0
b 1.0
c 2.0
d 3.0
two c 4.0
d 5.0
e 6.0
dtype: float64
data2. unstack( ) . stack( dropna= False )
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
df = pd. DataFrame( { 'left' : result, 'right' : result+ 5 } , columns= pd. Index( [ 'left' , 'right' ] , name= 'side' ) )
df
side left right state number Ohio one 0 5 two 1 6 three 2 7 Colorado one 3 8 two 4 9 three 5 10
df. unstack( 'state' )
side left right state Ohio Colorado Ohio Colorado number one 0 3 5 8 two 1 4 6 9 three 2 5 7 10
df. unstack( 'state' ) . stack( 'side' )
state Colorado Ohio number side one left 3 0 right 8 5 two left 4 1 right 9 6 three left 5 2 right 10 7
3.2将“长”透视为“宽”
data = pd. read_csv( 'examples/macrodata.csv' )
data. head( )
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint 0 1959 1 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00 1 1959 2 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74 2 1959 3 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09 3 1959 4 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06 4 1960 1 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
periods = pd. PeriodIndex( year= data. year, quarter= data. quarter, name= 'date' )
columns = pd. Index( [ 'realgdp' , 'infl' , 'unemp' ] , name= 'item' )
data = data. reindex( columns= columns)
data. index = periods. to_timestamp( 'D' , 'end' )
ldata = data. stack( ) . reset_index( ) . rename( columns= { 0 : 'value' } )
ldata[ : 10 ]
date item value 0 1959-03-31 23:59:59.999999999 realgdp 2710.349 1 1959-03-31 23:59:59.999999999 infl 0.000 2 1959-03-31 23:59:59.999999999 unemp 5.800 3 1959-06-30 23:59:59.999999999 realgdp 2778.801 4 1959-06-30 23:59:59.999999999 infl 2.340 5 1959-06-30 23:59:59.999999999 unemp 5.100 6 1959-09-30 23:59:59.999999999 realgdp 2775.488 7 1959-09-30 23:59:59.999999999 infl 2.740 8 1959-09-30 23:59:59.999999999 unemp 5.300 9 1959-12-31 23:59:59.999999999 realgdp 2785.204
pivoted = ldata. pivot( 'date' , 'item' , 'value' )
pivoted
item infl realgdp unemp date 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2 ... ... ... ... 2008-09-30 23:59:59.999999999 -3.16 13324.600 6.0 2008-12-31 23:59:59.999999999 -8.79 13141.920 6.9 2009-03-31 23:59:59.999999999 0.94 12925.410 8.1 2009-06-30 23:59:59.999999999 3.37 12901.504 9.2 2009-09-30 23:59:59.999999999 3.56 12990.341 9.6
203 rows × 3 columns
ldata[ 'value2' ] = np. random. randn( len ( ldata) )
ldata[ : 10 ]
date item value value2 0 1959-03-31 23:59:59.999999999 realgdp 2710.349 1.417751 1 1959-03-31 23:59:59.999999999 infl 0.000 -0.843318 2 1959-03-31 23:59:59.999999999 unemp 5.800 0.752850 3 1959-06-30 23:59:59.999999999 realgdp 2778.801 1.481983 4 1959-06-30 23:59:59.999999999 infl 2.340 0.912075 5 1959-06-30 23:59:59.999999999 unemp 5.100 -0.530813 6 1959-09-30 23:59:59.999999999 realgdp 2775.488 1.443973 7 1959-09-30 23:59:59.999999999 infl 2.740 1.977522 8 1959-09-30 23:59:59.999999999 unemp 5.300 0.621740 9 1959-12-31 23:59:59.999999999 realgdp 2785.204 -0.379998
pivoted = ldata. pivot( 'date' , 'item' )
pivoted[ : 5 ]
value value2 item infl realgdp unemp infl realgdp unemp date 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 -0.843318 1.417751 0.752850 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 0.912075 1.481983 -0.530813 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1.977522 1.443973 0.621740 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1.356540 -0.379998 -0.738881 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2 1.273626 -1.512028 -0.234333
pivoted[ 'value' ] [ : 5 ]
item infl realgdp unemp date 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
unstacked = ldata. set_index( [ 'date' , 'item' ] ) . unstack( 'item' )
unstacked[ : 7 ]
value value2 item infl realgdp unemp infl realgdp unemp date 1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 -0.843318 1.417751 0.752850 1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 0.912075 1.481983 -0.530813 1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1.977522 1.443973 0.621740 1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1.356540 -0.379998 -0.738881 1960-03-31 23:59:59.999999999 2.31 2847.699 5.2 1.273626 -1.512028 -0.234333 1960-06-30 23:59:59.999999999 0.14 2834.390 5.2 1.304159 -0.886630 -0.178589 1960-09-30 23:59:59.999999999 2.70 2839.022 5.6 0.120983 -0.193590 0.747989
3.3将“宽”透视为“长”
df = pd. DataFrame( { 'key' : [ 'foo' , 'bar' , 'baz' ] , 'A' : [ 1 , 2 , 3 ] , 'B' : [ 4 , 5 , 6 ] , 'C' : [ 7 , 8 , 9 ] } )
df
key A B C 0 foo 1 4 7 1 bar 2 5 8 2 baz 3 6 9
melted = pd. melt( df, [ 'key' ] )
melted
key variable value 0 foo A 1 1 bar A 2 2 baz A 3 3 foo B 4 4 bar B 5 5 baz B 6 6 foo C 7 7 bar C 8 8 baz C 9
reshaped = melted. pivot( 'key' , 'variable' , 'value' )
reshaped
variable A B C key bar 2 5 8 baz 3 6 9 foo 1 4 7
reshaped. reset_index( )
variable key A B C 0 bar 2 5 8 1 baz 3 6 9 2 foo 1 4 7
pd. melt( df, value_vars= [ 'A' , 'B' , 'C' ] )
variable value 0 A 1 1 A 2 2 A 3 3 B 4 4 B 5 5 B 6 6 C 7 7 C 8 8 C 9
pd. melt( df, value_vars= [ 'key' , 'A' , 'B' ] )
variable value 0 key foo 1 key bar 2 key baz 3 A 1 4 A 2 5 A 3 6 B 4 7 B 5 8 B 6