## 等级索引
import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(9 ),index=[['a' , 'a' , 'a' , 'b' , 'b' , 'c' , 'c' , 'd' , 'd' ], [1 , 2 , 3 , 1 , 3 , 1 , 2 , 2 , 3 ]])
data
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64
data.index
MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]]) ### 选择
data.b
1 0.146483 3 -0.165387 dtype: float64
data['b' :'c' ]
b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 dtype: float64
data[['b' ,'d' ]]
b 1 0.146483 3 -0.165387 d 2 0.571641 3 0.249290 dtype: float64
data.loc[:,2 ]
a -0.339180 c -2.067447 d 0.571641 dtype: float64 ### 聚合
data
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64
data.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
1 2 3 a 1.831129 -0.339180 -1.048893 b 0.146483 NaN -0.165387 c 1.173373 -2.067447 NaN d NaN 0.571641 0.249290
data.unstack().stack()
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64 ### dataframe对象
frame = pd.DataFrame(np.arange(12 ).reshape((4 , 3 )),
index=[['a' , 'a' , 'b' , 'b' ], [1 , 2 , 1 , 2 ]],
columns=[['Ohio' , 'Ohio' , 'Colorado' ],
['Green' , 'Red' , 'Green' ]])
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
Ohio Colorado Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
#### 行列名
frame.index.names = ['key1' ,'key2' ]
frame.columns.names = ['state' ,'color' ]
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
#### 列选取
frame.Ohio
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
color Green Red key1 key2 a 1 0 1 2 3 4 b 1 6 7 2 9 10
## 记录和排序
frame.swaplevel('key1' ,'key2' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11
### 行索引排序
frame.sort_index(level=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 b 1 6 7 8 a 2 3 4 5 b 2 9 10 11
frame.swaplevel(0 ,1 ).sort_index(level=0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 b 6 7 8 2 a 3 4 5 b 9 10 11
### 运算
frame.sum(level='key2' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
state Ohio Colorado color Green Red Green key2 1 6 8 10 2 12 14 16
frame.sum(level='color' ,axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
color Green Red key1 key2 a 1 2 1 2 8 4 b 1 14 7 2 20 10
## 列索引
frame = pd.DataFrame({'a' : range(7 ), 'b' : range(7 , 0 , -1 ),
'c' : ['one' , 'one' , 'one' , 'two' , 'two' ,
'two' , 'two' ],
'd' : [0 , 1 , 2 , 0 , 1 , 2 , 3 ]})
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 3 3 4 two 0 4 4 3 two 1 5 5 2 two 2 6 6 1 two 3
#### 变列为行索引
frame2 = frame.set_index(['c' ,'d' ])
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1
frame.set_index(['c' ,'d' ],drop=False )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d c d one 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 two 0 3 4 two 0 1 4 3 two 1 2 5 2 two 2 3 6 1 two 3
frame2.reset_index()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
c d a b 0 one 0 0 7 1 one 1 1 6 2 one 2 2 5 3 two 0 3 4 4 two 1 4 3 5 two 2 5 2 6 two 3 6 1
## 数据集操作 ### 合并
df1 = pd.DataFrame({'key' : ['b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ], 'data1' : range(7 )})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key 0 0 b 1 1 b 2 2 a 3 3 c 4 4 a 5 5 a 6 6 b
df2 = pd.DataFrame({'key' : ['a' , 'b' , 'd' ],'data2' : range(3 )})
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pd.merge(df1,df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key data2 0 0 b 1 1 1 b 1 2 6 b 1 3 2 a 0 4 4 a 0 5 5 a 0
#### 指定列
pd.merge(df1,df2,on='key' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key data2 0 0 b 1 1 1 b 1 2 6 b 1 3 2 a 0 4 4 a 0 5 5 a 0
#### 分别指定
df3 = pd.DataFrame({'lkey' : ['b' , 'b' , 'a' , 'c' , 'a' , 'a' , 'b' ],'data1' : range(7 )})
df4 = pd.DataFrame({'rkey' : ['a' , 'b' , 'd' ],'data2' : range(3 )})
pd.merge(df3,df4,left_on='lkey' ,right_on = 'rkey' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 lkey data2 rkey 0 0 b 1 b 1 1 b 1 b 2 6 b 1 b 3 2 a 0 a 4 4 a 0 a 5 5 a 0 a
#### 外部模式
pd.merge(df1,df2,how='outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key data2 0 0.0 b 1.0 1 1.0 b 1.0 2 6.0 b 1.0 3 2.0 a 0.0 4 4.0 a 0.0 5 5.0 a 0.0 6 3.0 c NaN 7 NaN d 2.0
参数: Option Behavior ‘inner’ Use only the key combinations observed in both tables ‘left’ Use all key combinations found in the left table ‘right’ Use all key combinations found in the right table ‘output’ Use all key combinations observed in both tables together
#### 全部参数展示
df1 = pd.DataFrame({'key' : ['b' , 'b' , 'a' , 'c' , 'a' , 'b' ],'data1' : range(6 )})
df2 = pd.DataFrame({'key' : ['a' , 'b' , 'a' , 'b' , 'd' ], 'data2' : range(5 )})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key 0 0 b 1 1 b 2 2 a 3 3 c 4 4 a 5 5 b
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pd.merge(df1,df2,on='key' ,how='left' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key data2 0 0 b 1.0 1 0 b 3.0 2 1 b 1.0 3 1 b 3.0 4 2 a 0.0 5 2 a 2.0 6 3 c NaN 7 4 a 0.0 8 4 a 2.0 9 5 b 1.0 10 5 b 3.0
pd.merge(df1,df2,how='inner' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 key data2 0 0 b 1 1 0 b 3 2 1 b 1 3 1 b 3 4 5 b 1 5 5 b 3 6 2 a 0 7 2 a 2 8 4 a 0 9 4 a 2
#### 多列
left = pd.DataFrame({'key1' : ['foo' , 'foo' , 'bar' ],'key2' : ['one' , 'two' , 'one' ],'lval' : [1 , 2 , 3 ]})
right = pd.DataFrame({'key1' : ['foo' , 'foo' , 'bar' , 'bar' ],'key2' : ['one' , 'one' , 'one' , 'two' ],'rval' : [4 , 5 , 6 , 7 ]})
pd.merge(left,right,on=['key1' ,'key2' ,],how='outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0
pd.merge(left,right,on='key1' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2_x lval key2_y rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
left
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2 lval 0 foo one 1 1 foo two 2 2 bar one 3
right
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2 rval 0 foo one 4 1 foo one 5 2 bar one 6 3 bar two 7
pd.merge(left,right,on='key1' ,suffixes = ('_left' ,'_right' ))
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1 key2_left lval key2_right rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
参数: Argument Description left DataFrame to be merged on the left side. right DataFrame to be merged on the right side. how One of ‘inner’, ‘outer’, ‘left’, or ‘right’; defaults to ‘inner’. on Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys. left_on Columns in left DataFrame to use as join keys. right_on Analogous to left_on for left DataFrame. left_index Use row index in left as its join key (or keys, if a MultiIndex). right_index Analogous to left_index. sort Sort merged data lexicographically by join keys; True by default (disable to get better performance in some cases on large datasets). suffixes Tuple of string values to append to column names in case of overlap; defaults to (‘_x’, ‘_y’) (e.g., if ‘data’ in both DataFrame objects, would appear as ‘data_x’ and ‘data_y’ in result). copy If False, avoid copying data into resulting data structure in some exceptional cases; by default always copies. indicator Adds a special column _merge that indicates the source of each row; values will be ‘left_only’, ‘right_only’, or ‘both’ based on the origin of the joined data in each row.
### 索引联结
left1 = pd.DataFrame({'key' : ['a' , 'b' , 'a' , 'a' , 'b' , 'c' ],'value' : range(6 )})
right1 = pd.DataFrame({'group_val' : [3.5 , 7 ]}, index=['a' , 'b' ])
left1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key value 0 a 0 1 b 1 2 a 2 3 a 3 4 b 4 5 c 5
right1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pd.merge(left1,right1,left_on = 'key' ,right_index = True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0
pd.merge(left1,right1,left_on = 'key' ,right_index = True ,how= 'outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0 5 c 5 NaN
#### 加之嵌套索引
lefth = pd.DataFrame({'key1' : ['Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ],'key2' : [2000 , 2001 , 2002 , 2001 , 2002 ], 'data' : np.arange(5. )})
lefth
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data key1 key2 0 0.0 Ohio 2000 1 1.0 Ohio 2001 2 2.0 Ohio 2002 3 3.0 Nevada 2001 4 4.0 Nevada 2002
righth = pd.DataFrame(np.arange(12 ).reshape((6 , 2 )),index=[['Nevada' , 'Nevada' , 'Ohio' , 'Ohio' , 'Ohio' , 'Ohio' ],[2001 , 2000 , 2000 , 2000 , 2001 , 2002 ]],columns=['event1' , 'event2' ])
righth
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
event1 event2 Nevada 2001 0 1 2000 2 3 Ohio 2000 4 5 2000 6 7 2001 8 9 2002 10 11
pd.merge(lefth,righth,left_on = ['key1' ,'key2' ],right_index = True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data key1 key2 event1 event2 0 0.0 Ohio 2000 4 5 0 0.0 Ohio 2000 6 7 1 1.0 Ohio 2001 8 9 2 2.0 Ohio 2002 10 11 3 3.0 Nevada 2001 0 1
pd.merge(lefth,righth,left_on = ['key1' ,'key2' ],right_index = True ,how='outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data key1 key2 event1 event2 0 0.0 Ohio 2000 4.0 5.0 0 0.0 Ohio 2000 6.0 7.0 1 1.0 Ohio 2001 8.0 9.0 2 2.0 Ohio 2002 10.0 11.0 3 3.0 Nevada 2001 0.0 1.0 4 4.0 Nevada 2002 NaN NaN 4 NaN Nevada 2000 2.0 3.0
### 保留左右
left2 = pd.DataFrame([[1. , 2. ], [3. , 4. ], [5. , 6. ]],
....: index=['a' , 'c' , 'e' ],
....: columns=['Ohio' , 'Nevada' ])
right2 = pd.DataFrame([[7. , 8. ], [9. , 10. ], [11. , 12. ], [13 , 14 ]],
....: index=['b' , 'c' , 'd' , 'e' ],
....: columns=['Missouri' , 'Alabama' ])
left2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Nevada a 1.0 2.0 c 3.0 4.0 e 5.0 6.0
right2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Missouri Alabama b 7.0 8.0 c 9.0 10.0 d 11.0 12.0 e 13.0 14.0
pd.merge(left2,right2,how='outer' ,left_index = True ,right_index= True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left2.join(right2,how='outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
left1.join(right1,on='key' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key value group_val 0 a 0 3.5 1 b 1 7.0 2 a 2 3.5 3 a 3 3.5 4 b 4 7.0 5 c 5 NaN
### 其他
another = pd.DataFrame([[7. , 8. ], [9. , 10. ], [11. , 12. ], [16. , 17. ]], index=['a' , 'c' , 'e' , 'f' ],columns=['New York' , 'Oregon' ])
another
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
New York Oregon a 7.0 8.0 c 9.0 10.0 e 11.0 12.0 f 16.0 17.0
left2.join([right2,another])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 9.0 10.0 e 5.0 6.0 13.0 14.0 11.0 12.0
left2.join([right2,another],how='outer' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 b NaN NaN 7.0 8.0 NaN NaN c 3.0 4.0 9.0 10.0 9.0 10.0 d NaN NaN 11.0 12.0 NaN NaN e 5.0 6.0 13.0 14.0 11.0 12.0 f NaN NaN NaN NaN 16.0 17.0
### 用轴联结
arr = np.arange(12 ).reshape((3 ,4 ))
arr
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
np.concatenate([arr,arr],axis=1 )
array([[ 0, 1, 2, 3, 0, 1, 2, 3], [ 4, 5, 6, 7, 4, 5, 6, 7], [ 8, 9, 10, 11, 8, 9, 10, 11]]) #### concat
s1 = pd.Series([0 , 1 ], index=['a' , 'b' ])
s2 = pd.Series([2 , 3 , 4 ], index=['c' , 'd' , 'e' ])
s3 = pd.Series([5 , 6 ], index=['f' , 'g' ])
pd.concat([s1,s2,s3])
a 0 b 1 c 2 d 3 e 4 f 5 g 6 dtype: int64
pd.concat([s1,s2,s3],axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 2 a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
#### 内联结
s4 = pd.concat([s1,s3])
s4
a 0 b 1 f 5 g 6 dtype: int64
pd.concat([s1,s4],axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pd.concat([s1,s4],axis=1 ,join='inner' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
#### 指定行
pd.concat([s1,s4],axis=1 ,join_axes=[['a' ,'c' ,'b' ,'e' ]])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1 a 0.0 0.0 c NaN NaN b 1.0 1.0 e NaN NaN
#### 同时产生嵌套索引
result= pd.concat([s1,s1,s3],keys = ['one' ,'two' ,'three' ])
result
one a 0 b 1 two a 0 b 1 three f 5 g 6 dtype: int64
result.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b f g one 0.0 1.0 NaN NaN two 0.0 1.0 NaN NaN three NaN NaN 5.0 6.0
#### 列
pd.concat([s1, s2, s3], axis=1 , keys=['one' , 'two' , 'three' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
one two three a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
### dataframe对象
df1 = pd.DataFrame(np.arange(6 ).reshape(3 , 2 ), index=['a' , 'b' , 'c' ], columns=['one' , 'two' ])
df2 = pd.DataFrame(5 + np.arange(4 ).reshape(2 , 2 ), index=['a' , 'c' ], columns=['three' , 'four' ])
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
pd.concat([df1,df2],axis=1 ,keys = ['level1' ,'level2' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
#### dict写法
pd.concat({'level1' :df1,'level2' :df2},axis=1 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
#### 命名
pd.concat([df1,df2],axis=1 ,keys=['level1' ,'level2' ],names = ['upper' ,'lower' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
upper level1 level2 lower one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0
### 不含任何相同索引
df1 = pd.DataFrame(np.random.randn(3 , 4 ), columns=['a' , 'b' , 'c' , 'd' ])
df2 = pd.DataFrame(np.random.randn(2 , 3 ), columns=['b' , 'd' , 'a' ])
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 0.636284 -0.256463 -0.297701 2.199012 1 0.154697 -1.487730 -0.009024 0.268773 2 1.072418 1.663895 0.114226 -0.298354
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
b d a 0 -0.599669 -0.671364 1.454462 1 -0.132970 -2.897728 -2.880973
pd.concat([df1,df2],ignore_index=True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d 0 0.636284 -0.256463 -0.297701 2.199012 1 0.154697 -1.487730 -0.009024 0.268773 2 1.072418 1.663895 0.114226 -0.298354 3 1.454462 -0.599669 NaN -0.671364 4 -2.880973 -0.132970 NaN -2.897728
concat参数: Argument Description objs List or dict of pandas objects to be concatenated; this is the only required argument axis Axis to concatenate along; defaults to 0 (along rows) join Either ‘inner’ or ‘outer’ (‘outer’ by default); whether to intersection (inner) or union (outer) together indexes along the other axes join_axes Specific indexes to use for the other n–1 axes instead of performing union/intersection logic keys Values to associate with objects being concatenated, forming a hierarchical index along the concatenation axis; can either be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple-level arrays passed in levels) levels Specific indexes to use as hierarchical index level or levels if keys passed names Names for created hierarchical levels if keys and/or levels passed verify_integrity Check new axis in concatenated object for duplicates and raise exception if so; by default (False) allows duplicates ignore_index Do not preserve indexes along concatenation axis, instead producing a new range(total_length) index
### 含重复项
a = pd.Series([np.nan, 2.5 , np.nan, 3.5 , 4.5 , np.nan], index=['f' , 'e' , 'd' , 'c' , 'b' , 'a' ])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f' , 'e' , 'd' , 'c' , 'b' , 'a' ])
b[-1 ] = np.nan
a
f NaN e 2.5 d NaN c 3.5 b 4.5 a NaN dtype: float64
b
f 0.0 e 1.0 d 2.0 c 3.0 b 4.0 a NaN dtype: float64 ### where
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 2. , 3.5, 4.5, nan]) ### 不知道
b[:-2 ].combine_first(a[2 :])
a NaN b 4.5 c 3.0 d 2.0 e 1.0 f 0.0 dtype: float64 #### dataframe
df1 = pd.DataFrame({'a' : [1. , np.nan, 5. , np.nan],
.....: 'b' : [np.nan, 2. , np.nan, 6. ],
.....: 'c' : range(2 , 18 , 4 )})
df2 = pd.DataFrame({'a' : [5. , 4. , np.nan, 3. , 7. ],
.....: 'b' : [np.nan, 3. , 4. , 6. , 8. ]})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c 0 1.0 NaN 2 1 NaN 2.0 6 2 5.0 NaN 10 3 NaN 6.0 14
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b 0 5.0 NaN 1 4.0 3.0 2 NaN 4.0 3 3.0 6.0 4 7.0 8.0
df1.combine_first(df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c 0 1.0 NaN 2.0 1 4.0 2.0 6.0 2 5.0 4.0 10.0 3 3.0 6.0 14.0 4 7.0 8.0 NaN
## 变形记 ### 嵌套索引变形 ### 列to行
data = pd.DataFrame(np.arange(6 ).reshape((2 , 3 )),index=pd.Index(['Ohio' , 'Colorado' ], name='state' ),columns=pd.Index(['one' , 'two' , 'three' ],name='number' ))
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
number one two three state Ohio 0 1 2 Colorado 3 4 5
result = data.stack()
result
state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 dtype: int64 #### 行to列
result.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
number one two three state Ohio 0 1 2 Colorado 3 4 5
result.unstack(0 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
state Ohio Colorado number one 0 3 two 1 4 three 2 5
result.unstack('state' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
state Ohio Colorado number one 0 3 two 1 4 three 2 5
##### 找不到则为NA
s1 = pd.Series([0 , 1 , 2 , 3 ], index=['a' , 'b' , 'c' , 'd' ])
s2 = pd.Series([4 , 5 , 6 ], index=['c' , 'd' , 'e' ])
data2 = pd.concat([s1, s2], keys=['one' , 'two' ])
data2.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e one 0.0 1.0 2.0 3.0 NaN two NaN NaN 4.0 5.0 6.0
data2.unstack().stack()
one a 0.0 b 1.0 c 2.0 d 3.0 two c 4.0 d 5.0 e 6.0 dtype: float64
data2.unstack().stack(dropna=False )
one a 0.0 b 1.0 c 2.0 d 3.0 e NaN two a NaN b NaN c 4.0 d 5.0 e 6.0 dtype: float64 #### dataframe
df = pd.DataFrame({'left' : result, 'right' : result + 5 }, columns=pd.Index(['left' , 'right' ], name='side' ))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
side left right state number Ohio one 0 5 two 1 6 three 2 7 Colorado one 3 8 two 4 9 three 5 10
df.unstack('state' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
side left right state Ohio Colorado Ohio Colorado number one 0 3 5 8 two 1 4 6 9 three 2 5 7 10
#### 同时指定
df.unstack('state' ).stack('side' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
state Colorado Ohio number side one left 3 0 right 8 5 two left 4 1 right 9 6 three left 5 2 right 10 7
### 长变宽
data = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/macrodata.csv' )
data.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint 0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00 1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74 2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09 3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06 4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date' )
columns = pd.Index(['realgdp' ,'infl' ,'unemp' ],name='item' )
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D' ,'end' )
ldata = data.stack().reset_index().rename(columns={0 :'value' })
ldata.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
date item value 0 1959-03-31 realgdp 2710.349 1 1959-03-31 infl 0.000 2 1959-03-31 unemp 5.800 3 1959-06-30 realgdp 2778.801 4 1959-06-30 infl 2.340
### 宽变长
df = pd.DataFrame({'key' : ['foo' , 'bar' , 'baz' ],
.....: 'A' : [1 , 2 , 3 ],
.....: 'B' : [4 , 5 , 6 ],
.....: 'C' : [7 , 8 , 9 ]})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
A B C key 0 1 4 7 foo 1 2 5 8 bar 2 3 6 9 baz
melted = pd.melt(df,['key' ])
melted
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key variable value 0 foo A 1 1 bar A 2 2 baz A 3 3 foo B 4 4 bar B 5 5 baz B 6 6 foo C 7 7 bar C 8 8 baz C 9
reshaped = melted.pivot('key' ,'variable' ,'value' )