pandas Data Indexing, Joining, Combining, and Reshaping

Self-study notes: pandas indexing, concatenation, and reshaping
1. Hierarchical indexing
2. Combining and merging datasets
3. Reshaping and pivoting

1 Hierarchical Indexing

import pandas as pd
import numpy as np

Generate some sample data

data = pd.Series(np.random.randn(9),index=[['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]])
data
a  1    0.269045
   2   -0.885298
   3    0.158806
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
d  2   -0.088124
   3   -1.746533
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

Partial indexing with the outer level

data['b']
1    0.378847
3    0.301012
dtype: float64
data['b':'c']
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
dtype: float64
data.loc[['b','c']]
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
dtype: float64

Selecting from an inner level

data.loc[:, 2]
a   -0.885298
c   -1.453824
d   -0.088124
dtype: float64
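
An equivalent way to take this inner-level cross-section is xs, which selects by a label on a given level. This is a minimal sketch, not part of the original notes:

data.xs(2, level=1)  # same result as data.loc[:, 2]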

Rearrange the data into a DataFrame with unstack

data.unstack()
          1         2         3
a  0.269045 -0.885298  0.158806
b  0.378847       NaN  0.301012
c -1.255018 -1.453824       NaN
d       NaN -0.088124 -1.746533

The inverse operation of unstack is stack

data.unstack().stack()
a  1    0.269045
   2   -0.885298
   3    0.158806
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
d  2   -0.088124
   3   -1.746533
dtype: float64

Hierarchical indexing on both axes of a DataFrame

frame = pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame
     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11

Add names for the index levels

frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
state      Ohio     Colorado
color     Green Red    Green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11

Column indexing

frame['Ohio']
color      Green  Red
key1 key2
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
from pandas import MultiIndex

Use MultiIndex to create hierarchical columns

MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])
MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])
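
Such a MultiIndex can then be passed directly as the columns when constructing a DataFrame. A minimal sketch (the cols variable is only illustrative) that reproduces the frame built above:

cols = MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])
pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=cols)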

1.1 Reordering and Sorting Levels

frame.swaplevel('key1','key2')
state      Ohio     Colorado
color     Green Red    Green
key2 key1
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
frame.sort_index(level=1)
state      Ohio     Colorado
color     Green Red    Green
key1 key2
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
frame.swaplevel(0,1).sort_index(level=0)
state      Ohio     Colorado
color     Green Red    Green
key2 key1
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11

1.2 Summary Statistics by Level

frame.sum(level='key2')
state  Ohio     Colorado
color Green Red    Green
key2
1         6   8       10
2        12  14       16
frame.sum(level='color',axis=1)
color      Green  Red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10
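
Note: in recent pandas versions the level argument of aggregation methods such as sum has been deprecated and later removed; groupby on the index levels is the documented replacement. A minimal sketch of the equivalents (not from the original notes):

frame.groupby(level='key2').sum()          # replaces frame.sum(level='key2')
frame.T.groupby(level='color').sum().T     # replaces frame.sum(level='color', axis=1)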

1.3 Indexing with a DataFrame's Columns

frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
frame
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
frame2 = frame.set_index(['c','d'])
frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
frame.set_index(['c','d'],drop=False)
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
frame2.reset_index()
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1

2 Combining and Merging Datasets

2.1 Database-Style DataFrame Joins

df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
df2 = pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
df2
  key  data2
0   a      0
1   b      1
2   d      2
pd.merge(df1,df2)
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
pd.merge(df1,df2,on='key')
  key  data1  data2
0   b      0      1
1   b      1      1
2   b      6      1
3   a      2      0
4   a      4      0
5   a      5      0
df3 = pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df3
  lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    a      5
6    b      6
df4 = pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})
df4
  rkey  data2
0    a      0
1    b      1
2    d      2
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
  lkey  data1 rkey  data2
0    b      0    b      1
1    b      1    b      1
2    b      6    b      1
3    a      2    a      0
4    a      4    a      0
5    a      5    a      0
pd.merge(df1,df2,how='outer')
  key  data1  data2
0   b    0.0    1.0
1   b    1.0    1.0
2   b    6.0    1.0
3   a    2.0    0.0
4   a    4.0    0.0
5   a    5.0    0.0
6   c    3.0    NaN
7   d    NaN    2.0
df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df1
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
df2 = pd.DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
df2
  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4
pd.merge(df1,df2,on='key',how='left')
   key  data1  data2
0    b      0    1.0
1    b      0    3.0
2    b      1    1.0
3    b      1    3.0
4    a      2    0.0
5    a      2    2.0
6    c      3    NaN
7    a      4    0.0
8    a      4    2.0
9    b      5    1.0
10   b      5    3.0
pd.merge(df1,df2,how='inner')
  key  data1  data2
0   b      0      1
1   b      0      3
2   b      1      1
3   b      1      3
4   b      5      1
5   b      5      3
6   a      2      0
7   a      2      2
8   a      4      0
9   a      4      2
left = pd.DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
left
  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
right
  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
pd.merge(left,right,on='key1')
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7

2.2 Merging on Index

left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left1
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
right1 = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
right1
   group_val
a        3.5
b        7.0
pd.merge(left1,right1,left_on='key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN
lefth = pd.DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,2002,2001,2002],'data':np.arange(5.)})
lefth
     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0
righth = pd.DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
righth
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
     key1  key2  data  event1  event2
0    Ohio  2000   0.0       4       5
0    Ohio  2000   0.0       6       7
1    Ohio  2001   1.0       8       9
2    Ohio  2002   2.0      10      11
3  Nevada  2001   3.0       0       1
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
     key1  key2  data  event1  event2
0    Ohio  2000   0.0     4.0     5.0
0    Ohio  2000   0.0     6.0     7.0
1    Ohio  2001   1.0     8.0     9.0
2    Ohio  2002   2.0    10.0    11.0
3  Nevada  2001   3.0     0.0     1.0
4  Nevada  2002   4.0     NaN     NaN
4  Nevada  2000   NaN     2.0     3.0

Merging using the indexes of both sides

left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
left2
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
right2
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
left2.join(right2,how='outer')
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
left1.join(right1,on='key')
  key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN
another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],index=['a','c','e','f'],columns=['New York','Oregon'])
another
   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0
left2.join([right2,another])
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
left2.join([right2,another],how='outer')
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0
b   NaN     NaN       7.0      8.0       NaN     NaN
d   NaN     NaN      11.0     12.0       NaN     NaN
f   NaN     NaN       NaN      NaN      16.0    17.0

2.3 Concatenating Along an Axis

arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = pd.Series([0,1],index=['a','b'])
s1
a    0
b    1
dtype: int64
s2 = pd.Series([2,3,4],index=['c','d','e'])
s2
c    2
d    3
e    4
dtype: int64
s3 = pd.Series([5,6],index=['f','g'])
s3
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3],axis=1)  # concatenate along the columns
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1,s3])
s4
a    0
b    1
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
     0  1
a  0.0  0
b  1.0  1
f  NaN  5
g  NaN  6
pd.concat([s1,s4],axis=1,join='inner')
   0  1
a  0  0
b  1  1
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
         a    b    f    g
one    0.0  1.0  NaN  NaN
two    0.0  1.0  NaN  NaN
three  NaN  NaN  5.0  6.0
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df1
   one  two
a    0    1
b    2    3
c    4    5
df2 = pd.DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
df2
   three  four
a      5     6
c      7     8
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])
upper level1     level2
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0
df1 = pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df1
          a         b         c         d
0  0.030306 -0.665932 -0.180251 -0.562281
1 -0.562836 -1.362955 -1.928334 -0.991793
2 -1.382119  0.695608  0.638588  0.812288
df2 = pd.DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df2
          b         d         a
0  0.323228 -1.229234  2.685688
1 -1.968068 -1.996343 -1.134667
pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0  0.030306 -0.665932 -0.180251 -0.562281
1 -0.562836 -1.362955 -1.928334 -0.991793
2 -1.382119  0.695608  0.638588  0.812288
3  2.685688  0.323228       NaN -1.229234
4 -1.134667 -1.968068       NaN -1.996343

2.4 Combining Data with Overlap

a = pd.Series([np.nan,2.5,0.0,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
a
f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64
b = pd.Series([0.,np.nan,2.,np.nan,np.nan,5.],index=['a','b','c','d','e','f'])
b
a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
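
Note that np.where works positionally on the underlying arrays and does not align the two Series by index label; combine_first, shown next, aligns by label before filling in missing values. A label-aligned sketch of the same patching logic (the union variable is only illustrative):

union = a.index.union(b.index)
b.reindex(union).where(b.reindex(union).notnull(), a.reindex(union))
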
b.combine_first(a)
a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64
df1 = pd.DataFrame({'a':[1.,np.nan,5.,np.nan],'b':[np.nan,2.,np.nan,6.],'c':range(2,18,4)})
df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
df2 = pd.DataFrame({'a':[5.,4.,np.nan,3.,7.],'b':[np.nan,3.,4.,6.,8.]})
df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0
df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN

3 Reshaping and Pivoting

3.1 Reshaping with Hierarchical Indexing

data = pd.DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5

stack pivots the columns into the rows

result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
result.unstack(0)
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
result.unstack('state')
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
s1 = pd.Series([0,1,2,3],index=['a','b','c','d'])
s1
a    0
b    1
c    2
d    3
dtype: int64
s2 = pd.Series([4,5,6],index=['c','d','e'])
s2
c    4
d    5
e    6
dtype: int64
data2 = pd.concat([s1,s2],keys=['one','two'])
data2
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
data2.unstack()
abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
df = pd.DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df
side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
side   left          right
state  Ohio Colorado  Ohio Colorado
number
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10
df.unstack('state').stack('side')
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7

3.2 Pivoting "Long" to "Wide" Format

data = pd.read_csv('examples/macrodata.csv')
data.head()
   year  quarter   realgdp  realcons  realinv  realgovt  realdpi    cpi     m1  tbilrate  unemp      pop  infl  realint
0  1959        1  2710.349    1707.4  286.898   470.045   1886.9  28.98  139.7      2.82    5.8  177.146  0.00     0.00
1  1959        2  2778.801    1733.7  310.859   481.301   1919.7  29.15  141.7      3.08    5.1  177.830  2.34     0.74
2  1959        3  2775.488    1751.8  289.226   491.260   1916.4  29.35  140.5      3.82    5.3  178.657  2.74     1.09
3  1959        4  2785.204    1753.7  299.356   484.052   1931.3  29.37  140.0      4.33    5.6  179.386  0.27     4.06
4  1960        1  2847.699    1770.5  331.722   462.199   1955.5  29.54  139.6      3.50    5.2  180.007  2.31     1.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
columns = pd.Index(['realgdp','infl','unemp'],name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D','end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]
                            date     item     value
0  1959-03-31 23:59:59.999999999  realgdp  2710.349
1  1959-03-31 23:59:59.999999999     infl     0.000
2  1959-03-31 23:59:59.999999999    unemp     5.800
3  1959-06-30 23:59:59.999999999  realgdp  2778.801
4  1959-06-30 23:59:59.999999999     infl     2.340
5  1959-06-30 23:59:59.999999999    unemp     5.100
6  1959-09-30 23:59:59.999999999  realgdp  2775.488
7  1959-09-30 23:59:59.999999999     infl     2.740
8  1959-09-30 23:59:59.999999999    unemp     5.300
9  1959-12-31 23:59:59.999999999  realgdp  2785.204
pivoted = ldata.pivot('date','item','value')
pivoted
item                           infl    realgdp  unemp
date
1959-03-31 23:59:59.999999999  0.00   2710.349    5.8
1959-06-30 23:59:59.999999999  2.34   2778.801    5.1
1959-09-30 23:59:59.999999999  2.74   2775.488    5.3
1959-12-31 23:59:59.999999999  0.27   2785.204    5.6
1960-03-31 23:59:59.999999999  2.31   2847.699    5.2
...                             ...        ...    ...
2008-09-30 23:59:59.999999999 -3.16  13324.600    6.0
2008-12-31 23:59:59.999999999 -8.79  13141.920    6.9
2009-03-31 23:59:59.999999999  0.94  12925.410    8.1
2009-06-30 23:59:59.999999999  3.37  12901.504    9.2
2009-09-30 23:59:59.999999999  3.56  12990.341    9.6

[203 rows x 3 columns]
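
Note: the call above uses pivot's older positional signature; in pandas 2.x the index, columns and values arguments are keyword-only, so the form below is the safer spelling (a sketch equivalent to ldata.pivot('date','item','value'); the same applies to the later pivot calls):

ldata.pivot(index='date', columns='item', values='value')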

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
                            date     item     value    value2
0  1959-03-31 23:59:59.999999999  realgdp  2710.349  1.417751
1  1959-03-31 23:59:59.999999999     infl     0.000 -0.843318
2  1959-03-31 23:59:59.999999999    unemp     5.800  0.752850
3  1959-06-30 23:59:59.999999999  realgdp  2778.801  1.481983
4  1959-06-30 23:59:59.999999999     infl     2.340  0.912075
5  1959-06-30 23:59:59.999999999    unemp     5.100 -0.530813
6  1959-09-30 23:59:59.999999999  realgdp  2775.488  1.443973
7  1959-09-30 23:59:59.999999999     infl     2.740  1.977522
8  1959-09-30 23:59:59.999999999    unemp     5.300  0.621740
9  1959-12-31 23:59:59.999999999  realgdp  2785.204 -0.379998
pivoted = ldata.pivot('date','item')
pivoted[:5]
                              value                     value2
item                           infl    realgdp unemp      infl   realgdp     unemp
date
1959-03-31 23:59:59.999999999  0.00   2710.349   5.8 -0.843318  1.417751  0.752850
1959-06-30 23:59:59.999999999  2.34   2778.801   5.1  0.912075  1.481983 -0.530813
1959-09-30 23:59:59.999999999  2.74   2775.488   5.3  1.977522  1.443973  0.621740
1959-12-31 23:59:59.999999999  0.27   2785.204   5.6  1.356540 -0.379998 -0.738881
1960-03-31 23:59:59.999999999  2.31   2847.699   5.2  1.273626 -1.512028 -0.234333
pivoted['value'][:5]
item                           infl   realgdp  unemp
date
1959-03-31 23:59:59.999999999  0.00  2710.349    5.8
1959-06-30 23:59:59.999999999  2.34  2778.801    5.1
1959-09-30 23:59:59.999999999  2.74  2775.488    5.3
1959-12-31 23:59:59.999999999  0.27  2785.204    5.6
1960-03-31 23:59:59.999999999  2.31  2847.699    5.2
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked[:7]
                              value                     value2
item                           infl    realgdp unemp      infl   realgdp     unemp
date
1959-03-31 23:59:59.999999999  0.00   2710.349   5.8 -0.843318  1.417751  0.752850
1959-06-30 23:59:59.999999999  2.34   2778.801   5.1  0.912075  1.481983 -0.530813
1959-09-30 23:59:59.999999999  2.74   2775.488   5.3  1.977522  1.443973  0.621740
1959-12-31 23:59:59.999999999  0.27   2785.204   5.6  1.356540 -0.379998 -0.738881
1960-03-31 23:59:59.999999999  2.31   2847.699   5.2  1.273626 -1.512028 -0.234333
1960-06-30 23:59:59.999999999  0.14   2834.390   5.2  1.304159 -0.886630 -0.178589
1960-09-30 23:59:59.999999999  2.70   2839.022   5.6  0.120983 -0.193590  0.747989

3.3 Pivoting "Wide" to "Long" Format

df = pd.DataFrame({'key':['foo','bar','baz'],'A':[1,2,3],'B':[4,5,6],'C':[7,8,9]})
df
   key  A  B  C
0  foo  1  4  7
1  bar  2  5  8
2  baz  3  6  9
melted = pd.melt(df,['key'])
melted
   key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
6  foo        C      7
7  bar        C      8
8  baz        C      9
reshaped = melted.pivot('key','variable','value')
reshaped
variable  A  B  C
key
bar       2  5  8
baz       3  6  9
foo       1  4  7
reshaped.reset_index()
variable  key  A  B  C
0         bar  2  5  8
1         baz  3  6  9
2         foo  1  4  7
pd.melt(df,value_vars=['A','B','C'])
  variable  value
0        A      1
1        A      2
2        A      3
3        B      4
4        B      5
5        B      6
6        C      7
7        C      8
8        C      9
pd.melt(df,value_vars=['key','A','B'])
  variable value
0      key   foo
1      key   bar
2      key   baz
3        A     1
4        A     2
5        A     3
6        B     4
7        B     5
8        B     6