# Chapter 8 数据规整：聚合、合并和重塑

## 8.1 层次化索引

In [3]: data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]] )

In [4]: data
Out[4]:
a  1    0.519274
2    1.593852
3   -0.627557
b  1   -0.022029
3   -0.684012
c  1    0.101001
2    0.146661
d  2    0.013020
3    1.687800
dtype: float64


In [5]: data.index
Out[5]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1,
2]])



In [7]: data['c']
Out[7]:
1    0.101001
2    0.146661
dtype: float64

In [8]: data['c':'a']
Out[8]: Series([], dtype: float64)

In [9]: data['b':'d']
Out[9]:
b  1   -0.022029
3   -0.684012
c  1    0.101001
2    0.146661
d  2    0.013020
3    1.687800
dtype: float64

In [10]: data.loc[['b', 'c']]
Out[10]:
b  1   -0.022029
3   -0.684012
c  1    0.101001
2    0.146661
dtype: float64

In [11]: data.loc[['a', 'd']]
Out[11]:
a  1    0.519274
2    1.593852
3   -0.627557
d  2    0.013020
3    1.687800
dtype: float64



In [12]: data[:, 2]
Out[12]:
a    1.593852
c    0.146661
d    0.013020
dtype: float64

In [13]: data['a',2]
Out[13]: 1.5938517892256658



In [14]: data.unstack()
Out[14]:
1         2         3
a  0.519274  1.593852 -0.627557
b -0.022029       NaN -0.684012
c  0.101001  0.146661       NaN
d       NaN  0.013020  1.687800



unstack 的逆运算是stack：

In [15]: data.unstack().stack()
Out[15]:
a  1    0.519274
2    1.593852
3   -0.627557
b  1   -0.022029
3   -0.684012
c  1    0.101001
2    0.146661
d  2    0.013020
3    1.687800
dtype: float64

In [16]: data.unstack().unstack()
Out[16]:
1  a    0.519274
b   -0.022029
c    0.101001
d         NaN
2  a    1.593852
b         NaN
c    0.146661
d    0.013020
3  a   -0.627557
b   -0.684012
c         NaN
d    1.687800
dtype: float64

In [17]: data.unstack().unstack().unstack()
Out[17]:
a         b         c        d
1  0.519274 -0.022029  0.101001      NaN
2  1.593852       NaN  0.146661  0.01302
3 -0.627557 -0.684012       NaN  1.68780



stack 和 unstack 将在本章后面详细讲解。

In [21]: frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
...:    ....:                      index=[['a', 'a', 'b', 'b'], [1,
...:  2, 1, 2]],
...:    ....:                      columns=[['Ohio', 'Ohio', 'Color
...:    ....:                               ['Green', 'Red', 'Green
...: ']])
...:

In [22]: frame
Out[22]:
Green Red    Green
a 1     0   1        2
2     3   4        5
b 1     6   7        8
2     9  10       11

# 第一列 a, b 是索引 ,第二列  1，2，1,2 也是索引  后三列是数据
# 列名 和行名一样， ohio 上面显示的 代表 第一列 和第二列都是 Ohio ,第三列是 Colorado

In [23]: np.arange(12).reshape((4,3))
Out[23]:
array([[ 0,  1,  2],
[ 3,  4,  5],
[ 6,  7,  8],
[ 9, 10, 11]])

In [24]: frame.index
Out[24]:
MultiIndex(levels=[['a', 'b'], [1, 2]],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]])



In [28]: frame.index.names = ['key1', 'key2']

In [29]: frame.columns.names = ['state', 'color']

In [30]: frame
Out[30]:
color     Green Red    Green
key1 key2
a    1        0   1        2
2        3   4        5
b    1        6   7        8
2        9  10       11

In [31]: frame.unstack()
Out[31]:
color Green    Red        Green
key2      1  2   1   2        1   2
key1
a         0  3   1   4        2   5
b         6  9   7  10        8  11

In [32]: frame.unstack().stack()
Out[32]:
color        Green Green Red
key1 key2
a    1           2     0   1
2           5     3   4
b    1           8     6   7
2          11     9  10



In [36]: frame['Ohio']
Out[36]:
color      Green  Red
key1 key2
a    1         0    1
2         3    4
b    1         6    7
2         9   10

In [37]: frame['Ohio', 'Green']
Out[37]:
key1  key2
a     1       0
2       3
b     1       6
2       9
Name: (Ohio, Green), dtype: int32




In [41]: pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Gre
...: en', 'Red', 'Green']], names=['state','color'])
Out[41]:
labels=[[1, 1, 0], [0, 1, 0]],
names=['state', 'color'])



## 重排与分级排序

In [42]: frame.swaplevel('key1','key2')
Out[42]:
color     Green Red    Green
key2 key1
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11


In [45]: frame.sort_index(level=1)
Out[45]:
color     Green Red    Green
key1 key2
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11

In [46]: frame.swaplevel(0,1)
Out[46]:
color     Green Red    Green
key2 key1
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11

In [47]: frame.swaplevel(0,1).sort_index(level=0)
Out[47]:
color     Green Red    Green
key2 key1
1    a        0   1        2
b        6   7        8
2    a        3   4        5
b        9  10       11



## 根据级别汇总统计

In [55]: frame.sum(level='key2')
Out[55]:
color Green Red    Green
key2
1         6   8       10
2        12  14       16

In [56]: frame.sum(level='color', axis=1)
Out[56]:
color      Green  Red
key1 key2
a    1         2    1
2         8    4
b    1        14    7
2        20   10



## 使用 DataFrame 的列进行索引

In [57]: frame = pd.DataFrame({'a':range(7), 'b':range(7,0,-1),'c':['on
...: e', 'one', 'one', 'two', 'two',
...:    ....:                             'two', 'two'],
...:    ....:                       'd': [0, 1, 2, 0, 1, 2, 3]})
...:

In [58]: frame
Out[58]:
a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3



DataFrame 的 set_index 函数会将其一个或多个列转换为行索引，并创建一个新的 DataFrame ：

In [60]: frame2 = frame.set_index(['c', 'd'])

In [61]: frame2
Out[61]:
a  b
c   d
one 0  0  7
1  1  6
2  2  5
two 0  3  4
1  4  3
2  5  2
3  6  1

In [62]: frame2.unstack()
Out[62]:
a                   b
d      0    1    2    3    0    1    2    3
c
one  0.0  1.0  2.0  NaN  7.0  6.0  5.0  NaN
two  3.0  4.0  5.0  6.0  4.0  3.0  2.0  1.0



In [63]: frame.set_index(['c', 'd'], drop=False)
Out[63]:
a  b    c  d
c   d
one 0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
two 0  3  4  two  0
1  4  3  two  1
2  5  2  two  2
3  6  1  two  3


reset_index 的功能跟 set_index 刚好相反，层次化索引的级别会被转移到列里面：

In [64]: frame2
Out[64]:
a  b
c   d
one 0  0  7
1  1  6
2  2  5
two 0  3  4
1  4  3
2  5  2
3  6  1

In [65]: frame2.reset_index()
Out[65]:
c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1



## 8.2 合并数据集

pandas 对象中的数据可以通过一些方式进行合并：

• pandas.merge 可根据一个或多个键将不同 DataFrame 中的行连接起来。SQL 或其他关系型数据库的用户对此应该会比较熟悉，因为它实现的就是数据库的 join 操作。
• pandas.concat 可以沿着一条轴将多个对象堆叠到一起。
• 实例方法 combine_first 可以将重复数据编接在一起，用一个对象中的值填充另一个对象中的缺失值。

## 数据库风格的 DataFrame 合并

In [35]: df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
....:                     'data1': range(7)})

In [36]: df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
....:                     'data2': range(3)})

In [37]: df1
Out[37]:
data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b

In [38]: df2
Out[38]:
data2 key
0      0   a
1      1   b
2      2   d



In [70]: pd.merge(df1, df2)
Out[70]:
data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0

# 第 一列的索引， 先不用管，然后以 key 列为准，b,b,b,a,a,a 的顺序 调整其他列的顺序


In [71]: pd.merge(df1, df2, on='key')
Out[71]:
data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0



In [72]: df3 = pd.DataFrame({'lkey':['b', 'b', 'a', 'c', 'a', 'a', 'b']
...: ,'data1':range(7)})

In [73]: df4 = pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})

In [74]: pd.merge(df3, df4, left_on='lkey', right_on='rkey')
Out[74]:
data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a

In [75]: df3
Out[75]:
data1 lkey
0      0    b
1      1    b
2      2    a
3      3    c
4      4    a
5      5    a
6      6    b

In [76]: df4
Out[76]:
data2 rkey
0      0    a
1      1    b
2      2    d


• 可能你已经注意到了，结果里面 c 和 d 以及与之相关的数据消失了。默认情况下，merge 做的是“内连接”；结果中的键是交集

• 其他方式还有”left”、”right”以及”outer”。外连接求取的是键的并集，组合了左连接和右连接的效果：

In [77]: pd.merge(df1, df2, how='outer')
Out[77]:
data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0



In [78]: df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'dat
...: a1':range(6)})

In [79]: df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],'data2':r
...: ange(5)})

In [80]: df1
Out[80]:
data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b

In [81]: df2
Out[81]:
data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d

In [82]: pd.merge(df1, df2, on='key', how='left')
Out[82]:
data1 key  data2
0       0   b    1.0
1       0   b    3.0
2       1   b    1.0
3       1   b    3.0
4       2   a    0.0
5       2   a    2.0
6       3   c    NaN
7       4   a    0.0
8       4   a    2.0
9       5   b    1.0
10      5   b    3.0



data1  key
0   b
1   b

data2  key
1   b
3   b

# 笛卡尔积 2 *2 =4

data1 key  data2
0       0   b    1.0
1       0   b    3.0
2       1   b    1.0
3       1   b    3.0


In [83]: pd.merge(df1, df2, how='inner')
Out[83]:
data1 key  data2
0      0   b      1
1      0   b      3
2      1   b      1
3      1   b      3
4      5   b      1
5      5   b      3
6      2   a      0
7      2   a      2
8      4   a      0
9      4   a      2



In [51]: left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
....:                      'key2': ['one', 'two', 'one'],
....:                      'lval': [1, 2, 3]})

In [52]: right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
....:                       'key2': ['one', 'one', 'one', 'two'],
....:                       'rval': [4, 5, 6, 7]})

In [88]: left
Out[88]:
key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3

In [89]: right
Out[89]:
key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7

In [53]: pd.merge(left, right, on=['key1', 'key2'], how='outer')
Out[53]:
key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0



In [91]: pd.merge(left, right, on='key1')
Out[91]:
key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7

In [92]: left
Out[92]:
key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3

In [93]: right
Out[93]:
key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7

# key2 的列明是重复的，默认合并后在末尾添加的是 _x ,_y

# 指定添加 后缀

In [94]: pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
...:
Out[94]:
key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7



merge 的参数请参见表 8-2。使用 DataFrame 的行索引合并是下一节的主题。

indicator 添加特殊的列 _merge，它可以指明每个行的来源，它的值有 left_only、right_only 或 both，根据每行的合并数据的来源。

## 索引上的合并

In [95]: left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],'v
...: alue':range(6)})

In [96]: right1 = pd.DataFrame({'group_val':[3.5, 7]}, index=['a', 'b']
...: )

In [97]: left1
Out[97]:
key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5

In [98]: right1
Out[98]:
group_val
a        3.5
b        7.0

In [99]: pd.merge(left1, right1, left_on='key', right_index=True)
Out[99]:
key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0



In [101]: pd.merge(left1, right1, left_on='key', right_index=True, how=
...: 'outer')
Out[101]:
key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
5   c      5        NaN


In [103]: lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
...:    ....:                       'key2': [2000, 2001, 2002, 200
...: 1, 2002],
...:    ....:                       'data': np.arange(5.)})
...:

In [104]: righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
...: hio', 'Ohio',
...:    ....:                               'Ohio', 'Ohio'],
...:    ....:                              [2001, 2000, 2000, 2000
...: , 2001, 2002]],
...:    ....:                       columns=['event1', 'event2'])
...:
...:

In [105]:

In [105]: lefth
Out[105]:
data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002

In [106]: righth
Out[106]:
event1  event2
2000       2       3
Ohio   2000       4       5
2000       6       7
2001       8       9
2002      10      11



In [107]: pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index
...: =True)
Out[107]:
data    key1  key2  event1  event2
0   0.0    Ohio  2000       4       5
0   0.0    Ohio  2000       6       7
1   1.0    Ohio  2001       8       9
2   2.0    Ohio  2002      10      11
3   3.0  Nevada  2001       0       1

In [108]: pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index
...: =True, how='outer')
Out[108]:
data    key1  key2  event1  event2
0   0.0    Ohio  2000     4.0     5.0
0   0.0    Ohio  2000     6.0     7.0
1   1.0    Ohio  2001     8.0     9.0
2   2.0    Ohio  2002    10.0    11.0
3   3.0  Nevada  2001     0.0     1.0
4   4.0  Nevada  2002     NaN     NaN
4   NaN  Nevada  2000     2.0     3.0



In [109]: left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index =
...: ['a', 'c', 'e'], columns=['Ohio', 'Nevada'])

In [110]: right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 1
...: 4]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabam
...: a'])

In [111]: left2
Out[111]:
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0

In [113]: right2
Out[113]:
Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0

In [115]: pd.merge(left2, right2, how='outer', left_index=True, right_i
...: ndex=True)
Out[115]:
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0



DataFrame 还有一个便捷的 join 实例方法，它能更为方便地实现按索引合并。它还可用于合并多个带有相同或相似索引的 DataFrame 对象，但要求没有重叠的列。在上面那个例子中，我们可以编写：

In [116]: left2.join(right2, how='outer')
Out[116]:
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0



In [117]: left1.join(right1, on='key')
Out[117]:
key  value  group_val
0   a      0        3.5
1   b      1        7.0
2   a      2        3.5
3   a      3        3.5
4   b      4        7.0
5   c      5        NaN



In [119]: another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16.
...: , 17.]],
...:    ....:                        index=['a', 'c', 'e', 'f'],
...:    ....:                        columns=['New York',
...: 'Oregon'])
...:

In [120]: another
Out[120]:
New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0

In [121]: left2.join([right2, another])
Out[121]:
Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
c   3.0     4.0       9.0     10.0       9.0    10.0
e   5.0     6.0      13.0     14.0      11.0    12.0

In [122]: left2.join([right2, another], how='outer')
Out[122]:
Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
b   NaN     NaN       7.0      8.0       NaN     NaN
c   3.0     4.0       9.0     10.0       9.0    10.0
d   NaN     NaN      11.0     12.0       NaN     NaN
e   5.0     6.0      13.0     14.0      11.0    12.0
f   NaN     NaN       NaN      NaN      16.0    17.0



## 轴向连接

In [123]: arr = np.arange(12).reshape((3,4))

In [124]: arr
Out[124]:
array([[ 0,  1,  2,  3],
[ 4,  5,  6,  7],
[ 8,  9, 10, 11]])

In [125]: np.concatenate([arr,arr], axis=1)
Out[125]:
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
[ 4,  5,  6,  7,  4,  5,  6,  7],
[ 8,  9, 10, 11,  8,  9, 10, 11]])



• 如果对象在其它轴上的索引不同，我们应该合并这些轴的不同元素还是只使用交集？
• 连接的数据集是否需要在结果对象中可识别？
• 连接轴中保存的数据是否需要保留？许多情况下，DataFrame 默认的整数标签最好在连接时删掉。

pandas 的 concat 函数提供了一种能够解决这些问题的可靠方式。我将给出一些例子来讲解其使用方式。假设有三个没有重叠索引的 Series：

In [126]: s1 = pd.Series([0, 1], index=['a', 'b'])

In [127]: s2 = pd.Series([2,3,4], index=['c', 'd', 'e'])

In [128]: s3 = pd.Series([5, 6], index=['f', 'g'])



In [129]: s1
Out[129]:
a    0
b    1
dtype: int64

In [130]: s2
Out[130]:
c    2
d    3
e    4
dtype: int64

In [131]: s3
Out[131]:
f    5
g    6
dtype: int64

In [132]: pd.concat([s1, s2, s3])
Out[132]:
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64



In [133]: pd.concat([s1, s2, s3], axis=1)
Out[133]:
0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0



In [134]: s4 = pd.concat([s1, s3])

In [135]: s4
Out[135]:
a    0
b    1
f    5
g    6
dtype: int64

In [136]: pd.concat([s3, s4], axis=1)
Out[136]:
0  1
a  NaN  0
b  NaN  1
f  5.0  5
g  6.0  6

# 第一列 是 s3 的数据，第二列 是 s4 的数据

In [137]: s3
Out[137]:
f    5
g    6
dtype: int64

# s1 s4 的交集

In [138]: pd.concat([s1, s4], axis=1, join='inner')
Out[138]:
0  1
a  0  0
b  1  1

In [139]: s1
Out[139]:
a    0
b    1
dtype: int64



In [140]: pd.concat([s1, s4], axis=1, join_axes=[['a','c','b', 'e']])
Out[140]:
0    1
a  0.0  0.0
c  NaN  NaN
b  1.0  1.0
e  NaN  NaN



In [142]: result = pd.concat([s1, s2, s3], keys=['one', 'two','three'])
...:

In [143]: result
Out[143]:
one    a    0
b    1
two    c    2
d    3
e    4
three  f    5
g    6
dtype: int64

In [144]: result.unstack()
Out[144]:
a    b    c    d    e     f    g
one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
three  NaN  NaN  NaN  NaN  NaN  5.0  6.0



In [152]: pd.concat([s1, s2, s3], axis=1, keys=['one','two', 'three'])
Out[152]:
one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0




In [145]: df1 = pd.DataFrame(np.arange(6).reshape(3,2), index=['a', 'b'
...: , 'c'],columns=['one', 'two'])

In [147]: df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2), index=['a',
...: 'c'], columns=['three', 'four'])

In [148]: df1
Out[148]:
one  two
a    0    1
b    2    3
c    4    5

In [149]: df2
Out[149]:
three  four
a      5     6
c      7     8

In [150]: pd.concat([df1, df2], axis=1, keys=['level1','level2'])
Out[150]:
level1     level2
one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0


In [153]: pd.concat({'level':df1, 'level2': df2}, axis=1)
Out[153]:
level     level2
one two  three four
a     0   1    5.0  6.0
b     2   3    NaN  NaN
c     4   5    7.0  8.0



In [154]: pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],names
...: =['upper', 'lower'])
Out[154]:
upper level1     level2
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0



In [155]:  df1 = pd.DataFrame(np.random.randn(3,4), columns=['a','b','c
...: ','d'])

In [156]: df2 = pd.DataFrame(np.random.randn(2,3), columns=['b', 'd', '
...: a'])

In [157]: df1
Out[157]:
a         b         c         d
0  0.830983 -1.507726  0.522686 -0.640847
1 -0.647306  0.028235  0.358099 -1.179401
2 -1.321904  0.986079  0.486219  0.061789

In [158]: df2
Out[158]:
b         d         a
0  0.071584  0.172593  0.849266
1  0.677618  0.680149  0.825373



In [159]: pd.concat([df1, df2], ignore_index=True)
Out[159]:
a         b         c         d
0  0.830983 -1.507726  0.522686 -0.640847
1 -0.647306  0.028235  0.358099 -1.179401
2 -1.321904  0.986079  0.486219  0.061789
3  0.849266  0.071584       NaN  0.172593
4  0.825373  0.677618       NaN  0.680149


## 合并重叠数据

In [4]: a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['
...: f', 'e', 'd', 'c', 'b', 'a'])

In [6]: b = pd.Series(np.arange(len(a), dtype=np.float64), index = ['f'
...: , 'e', 'd', 'c', 'b', 'a'])

In [7]: a
Out[7]:
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [8]: b
Out[8]:
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [9]: b[-1]
Out[9]: 5.0

In [10]: b[-1] = np.nan

In [13]: b
Out[13]:
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [11]: pd.isnull(a)
Out[11]:
f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool

In [12]: np.where(pd.isnull(a), b, a)
Out[12]: array([0. , 2.5, 2. , 3.5, 4.5, nan])

# 相当于 if -else 伪代码

if pd.isnull(a): # 返回的值为 True
b # 从 b 中提取值
else:
a # 从 a 中提取值


Series 有一个 combine_first 方法，实现的也是一样的功能，还带有 pandas 的数据对齐：

In [16]: b[:-2].combine_first(a[2:])
Out[16]:
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [17]: b[:-2]
Out[17]:
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [18]: a[2:]
Out[18]:
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64



In [20]: df1 = pd.DataFrame({'a':[1., np.nan, 5., np.nan],'b':[np.nan,
...: 2., np.nan, 6.], 'c':range(2, 18, 4)})

In [21]: df2 = pd.DataFrame({'a':[5., 4., np.nan, 3., 7.],'b':[np.nan,
...: 3., 4., 6., 8.]})

In [22]: df1
Out[22]:
a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14

In [23]: df2
Out[23]:
a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0

In [24]: df1.combine_first(df2)
Out[24]:
a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN


## 8.3 重塑和轴向旋转

### 重塑层次化索引

• stack：将数据的列“旋转”为行。
• unstack ：将数据的行“旋转”为列。

In [25]: data = pd.DataFrame(np.arange(6).reshape((2,3)), index = pd.In
...: ', 'two', 'three'],name='number'))

In [26]: data
Out[26]:
number    one  two  three
state
Ohio        0    1      2



In [27]: result = data.stack()

In [28]: result
Out[28]:
state     number
Ohio      one       0
two       1
three     2
two       4
three     5
dtype: int32




In [29]: result.unstack()
Out[29]:
number    one  two  three
state
Ohio        0    1      2


In [30]: result.unstack(0)
Out[30]:
number
one        0         3
two        1         4
three      2         5

In [31]: result.unstack('state')
Out[31]:
number
one        0         3
two        1         4
three      2         5



In [36]: s1 = pd.Series([0,1,2,3],index=['a', 'b', 'c', 'd'])

In [37]: s2= pd.Series([4,5,6], index=['c', 'd', 'e'])

In [38]: data2 = pd.concat([s1, s2], keys=['one', 'two'])

In [39]: data2
Out[39]:
one  a    0
b    1
c    2
d    3
two  c    4
d    5
e    6
dtype: int64

In [42]: data2.unstack()
Out[42]:
a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0

# s1 中的 index 有 a,b,c,d 但是没有 e ，而 s2 中有 e ，但是没有 a,b 所以各自之间连接后 有缺失值



stack 默认会滤除缺失数据，因此该运算是可逆的：

In [43]: data2.unstack()
Out[43]:
a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0

In [44]: data2.unstack().stack()
Out[44]:
one  a    0.0
b    1.0
c    2.0
d    3.0
two  c    4.0
d    5.0
e    6.0
dtype: float64

In [45]: data2.unstack().stack(dropna=False)
Out[45]:
one  a    0.0
b    1.0
c    2.0
d    3.0
e    NaN
two  a    NaN
b    NaN
c    4.0
d    5.0
e    6.0
dtype: float64



In [46]: df = pd.DataFrame({'left':result, 'right': result +5}, columns
...: =pd.Index(['left', 'right'], name='side'))

In [47]: df
Out[47]:
side             left  right
state    number
Ohio     one        0      5
two        1      6
three      2      7
two        4      9
three      5     10

In [48]: result
Out[48]:
state     number
Ohio      one       0
two       1
three     2
two       4
three     5
dtype: int32

In [49]: df.unstack('state')
Out[49]:
side   left          right
number
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10



In [51]: df
Out[51]:
side             left  right
state    number
Ohio     one        0      5
two        1      6
three      2      7
two        4      9
three      5     10

In [50]: df.unstack('state').stack('side')
Out[50]:
number side
one    left          3     0
right         8     5
two    left          4     1
right         9     6
three  left          5     2
right        10     7



## 将“长格式”旋转为“宽格式”

In [3]: data = pd.read_csv('examples/macrodata.csv')

Out[4]:
year  quarter   realgdp  realcons  realinv  realgovt  realdpi    cp
i  \
0  1959.0      1.0  2710.349    1707.4  286.898   470.045   1886.9  28.9
8
1  1959.0      2.0  2778.801    1733.7  310.859   481.301   1919.7  29.1
5
2  1959.0      3.0  2775.488    1751.8  289.226   491.260   1916.4  29.3
5
3  1959.0      4.0  2785.204    1753.7  299.356   484.052   1931.3  29.3
7
4  1960.0      1.0  2847.699    1770.5  331.722   462.199   1955.5  29.5
4

m1  tbilrate  unemp      pop  infl  realint
0  139.7      2.82    5.8  177.146  0.00     0.00
1  141.7      3.08    5.1  177.830  2.34     0.74
2  140.5      3.82    5.3  178.657  2.74     1.09
3  140.0      4.33    5.6  179.386  0.27     4.06
4  139.6      3.50    5.2  180.007  2.31     1.19

In [5]: periods = pd.PeriodIndex(year=data.year, quarter=data.quarter,
...: name='date')

In [6]: periods
Out[6]:
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',

'1960Q3', '1960Q4', '1961Q1', '1961Q2',
...
'2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',

'2008Q4', '2009Q1', '2009Q2', '2009Q3'],
dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC'
)

In [7]: columns = pd.Index(['realgdp', 'inf1', 'unemp'], name='otem')

In [8]: columns
Out[8]: Index(['realgdp', 'inf1', 'unemp'], dtype='object', name='otem')

In [11]: data = data.reindex(columns=columns)

In [12]: data
Out[12]:
otem    realgdp  inf1  unemp
0      2710.349   NaN    5.8
1      2778.801   NaN    5.1
2      2775.488   NaN    5.3
3      2785.204   NaN    5.6
4      2847.699   NaN    5.2

..          ...   ...    ...

195   13391.249   NaN    4.8
196   13366.865   NaN    4.9
197   13415.266   NaN    5.4
198   13324.600   NaN    6.0
199   13141.920   NaN    6.9
200   12925.410   NaN    8.1
201   12901.504   NaN    9.2
202   12990.341   NaN    9.6

[203 rows x 3 columns]

In [18]: data.index = periods.to_timestamp('D', 'end')
...:

In [19]: ldata = data.stack().reset_index().rename(columns={0:'value'})
...:

In [20]: ldata
Out[20]:
date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31    unemp      5.800
2   1959-06-30  realgdp   2778.801
3   1959-06-30    unemp      5.100
4   1959-09-30  realgdp   2775.488
5   1959-09-30    unemp      5.300

..         ...      ...        ...

401 2009-03-31    unemp      8.100
402 2009-06-30  realgdp  12901.504
403 2009-06-30    unemp      9.200
404 2009-09-30  realgdp  12990.341
405 2009-09-30    unemp      9.600

[406 rows x 3 columns]




In [20]: ldata
Out[20]:
date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31    unemp      5.800
2   1959-06-30  realgdp   2778.801
3   1959-06-30    unemp      5.100
4   1959-09-30  realgdp   2775.488
5   1959-09-30    unemp      5.300

..         ...      ...        ...

401 2009-03-31    unemp      8.100
402 2009-06-30  realgdp  12901.504
403 2009-06-30    unemp      9.200
404 2009-09-30  realgdp  12990.341
405 2009-09-30    unemp      9.600

In [21]: pivoted = ldata.pivot('date', 'item', 'value')

In [22]: pivoted
Out[22]:
item          realgdp  unemp
date
1959-03-31   2710.349    5.8
1959-06-30   2778.801    5.1
1959-09-30   2775.488    5.3
1959-12-31   2785.204    5.6
1960-03-31   2847.699    5.2
1960-06-30   2834.390    5.2
1960-09-30   2839.022    5.6
1960-12-31   2802.616    6.3
1961-03-31   2819.264    6.8

...               ...    ...

2007-06-30  13203.977    4.5
2007-09-30  13321.109    4.7
2007-12-31  13391.249    4.8
2008-03-31  13366.865    4.9
2008-06-30  13415.266    5.4
2008-09-30  13324.600    6.0
2008-12-31  13141.920    6.9
2009-03-31  12925.410    8.1
2009-06-30  12901.504    9.2
2009-09-30  12990.341    9.6

[203 rows x 2 columns]



In [23]: ldata['value2'] = np.random.randn(len(ldata))

In [24]: ldata[:10]
Out[24]:
date     item     value    value2
0 1959-03-31  realgdp  2710.349  1.692280
1 1959-03-31    unemp     5.800  0.171287
2 1959-06-30  realgdp  2778.801  0.987641
3 1959-06-30    unemp     5.100  0.978923
4 1959-09-30  realgdp  2775.488  0.005767
5 1959-09-30    unemp     5.300  1.177017
6 1959-12-31  realgdp  2785.204 -1.247027
7 1959-12-31    unemp     5.600 -0.276246
8 1960-03-31  realgdp  2847.699  1.331130
9 1960-03-31    unemp     5.200 -0.405204


In [25]: pivoted = ldata.pivot('date', 'item')

In [26]: pivoted[:5]
Out[26]:
value          value2
item         realgdp unemp   realgdp     unemp
date
1959-03-31  2710.349   5.8  1.692280  0.171287
1959-06-30  2778.801   5.1  0.987641  0.978923
1959-09-30  2775.488   5.3  0.005767  1.177017
1959-12-31  2785.204   5.6 -1.247027 -0.276246
1960-03-31  2847.699   5.2  1.331130 -0.405204

In [27]: pivoted['value'][:5]
Out[27]:
item         realgdp  unemp
date
1959-03-31  2710.349    5.8
1959-06-30  2778.801    5.1
1959-09-30  2775.488    5.3
1959-12-31  2785.204    5.6
1960-03-31  2847.699    5.2


In [28]: unstacked = ldata.set_index(['date', 'item']).unstack('item')

In [29]: unstacked[:7]
Out[29]:
value          value2
item         realgdp unemp   realgdp     unemp
date
1959-03-31  2710.349   5.8  1.692280  0.171287
1959-06-30  2778.801   5.1  0.987641  0.978923
1959-09-30  2775.488   5.3  0.005767  1.177017
1959-12-31  2785.204   5.6 -1.247027 -0.276246
1960-03-31  2847.699   5.2  1.331130 -0.405204
1960-06-30  2834.390   5.2  0.998744 -1.381110
1960-09-30  2839.022   5.6  0.022925  1.050719

In [30]: ldata.set_index(['date', 'item'])
Out[30]:
value    value2
date       item
1959-03-31 realgdp   2710.349  1.692280
unemp        5.800  0.171287
1959-06-30 realgdp   2778.801  0.987641
unemp        5.100  0.978923
1959-09-30 realgdp   2775.488  0.005767
unemp        5.300  1.177017
1959-12-31 realgdp   2785.204 -1.247027
unemp        5.600 -0.276246
1960-03-31 realgdp   2847.699  1.331130
unemp        5.200 -0.405204

...                       ...       ...

unemp        6.000  0.577148
2008-12-31 realgdp  13141.920 -1.034448
unemp        6.900  1.232211
2009-03-31 realgdp  12925.410 -1.340544
unemp        8.100  1.457841
2009-06-30 realgdp  12901.504  1.911584
unemp        9.200  0.509018
2009-09-30 realgdp  12990.341 -0.520676
unemp        9.600  1.106940

[406 rows x 2 columns]



## 将“宽格式”旋转为“长格式”

In [32]: df = pd.DataFrame({'key':['foo', 'bar', 'baz'],'A':[1, 2, 3],'
...: B':[4, 5, 6], 'C':[7, 8,9]})

In [33]: df
Out[33]:
A  B  C  key
0  1  4  7  foo
1  2  5  8  bar
2  3  6  9  baz



key 列可能是分组指标，其它的列是数据值。当使用 pandas.melt，我们必须指明哪些列是分组指标。下面使用 key 作为唯一的分组指标：

In [34]: df
Out[34]:
A  B  C  key
0  1  4  7  foo
1  2  5  8  bar
2  3  6  9  baz

In [35]: melted = pd.melt(df, ['key'])

In [36]: melted
Out[36]:
key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
6  foo        C      7
7  bar        C      8
8  baz        C      9



In [37]: reshaped = melted.pivot('key','variable', 'value')

In [38]: reshaped
Out[38]:
variable  A  B  C
key
bar       2  5  8
baz       3  6  9
foo       1  4  7

In [39]: melted
Out[39]:
key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6
6  foo        C      7
7  bar        C      8
8  baz        C      9



In [40]: reshaped.reset_index()
Out[40]:
variable  key  A  B  C
0         bar  2  5  8
1         baz  3  6  9
2         foo  1  4  7



In [41]: pd.melt(df, id_vars=['key'], value_vars=['A','B'])
Out[41]:
key variable  value
0  foo        A      1
1  bar        A      2
2  baz        A      3
3  foo        B      4
4  bar        B      5
5  baz        B      6



pandas .melt也可以不用分组指标：

In [42]: pd.melt(df, value_vars=['A','B','C'])
Out[42]:
variable  value
0        A      1
1        A      2
2        A      3
3        B      4
4        B      5
5        B      6
6        C      7
7        C      8
8        C      9

In [43]: pd.melt(df, value_vars=['key','A', 'B'])
Out[43]:
variable value
0      key   foo
1      key   bar
2      key   baz
3        A     1
4        A     2
5        A     3
6        B     4
7        B     5
8        B     6