层次化索引(hierarchical indexing)
是pandas的一项重要功能,它使你能在一个轴上拥有多个(两个以上)索引级别
from pandas import Series,DataFrame
import numpy as np
由一个列表或数组组成的列表作为索引
# Build a 10-element Series of random floats whose index has two levels:
# the first list supplies the outer (letter) labels, the second the inner (integer) labels.
data = Series(
    np.random.randn(10),
    index=[
        list('aaabbbccdd'),
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ],
)
data
a 1 0.663209
2 0.346579
3 -0.645685
b 1 -0.278141
2 -0.552655
3 -1.181881
c 1 1.083462
2 -0.976686
d 2 1.911491
3 0.905117
dtype: float64
data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
codes=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data['b']
1 -0.278141
2 -0.552655
3 -1.181881
dtype: float64
data['b':'c']
b 1 -0.278141
2 -0.552655
3 -1.181881
c 1 1.083462
2 -0.976686
dtype: float64
# Select the 'b' and 'd' groups of the outer index level by label.
# The .ix indexer was removed in pandas 1.0; .loc is the label-based replacement.
data.loc[['b','d']]
b 1 -0.278141
2 -0.552655
3 -1.181881
d 2 1.911491
3 0.905117
dtype: float64
data[:,2]
a 0.346579
b -0.552655
c -0.976686
d 1.911491
dtype: float64
通过 unstack() 方法,层次化索引的数据可以被重新安排到一个DataFrame中
相当于透视表生成
左侧的索引需要能唯一区分,否则报错
与 stack() 方法互为逆运算
data.unstack()
| 1 | 2 | 3 |
---|
a | 0.663209 | 0.346579 | -0.645685 |
---|
b | -0.278141 | -0.552655 | -1.181881 |
---|
c | 1.083462 | -0.976686 | NaN |
---|
d | NaN | 1.911491 | 0.905117 |
---|
data.unstack().stack()
a 1 0.663209
2 0.346579
3 -0.645685
b 1 -0.278141
2 -0.552655
3 -1.181881
c 1 1.083462
2 -0.976686
d 2 1.911491
3 0.905117
dtype: float64
DataFrame 每条轴都可以分层索引
# 4x3 frame holding 0..11, with a two-level index on both the rows and the columns.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)
frame
| | Ohio | Colorado |
---|
| | Green | Red | Green |
---|
a | 1 | 0 | 1 | 2 |
---|
2 | 3 | 4 | 5 |
---|
b | 1 | 6 | 7 | 8 |
---|
2 | 9 | 10 | 11 |
---|
各层 轴标签命名
# Give each level of the row index and of the column index a name,
# so that levels can later be referred to by name instead of by position.
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
| state | Ohio | Colorado |
---|
| color | Green | Red | Green |
---|
key1 | key2 | | | |
---|
a | 1 | 0 | 1 | 2 |
---|
2 | 3 | 4 | 5 |
---|
b | 1 | 6 | 7 | 8 |
---|
2 | 9 | 10 | 11 |
---|
frame['Ohio']
| color | Green | Red |
---|
key1 | key2 | | |
---|
a | 1 | 0 | 1 |
---|
2 | 3 | 4 |
---|
b | 1 | 6 | 7 |
---|
2 | 9 | 10 |
---|
重排分级顺序
swaplevel() 接受两个级别编号或名称,并返回一个互换了级别的新对象(但数据不会发生变化)
frame.swaplevel('key1','key2')
| state | Ohio | Colorado |
---|
| color | Green | Red | Green |
---|
key2 | key1 | | | |
---|
1 | a | 0 | 1 | 2 |
---|
2 | a | 3 | 4 | 5 |
---|
1 | b | 6 | 7 | 8 |
---|
2 | b | 9 | 10 | 11 |
---|
根据级别汇总统计
许多DataFrame和Series的描述和汇总统计都有一个level选项,它用于指定在某条轴上求和的级别(注意:该选项已在 pandas 2.0 中移除,新版本请改用 groupby(level=...) 再聚合)
# Rebuild the 4x3 example frame (values 0..11) with two-level row and column indexes,
# then name every level so the aggregations below can address levels by name.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)
frame.columns.names = ['state', 'color']
frame.index.names = ['key1', 'key2']
frame
| state | Ohio | Colorado |
---|
| color | Green | Red | Green |
---|
key1 | key2 | | | |
---|
a | 1 | 0 | 1 | 2 |
---|
2 | 3 | 4 | 5 |
---|
b | 1 | 6 | 7 | 8 |
---|
2 | 9 | 10 | 11 |
---|
# Sum the rows that share the same 'key2' label.
# The level= argument of sum() was removed in pandas 2.0; grouping by the
# index level and then summing gives the same result.
frame.groupby(level='key2').sum()
state | Ohio | Colorado |
---|
color | Green | Red | Green |
---|
key2 | | | |
---|
1 | 6 | 8 | 10 |
---|
2 | 12 | 14 | 16 |
---|
# Sum the columns that share the same 'color' label.
# sum(level=..., axis=1) was removed in pandas 2.0 and groupby(axis=1) is
# deprecated, so transpose, group the (now row) level, sum, and transpose back.
frame.T.groupby(level='color').sum().T
| color | Green | Red |
---|
key1 | key2 | | |
---|
a | 1 | 2 | 1 |
---|
2 | 8 | 4 |
---|
b | 1 | 14 | 7 |
---|
2 | 20 | 10 |
---|
set_index()将一个或多个列转换为行索引
默认情况下,那些列会从DataFrame中移除,但也可以通过传入 drop=False 将其保留下来
# Flat 7-row example frame; columns 'c' and 'd' are used below as index keys.
frame = DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
frame
| a | b | c | d |
---|
0 | 0 | 7 | one | 0 |
---|
1 | 1 | 6 | one | 1 |
---|
2 | 2 | 5 | one | 2 |
---|
3 | 3 | 4 | two | 0 |
---|
4 | 4 | 3 | two | 1 |
---|
5 | 5 | 2 | two | 2 |
---|
6 | 6 | 1 | two | 3 |
---|
frame.set_index(['c','d'])
| | a | b |
---|
c | d | | |
---|
one | 0 | 0 | 7 |
---|
1 | 1 | 6 |
---|
2 | 2 | 5 |
---|
two | 0 | 3 | 4 |
---|
1 | 4 | 3 |
---|
2 | 5 | 2 |
---|
3 | 6 | 1 |
---|
frame
| a | b | c | d |
---|
0 | 0 | 7 | one | 0 |
---|
1 | 1 | 6 | one | 1 |
---|
2 | 2 | 5 | one | 2 |
---|
3 | 3 | 4 | two | 0 |
---|
4 | 4 | 3 | two | 1 |
---|
5 | 5 | 2 | two | 2 |
---|
6 | 6 | 1 | two | 3 |
---|
frame.set_index(['c','d'],drop=False)
| | a | b | c | d |
---|
c | d | | | | |
---|
one | 0 | 0 | 7 | one | 0 |
---|
1 | 1 | 6 | one | 1 |
---|
2 | 2 | 5 | one | 2 |
---|
two | 0 | 3 | 4 | two | 0 |
---|
1 | 4 | 3 | two | 1 |
---|
2 | 5 | 2 | two | 2 |
---|
3 | 6 | 1 | two | 3 |
---|
reset_index() 层次化索引的级别会被移到列里
frame
| a | b | c | d |
---|
0 | 0 | 7 | one | 0 |
---|
1 | 1 | 6 | one | 1 |
---|
2 | 2 | 5 | one | 2 |
---|
3 | 3 | 4 | two | 0 |
---|
4 | 4 | 3 | two | 1 |
---|
5 | 5 | 2 | two | 2 |
---|
6 | 6 | 1 | two | 3 |
---|
frame2 = frame.set_index(['c','d'])
frame2
| | a | b |
---|
c | d | | |
---|
one | 0 | 0 | 7 |
---|
1 | 1 | 6 |
---|
2 | 2 | 5 |
---|
two | 0 | 3 | 4 |
---|
1 | 4 | 3 |
---|
2 | 5 | 2 |
---|
3 | 6 | 1 |
---|
frame2.reset_index()
| c | d | a | b |
---|
0 | one | 0 | 0 | 7 |
---|
1 | one | 1 | 1 | 6 |
---|
2 | one | 2 | 2 | 5 |
---|
3 | two | 0 | 3 | 4 |
---|
4 | two | 1 | 4 | 3 |
---|
5 | two | 2 | 5 | 2 |
---|
6 | two | 3 | 6 | 1 |
---|