## 8.1 层次化索引
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Series of nine random draws addressed by a two-level index:
# the outer level is a letter group, the inner level an integer label.
data = pd.Series(
    np.random.randn(9),
    index=[
        list('aaabbccdd'),
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data
a 1 -1.101088
2 0.238995
3 -1.005761
b 1 -0.490334
3 0.764787
c 1 0.944579
2 -1.152828
d 2 0.208618
3 0.802699
dtype: float64
data.index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
# Partial indexing: an outer-level label selects every inner-level entry under it.
data['b']
1 -0.490334
3 0.764787
dtype: float64
# Label slices on the outer level include both endpoints ('b' and 'c').
data['b':'c']
b 1 -0.490334
3 0.764787
c 1 0.944579
2 -1.152828
dtype: float64
# A list of outer-level labels picks several groups at once.
data.loc[['b','d']]
b 1 -0.490334
3 0.764787
d 2 0.208618
3 0.802699
dtype: float64
# Select on the inner level: every entry whose second-level label is 2.
data.loc[:,2]
a 0.238995
c -1.152828
d 0.208618
dtype: float64
# Pivot the inner index level into columns; missing (outer, inner) pairs become NaN.
data.unstack()
|   | 1 | 2 | 3 |
|---|---|---|---|
| a | -1.101088 | 0.238995 | -1.005761 |
| b | -0.490334 | NaN | 0.764787 |
| c | 0.944579 | -1.152828 | NaN |
| d | NaN | 0.208618 | 0.802699 |
# stack() reverses unstack(): the columns return to the inner index level
# and the NaN cells introduced by unstack() are dropped again.
data.unstack().stack()
a 1 -1.101088
2 0.238995
3 -1.005761
b 1 -0.490334
3 0.764787
c 1 0.944579
2 -1.152828
d 2 0.208618
3 0.802699
dtype: float64
# 4x3 integer frame with a two-level row index and two-level column index.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)
frame
|   |   | Ohio |   | Colorado |
|---|---|---|---|---|
|   |   | Green | Red | Green |
| a | 1 | 0 | 1 | 2 |
|   | 2 | 3 | 4 | 5 |
| b | 1 | 6 | 7 | 8 |
|   | 2 | 9 | 10 | 11 |
# Name both row-index levels and both column levels so later calls can
# address them by name instead of by position.
frame.index.set_names(['key1', 'key2'], inplace=True)
frame.columns.set_names(['state', 'color'], inplace=True)
frame
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key1 | key2 |   |   |   |
| a | 1 | 0 | 1 | 2 |
|   | 2 | 3 | 4 | 5 |
| b | 1 | 6 | 7 | 8 |
|   | 2 | 9 | 10 | 11 |
# With hierarchical columns, a top-level label selects the whole column group.
frame['Ohio']
|   | color | Green | Red |
|---|---|---|---|
| key1 | key2 |   |   |
| a | 1 | 0 | 1 |
|   | 2 | 3 | 4 |
| b | 1 | 6 | 7 |
|   | 2 | 9 | 10 |
### 重排与分级排序
# Swap the two row-index levels by name; row order and data are unchanged.
frame.swaplevel('key1','key2')
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key2 | key1 |   |   |   |
| 1 | a | 0 | 1 | 2 |
| 2 | a | 3 | 4 | 5 |
| 1 | b | 6 | 7 | 8 |
| 2 | b | 9 | 10 | 11 |
# Sort rows by the second index level (key2).
frame.sort_index(level=1)
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key1 | key2 |   |   |   |
| a | 1 | 0 | 1 | 2 |
| b | 1 | 6 | 7 | 8 |
| a | 2 | 3 | 4 | 5 |
| b | 2 | 9 | 10 | 11 |
# Sort rows by the first index level (key1).
frame.sort_index(level=0)
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key1 | key2 |   |   |   |
| a | 1 | 0 | 1 | 2 |
|   | 2 | 3 | 4 | 5 |
| b | 1 | 6 | 7 | 8 |
|   | 2 | 9 | 10 | 11 |
# Swap the two row-index levels, this time addressing them by position.
frame.swaplevel(0,1)
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key2 | key1 |   |   |   |
| 1 | a | 0 | 1 | 2 |
| 2 | a | 3 | 4 | 5 |
| 1 | b | 6 | 7 | 8 |
| 2 | b | 9 | 10 | 11 |
# After swapping, sort by the new outermost level (key2).
frame.swaplevel(0,1).sort_index(level=0)
|   | state | Ohio |   | Colorado |
|---|---|---|---|---|
|   | color | Green | Red | Green |
| key2 | key1 |   |   |   |
| 1 | a | 0 | 1 | 2 |
|   | b | 6 | 7 | 8 |
| 2 | a | 3 | 4 | 5 |
|   | b | 9 | 10 | 11 |
### 根据级别汇总统计
# Sum the rows that share the same 'key2' label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
frame.groupby(level='key2').sum()
| state | Ohio |   | Colorado |
|---|---|---|---|
| color | Green | Red | Green |
| key2 |   |   |   |
| 1 | 6 | 8 | 10 |
| 2 | 12 | 14 | 16 |
# Sum the rows that share the same 'key1' label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
frame.groupby(level='key1').sum()
| state | Ohio |   | Colorado |
|---|---|---|---|
| color | Green | Red | Green |
| key1 |   |   |   |
| a | 3 | 5 | 7 |
| b | 15 | 17 | 19 |
# Sum the columns that share the same 'color' label.
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0, and
# groupby(axis=1) is deprecated as well, so group the transposed frame on
# its 'color' level and transpose the result back.
frame.T.groupby(level='color').sum().T
|   | color | Green | Red |
|---|---|---|---|
| key1 | key2 |   |   |
| a | 1 | 2 | 1 |
|   | 2 | 8 | 4 |
| b | 1 | 14 | 7 |
|   | 2 | 20 | 10 |
### 使用 DataFrame 的列进行索引
# Flat frame whose 'c' and 'd' columns are later promoted to index levels.
frame = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
frame
|   | a | b | c | d |
|---|---|---|---|---|
| 0 | 0 | 7 | one | 0 |
| 1 | 1 | 6 | one | 1 |
| 2 | 2 | 5 | one | 2 |
| 3 | 3 | 4 | two | 0 |
| 4 | 4 | 3 | two | 1 |
| 5 | 5 | 2 | two | 2 |
| 6 | 6 | 1 | two | 3 |
# Build a new frame whose row index is formed from columns 'c' and 'd'.
frame2 = frame.set_index(keys=['c', 'd'])
frame2
|   |   | a | b |
|---|---|---|---|
| c | d |   |   |
| one | 0 | 0 | 7 |
|   | 1 | 1 | 6 |
|   | 2 | 2 | 5 |
| two | 0 | 3 | 4 |
|   | 1 | 4 | 3 |
|   | 2 | 5 | 2 |
|   | 3 | 6 | 1 |
# drop=False keeps 'c' and 'd' as ordinary columns as well as index levels.
frame.set_index(['c','d'],drop=False)
|   |   | a | b | c | d |
|---|---|---|---|---|---|
| c | d |   |   |   |   |
| one | 0 | 0 | 7 | one | 0 |
|   | 1 | 1 | 6 | one | 1 |
|   | 2 | 2 | 5 | one | 2 |
| two | 0 | 3 | 4 | two | 0 |
|   | 1 | 4 | 3 | two | 1 |
|   | 2 | 5 | 2 | two | 2 |
|   | 3 | 6 | 1 | two | 3 |
# reset_index() moves the index levels back into columns and restores a
# default integer index.
frame2.reset_index()
|   | c | d | a | b |
|---|---|---|---|---|
| 0 | one | 0 | 0 | 7 |
| 1 | one | 1 | 1 | 6 |
| 2 | one | 2 | 2 | 5 |
| 3 | two | 0 | 3 | 4 |
| 4 | two | 1 | 4 | 3 |
| 5 | two | 2 | 5 | 2 |
| 6 | two | 3 | 6 | 1 |
## 8.2 合并数据集
### 数据库风格的 DataFrame 合并
# Left table for the merge examples: 'key' holds repeated labels.
df1 = pd.DataFrame(
    {
        'key': list('bbacaab'),
        'data1': range(7),
    }
)
df1
|   | key | data1 |
|---|---|---|
| 0 | b | 0 |
| 1 | b | 1 |
| 2 | a | 2 |
| 3 | c | 3 |
| 4 | a | 4 |
| 5 | a | 5 |
| 6 | b | 6 |
# Right table for the merge examples: each 'key' label appears once.
df2 = pd.DataFrame(
    {
        'key': list('abd'),
        'data2': range(3),
    }
)
df2
|   | key | data2 |
|---|---|---|
| 0 | a | 0 |
| 1 | b | 1 |
| 2 | d | 2 |
`pandas.merge` 的用法参见:<https://zhuanlan.zhihu.com/p/102274476>
![表8-1](./表8-1.jpg)
![表8-2](./表8-2.jpg)
# Merge with default settings: no join key or join type is specified.
pd.merge(left=df1, right=df2)