import pandas as pd
import numpy as np
分层索引
# Nine random normal values labelled by a two-level (hierarchical) index:
# the first array holds the outer labels, the second the inner labels.
data = pd.Series(
    data=np.random.randn(9),
    index=pd.MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
            [1, 2, 3, 1, 3, 1, 2, 2, 3],
        ]
    ),
)
data
a 1 0.516036
2 0.212449
3 -0.361522
b 1 0.078206
3 -0.680907
c 1 0.118010
2 -0.087036
d 2 0.416130
3 0.101482
dtype: float64
# Inspect the index: the nested label lists were turned into a two-level
# MultiIndex of (outer, inner) tuples.
data.index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
# Partial indexing: slice on the outer level by label. Both endpoints
# ('b' and 'd') are included in the result.
data['b':'d']
b 1 0.078206
3 -0.680907
c 1 0.118010
2 -0.087036
d 2 0.416130
3 0.101482
dtype: float64
# Pick several outer-level labels at once by passing a list to .loc.
data.loc[['b','d']]
b 1 0.078206
3 -0.680907
d 2 0.416130
3 0.101482
dtype: float64
# Select on the inner level: keep every outer label, take inner label 3.
# Outer labels that have no inner label 3 are simply absent from the result.
data.loc[:,3]
a -0.361522
b -0.680907
d 0.101482
dtype: float64
# Pivot the inner index level into columns. (outer, inner) combinations that
# do not exist in the Series show up as NaN.
data.unstack()
| 1 | 2 | 3 |
---|
a | 0.516036 | 0.212449 | -0.361522 |
b | 0.078206 | NaN | -0.680907 |
c | 0.118010 | -0.087036 | NaN |
d | NaN | 0.416130 | 0.101482 |
# stack() reverses unstack(): the column labels become the inner index level
# again, giving back the original Series (compare the output with `data`).
data.unstack().stack()
a 1 0.516036
2 0.212449
3 -0.361522
b 1 0.078206
3 -0.680907
c 1 0.118010
2 -0.087036
d 2 0.416130
3 0.101482
dtype: float64
# 4x3 frame holding the integers 0..11, with two-level labels on BOTH axes:
# rows are (letter, number) pairs, columns are (upper-level, lower-level) pairs.
frame = pd.DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=pd.MultiIndex.from_arrays(
        [["a", "a", "b", "b"], [1, 2, 1, 2]]
    ),
    columns=pd.MultiIndex.from_arrays(
        [["A", "A", "B"], ["G", "R", "G"]]
    ),
)
frame
| | A | B |
---|
| | G | R | G |
---|
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Name the two row-index levels (modifies `frame` in place); the names are
# shown in their own header row when the frame is displayed.
frame.index.names = ['key1','key2']
frame
| | A | B |
---|
| | G | R | G |
---|
key1 | key2 | | | |
---|
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Name the two column levels as well (modifies `frame` in place).
frame.columns.names = ['S','C']
frame
| S | A | B |
---|
| C | G | R | G |
---|
key1 | key2 | | | |
---|
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
# Exchange the two row-index levels, referring to them by name. Only the
# level order changes; the rows themselves keep their original order.
frame.swaplevel('key1','key2')
| S | A | B |
---|
| C | G | R | G |
---|
key2 | key1 | | | |
---|
1 | a | 0 | 1 | 2 |
2 | a | 3 | 4 | 5 |
1 | b | 6 | 7 | 8 |
2 | b | 9 | 10 | 11 |
# Sort the rows by the index level at position 1 (key2) instead of the
# outermost level.
frame.sort_index(level=1)
| S | A | B |
---|
| C | G | R | G |
---|
key1 | key2 | | | |
---|
a | 1 | 0 | 1 | 2 |
b | 1 | 6 | 7 | 8 |
a | 2 | 3 | 4 | 5 |
b | 2 | 9 | 10 | 11 |
# Summary statistics per level.
# The `level=` keyword of DataFrame.sum was deprecated in pandas 1.3 and
# removed in 2.0, so group on the level explicitly instead.

# Sum the rows that share the same 'key2' label.
frame.groupby(level='key2').sum()

# Sum the columns that share the same 'C' label. Transposing turns the column
# level into an index level so a plain groupby can be used (groupby(axis=1) is
# itself deprecated); the final .T restores the original orientation.
frame.T.groupby(level='C').sum().T
| C | G | R |
---|
key1 | key2 | | |
---|
a | 1 | 2 | 1 |
2 | 8 | 4 |
b | 1 | 14 | 7 |
2 | 20 | 10 |
# Flat 7-row frame: 'a' counts up, 'b' counts down, and ('c', 'd') together
# identify each row, which makes them suitable for building a two-level index.
frame = pd.DataFrame(
    data={
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [*range(3), *range(4)],
    }
)
frame
| a | b | c | d |
---|
0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 |
2 | 2 | 5 | one | 2 |
3 | 3 | 4 | two | 0 |
4 | 4 | 3 | two | 1 |
5 | 5 | 2 | two | 2 |
6 | 6 | 1 | two | 3 |
# Build a two-level row index from columns 'c' and 'd'. The result is a new
# frame in which those two columns no longer appear as regular columns.
frame2 = frame.set_index(['c','d'])
frame2
| | a | b |
---|
c | d | | |
---|
one | 0 | 0 | 7 |
1 | 1 | 6 |
2 | 2 | 5 |
two | 0 | 3 | 4 |
1 | 4 | 3 |
2 | 5 | 2 |
3 | 6 | 1 |
# Opposite of set_index: the index levels move back into ordinary columns
# and the rows get a default 0..n-1 integer index.
frame2.reset_index()
| c | d | a | b |
---|
0 | one | 0 | 0 | 7 |
1 | one | 1 | 1 | 6 |
2 | one | 2 | 2 | 5 |
3 | two | 0 | 3 | 4 |
4 | two | 1 | 4 | 3 |
5 | two | 2 | 5 | 2 |
6 | two | 3 | 6 | 1 |
联合及合并数据
# Left table for the merge examples: 'key' contains repeated values, and the
# value 'c' has no counterpart in the right table.
df1 = pd.DataFrame(
    dict(
        key=["b", "b", "a", "c", "a", "a", "b"],
        data1=range(7),
    )
)
df1
| key | data1 |
---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | a | 5 |
6 | b | 6 |
# Right table for the merge examples: one row per key, and the value 'd' has
# no counterpart in the left table.
df2 = pd.DataFrame(
    dict(
        key=["a", "b", "d"],
        data2=range(3),
    )
)
df2
inner内连接,结果中的键为两表键的交集(merge的默认方式)
outer外连接,结果中的键为两表键的并集(并非笛卡尔积),无匹配的位置以NaN填充
left左连接,保留左表的所有键
right右连接,保留右表的所有键
# No join column is given, so merge uses the column name the two frames have
# in common ('key'). The default join is inner: only keys present in both
# frames ('a' and 'b') are kept.
pd.merge(df1,df2)
| key | data1 | data2 |
---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
# Same inner join with the key column spelled out for each side.
# left_on/right_on are the form to use when the key columns are named
# differently in the two frames.
pd.merge(df1,df2,left_on='key',right_on='key')
| key | data1 | data2 |
---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
# Outer join: keep the keys found in either frame. Rows without a match on
# the other side get NaN, which is why data1/data2 are displayed as floats.
pd.merge(df1,df2,how='outer')
| key | data1 | data2 |
---|
0 | b | 0.0 | 1.0 |
1 | b | 1.0 | 1.0 |
2 | b | 6.0 | 1.0 |
3 | a | 2.0 | 0.0 |
4 | a | 4.0 | 0.0 |
5 | a | 5.0 | 0.0 |
6 | c | 3.0 | NaN |
7 | d | NaN | 2.0 |
# Index each frame by its 'key' values.
# NOTE(review): neither result is assigned, so df1 and df2 themselves stay
# unchanged. Passing the Series df1['key'] (rather than the label 'key')
# also appears to leave 'key' in place as a regular column next to the new
# index; confirm whether set_index('key') was intended.
df1.set_index(df1['key'])
df2.set_index(df2['key'])
沿轴向连接
# 2x3 integer array containing 0..5, filled row by row.
arr = np.reshape(np.arange(6), (2, 3))
arr
array([[0, 1, 2],
[3, 4, 5]])
# axis=0 joins the arrays along the rows: two (2, 3) arrays give a (4, 3) one.
np.concatenate([arr,arr],axis=0)
array([[0, 1, 2],
[3, 4, 5],
[0, 1, 2],
[3, 4, 5]])
# axis=1 joins the arrays along the columns: two (2, 3) arrays give a (2, 6) one.
np.concatenate([arr,arr],axis=1)
array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5]])