层次化索引
import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(9),index = [['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]])
data
>>>
a 1 -0.951339
2 -0.325023
3 -0.075059
b 1 0.924771
3 -0.091296
c 1 0.168038
2 0.691759
d 2 1.264849
3 -0.919560
dtype: float64
In [3]:
data.index
Out[3]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])
In [4]:
data['b']
Out[4]:
1 0.924771
3 -0.091296
dtype: float64
In [5]:
data['b':'c']
Out[5]:
b 1 0.924771
3 -0.091296
c 1 0.168038
2 0.691759
dtype: float64
In [8]:
data.loc[:,2]
Out[8]:
a -0.325023
c 0.691759
d 1.264849
dtype: float64
In [9]:
data.unstack()
Out[9]:
1 2 3
a -0.951339 -0.325023 -0.075059
b 0.924771 NaN -0.091296
c 0.168038 0.691759 NaN
d NaN 1.264849 -0.919560
In [10]:
data.unstack().stack()
Out[10]:
a 1 -0.951339
2 -0.325023
3 -0.075059
b 1 0.924771
3 -0.091296
c 1 0.168038
2 0.691759
d 2 1.264849
3 -0.919560
dtype: float64
In [11]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
index = [['a','a','b','b'],[1,2,1,2]],
columns = [['ohio','ohio','colorado'],['green','red','green']])
frame
Out[11]:
ohio colorado
green red green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [12]:
# 指定名称
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
Out[12]:
state ohio colorado
color green red green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [13]:
frame['ohio']
Out[13]:
color green red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
重排与分级排序
# swaplevel接受两个级别编号或名称,并返回一个互换级别的新对象
frame.swaplevel('key1','key2') # key1、key2互换
>>>
state ohio colorado
color green red green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
In [16]:
# sort_index根据单个级别中的值对数据排序
frame.sort_index(level = 1)
Out[16]:
state ohio colorado
color green red green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
In [17]:
frame
Out[17]:
state ohio colorado
color green red green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [18]:
frame.swaplevel(0,1)
Out[18]:
state ohio colorado
color green red green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
In [20]:
frame.swaplevel(0,1).sort_index()
Out[20]:
state ohio colorado
color green red green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
In [21]:
frame.swaplevel(0,1).sort_index(level = 0)
Out[21]:
state ohio colorado
color green red green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
In [22]:
frame.swaplevel(0,1).sort_index(level = 1)
Out[22]:
state ohio colorado
color green red green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
根据级别汇总统计
frame
>>>
state ohio colorado
color green red green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
In [25]:
frame.sum(level = 'key2')
Out[25]:
state ohio colorado
color green red green
key2
1 6 8 10
2 12 14 16
In [26]:
frame.sum(level = 'key1')
Out[26]:
state ohio colorado
color green red green
key1
a 3 5 7
b 15 17 19
In [27]:
frame.sum(level = 'color',axis = 1)
Out[27]:
color green red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10
使用DataFrame的列进行索引
frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),
'c':['one','one','one','two','two','two','two'],
'd':[0,1,2,0,1,2,3]})
frame
>>>
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
In [31]:
# set_index函数将列转化为行索引
frame2 = frame.set_index(['c','d'])
frame2
Out[31]:
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
In [33]:
# 这些列默认移除,也可以保留
frame.set_index(['c','d'],drop = False)
Out[33]:
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
In [35]:
# reset_index将层次化索引转移到列
frame2.reset_index()
Out[35]:
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1
合并数据集
merge函数合并
df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],'datal':range(7)})
df2 = pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
df1
>>>
key datal
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
In [39]:
df2
Out[39]:
key data2
0 a 0
1 b 1
2 d 2
In [40]:
# merge默认将重叠列的列名做键
pd.merge(df1,df2)
Out[40]:
key datal data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
In [41]:
# 指定列名做键
pd.merge(df1,df2,on = 'key')
Out[41]:
key datal data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
In [42]:
df3 = pd.DataFrame({'1key':['b','b','a','c','a','a','b'],'data1':range(7)})
df4 = pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})
pd.merge(df3,df4,left_on = '1key',right_on = 'rkey')
Out[42]:
1key data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
In [44]:
pd.merge(df1,df2,how = 'outer')
Out[44]:
key datal data2
0 b 0.0 1.0
1 b 1.0 1.0
2 b 6.0 1.0
3 a 2.0 0.0
4 a 4.0 0.0
5 a 5.0 0.0
6 c 3.0 NaN
7 d NaN 2.0
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],'data2': range(5)})
df1
>>>
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
In [46]:
df2
Out[46]:
key data2
0 a 0
1 b 1
2 a 2
3 b 3
4 d 4
In [47]:
pd.merge(df1,df2, on = 'key',how = 'left')
Out[47]:
key data1 data2
0 b 0 1.0
1 b 0 3.0
2 b 1 1.0
3 b 1 3.0
4 a 2 0.0
5 a 2 2.0
6 c 3 NaN
7 a 4 0.0
8 a 4 2.0
9 b 5 1.0
10 b 5 3.0
In [48]:
pd.merge(df1,df2,how = 'inner')
Out[48]:
key data1 data2
0 b 0 1
1 b 0 3
2 b 1 1
3 b 1 3
4 b 5 1
5 b 5 3
6 a 2 0
7 a 2 2
8 a 4 0
9 a 4 2
In [49]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],'key2': ['one', 'two', 'one'],'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],'key2': ['one', 'one', 'one', 'two'],'rval': [4, 5, 6, 7]})
pd.merge(left,right,on = ['key1','key2'],how = 'outer')
Out[49]:
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
In [51]:
pd.merge(left,right,on = 'key1')
Out[51]:
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
In [52]:
pd.merge(left, right, on='key1', suffixes=('_left','_right'))
Out[52]:
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
merge函数的参数: