包含全部示例的代码仓库见GIthub
1 导入库
import pandas as pd
import numpy as np
2 横向合并
2.1 一列为准
data1 = pd.DataFrame({'one':['a','b','a','a','c','b'], 'two':range(6)})
data2 = pd.DataFrame({'one':['a','b','c','d'],'two':range(10,14)})
data1
# output
one two
0 a 0
1 b 1
2 a 2
3 a 3
4 c 4
5 b 5
data2
# output
one two
0 a 10
1 b 11
2 c 12
3 d 13
根据one列进行横向合并,只保留one中共有元素,交集
data = pd.merge(data1, data2, on='one')
data
# output
one two_x two_y
0 a 0 10
1 a 2 10
2 a 3 10
3 b 1 11
4 b 5 11
5 c 4 12
根据one列进行横向合并,并集
data = pd.merge(data1, data2, on='one', how='outer')
data
# output
one two_x two_y
0 a 0.0 10
1 a 2.0 10
2 a 3.0 10
3 b 1.0 11
4 b 5.0 11
5 c 4.0 12
6 d NaN 13
根据one列进行横向合并,以data1为准
data = pd.merge(data1, data2, on='one', how='left')
data
# output
one two_x two_y
0 a 0 10
1 b 1 11
2 a 2 10
3 a 3 10
4 c 4 12
5 b 5 11
根据one列进行横向合并,以data2为准
data = pd.merge(data1, data2, on='one', how='right')
data
# output
one two_x two_y
0 a 0.0 10
1 a 2.0 10
2 a 3.0 10
3 b 1.0 11
4 b 5.0 11
5 c 4.0 12
6 d NaN 13
2.2 两列为准
data1 = pd.DataFrame({'one1':['a','b','a','a','c','b'], 'two1':range(6)})
data2 = pd.DataFrame({'one2':['a','b','c','d'],'two2':range(10,14)})
data1
# output
one1 two1
0 a 0
1 b 1
2 a 2
3 a 3
4 c 4
5 b 5
data2
# output
one2 two2
0 a 10
1 b 11
2 c 12
3 d 13
根据one列进行横向合并,data1根据one1,data2根据one2
data = pd.merge(data1, data2, left_on='one1', right_on='one2')
data
# output
one1 two1 one2 two2
0 a 0 a 10
1 a 2 a 10
2 a 3 a 10
3 b 1 b 11
4 b 5 b 11
5 c 4 c 12
根据one列进行横向合并,根据index合并,并集
data = pd.merge(data1, data2, left_index=True, right_index=True, how='outer')
data
# output
one1 two1 one2 two2
0 a 0 a 10.0
1 b 1 b 11.0
2 a 2 c 12.0
3 a 3 d 13.0
4 c 4 NaN NaN
5 b 5 NaN NaN
根据one列进行横向合并,左边根据two1,右边根据index
data = pd.merge(data1, data2, left_on='two1', right_index=True, how='outer')
data
# output
one1 two1 one2 two2
0 a 0 a 10.0
1 b 1 b 11.0
2 a 2 c 12.0
3 a 3 d 13.0
4 c 4 NaN NaN
5 b 5 NaN NaN
快速合并,根据index,且没有相同的列,index不同也可以
data1 = data1.iloc[:4]
data1.join(data2)
以data1为准,join data2
data1 = pd.DataFrame({'one1':['a','b','a','a','c','b'], 'two1':range(6)})
data2 = pd.DataFrame({'one2':['a','b','c','d'],'two2':range(10,14)})
data1.join(data2)
# output
one1 two1 one2 two2
0 a 0 a 10.0
1 b 1 b 11.0
2 a 2 c 12.0
3 a 3 d 13.0
4 c 4 NaN NaN
5 b 5 NaN NaN
直接增加一列
data1.assign(three=np.arange(6))
# output
one1 two1 three
0 a 0 0
1 b 1 1
2 a 2 2
3 a 3 3
4 c 4 4
5 b 5 5
3 纵向合并
data1 = pd.DataFrame(np.random.randn(3,4), columns=['a','b','c','d'])
data2 = pd.DataFrame(np.random.randn(2,3), columns=['b','c','a'])
data1
# output
a b c d
0 -0.314425 -0.571201 0.202219 0.823889
1 1.307029 -1.119135 0.734455 -0.438920
2 -0.222125 -0.569837 -0.992926 -1.197502
data2
# output
b c a
0 0.247811 -0.043823 -0.729572
1 -0.681311 1.236160 0.956227
data = pd.concat([data1, data2])
data
# output
a b c d
0 -0.314425 -0.571201 0.202219 0.823889
1 1.307029 -1.119135 0.734455 -0.438920
2 -0.222125 -0.569837 -0.992926 -1.197502
0 -0.729572 0.247811 -0.043823 NaN
1 0.956227 -0.681311 1.236160 NaN
更新index
data = pd.concat([data1, data2], ignore_index=True)
data
# output
a b c d
0 -0.314425 -0.571201 0.202219 0.823889
1 1.307029 -1.119135 0.734455 -0.438920
2 -0.222125 -0.569837 -0.992926 -1.197502
3 -0.729572 0.247811 -0.043823 NaN
4 0.956227 -0.681311 1.236160 NaN