import pandas as pd
示例1:依据一组key合并
left1 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right1 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
left1
| A | B | key |
---|
0 | A0 | B0 | K0 |
---|
1 | A1 | B1 | K1 |
---|
2 | A2 | B2 | K2 |
---|
3 | A3 | B3 | K3 |
---|
right1
| C | D | key |
---|
0 | C0 | D0 | K0 |
---|
1 | C1 | D1 | K1 |
---|
2 | C2 | D2 | K2 |
---|
3 | C3 | D3 | K3 |
---|
pd.merge(left1, right1, on='key')
| A | B | key | C | D |
---|
0 | A0 | B0 | K0 | C0 | D0 |
---|
1 | A1 | B1 | K1 | C1 | D1 |
---|
2 | A2 | B2 | K2 | C2 | D2 |
---|
3 | A3 | B3 | K3 | C3 | D3 |
---|
示例2:依据两组key合并
** 合并时有4种方法how = [‘left’, ‘right’, ‘outer’, ‘inner’],预设值how=‘inner’ **
left2 = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right2 = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
left2
| A | B | key1 | key2 |
---|
0 | A0 | B0 | K0 | K0 |
---|
1 | A1 | B1 | K0 | K1 |
---|
2 | A2 | B2 | K1 | K0 |
---|
3 | A3 | B3 | K2 | K1 |
---|
right2
| C | D | key1 | key2 |
---|
0 | C0 | D0 | K0 | K0 |
---|
1 | C1 | D1 | K1 | K0 |
---|
2 | C2 | D2 | K1 | K0 |
---|
3 | C3 | D3 | K2 | K0 |
---|
pd.merge(left2, right2, on=['key1', 'key2'], how='inner')
| A | B | key1 | key2 | C | D |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
---|
1 | A2 | B2 | K1 | K0 | C1 | D1 |
---|
2 | A2 | B2 | K1 | K0 | C2 | D2 |
---|
pd.merge(left2, right2, on=['key1', 'key2'], how='outer')
| A | B | key1 | key2 | C | D |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
---|
1 | A1 | B1 | K0 | K1 | NaN | NaN |
---|
2 | A2 | B2 | K1 | K0 | C1 | D1 |
---|
3 | A2 | B2 | K1 | K0 | C2 | D2 |
---|
4 | A3 | B3 | K2 | K1 | NaN | NaN |
---|
5 | NaN | NaN | K2 | K0 | C3 | D3 |
---|
pd.merge(left2, right2, on=['key1', 'key2'], how='left')
| A | B | key1 | key2 | C | D |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
---|
1 | A1 | B1 | K0 | K1 | NaN | NaN |
---|
2 | A2 | B2 | K1 | K0 | C1 | D1 |
---|
3 | A2 | B2 | K1 | K0 | C2 | D2 |
---|
4 | A3 | B3 | K2 | K1 | NaN | NaN |
---|
pd.merge(left2, right2, on=['key1', 'key2'], how='right')
| A | B | key1 | key2 | C | D |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
---|
1 | A2 | B2 | K1 | K0 | C1 | D1 |
---|
2 | A2 | B2 | K1 | K0 | C2 | D2 |
---|
3 | NaN | NaN | K2 | K0 | C3 | D3 |
---|
分割线 与left_index 对比
pd.merge(left2, right2, on=['key1', 'key2'], left_index=True, right_index=True, how='right')
| A | B | key1 | key2 | C | D |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
---|
1 | A1 | B1 | K0 | K1 | C1 | D1 |
---|
2 | A2 | B2 | K1 | K0 | C2 | D2 |
---|
3 | A3 | B3 | K2 | K1 | C3 | D3 |
---|
pd.merge(left2, right2, left_index=True, right_index=True, how='right')
| A | B | key1_x | key2_x | C | D | key1_y | key2_y |
---|
0 | A0 | B0 | K0 | K0 | C0 | D0 | K0 | K0 |
---|
1 | A1 | B1 | K0 | K1 | C1 | D1 | K1 | K0 |
---|
2 | A2 | B2 | K1 | K0 | C2 | D2 | K1 | K0 |
---|
3 | A3 | B3 | K2 | K1 | C3 | D3 | K2 | K0 |
---|
right2.index
RangeIndex(start=0, stop=4, step=1)
left2.index
RangeIndex(start=0, stop=4, step=1)
示例3:Indicator
** indicator=True会将合并的记录放在新的一列 **
left3 = pd.DataFrame({
'col1': [0, 1],
'col_left': ['a', 'b']
})
right3 = pd.DataFrame({
'col1': [1, 2, 2],
'col_right': [2, 2, 2]
})
left3
right3
pd.merge(left3, right3, on='col1', how='outer', indicator=True)
| col1 | col_left | col_right | _merge |
---|
0 | 0 | a | NaN | left_only |
---|
1 | 1 | b | 2.0 | both |
---|
2 | 2 | NaN | 2.0 | right_only |
---|
3 | 2 | NaN | 2.0 | right_only |
---|
pd.merge(left3, right3, on='col1', how='outer', indicator='indicator_column')
| col1 | col_left | col_right | indicator_column |
---|
0 | 0 | a | NaN | left_only |
---|
1 | 1 | b | 2.0 | both |
---|
2 | 2 | NaN | 2.0 | right_only |
---|
3 | 2 | NaN | 2.0 | right_only |
---|
示例4:依据index合并
left4 = pd.DataFrame(
{
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']
},
index=['K0', 'K1', 'K2']
)
right4 = pd.DataFrame(
{
'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']
},
index=['K0', 'K2', 'K3']
)
left4
right4
left4.index
Index(['K0', 'K1', 'K2'], dtype='object')
right4.index
Index(['K0', 'K2', 'K3'], dtype='object')
pd.merge(left4, right4, left_index=True, right_index=True, how='outer')
| A | B | C | D |
---|
K0 | A0 | B0 | C0 | D0 |
---|
K1 | A1 | B1 | NaN | NaN |
---|
K2 | A2 | B2 | C2 | D2 |
---|
K3 | NaN | NaN | C3 | D3 |
---|
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
示例5:解决overlapping的问题
left5 = pd.DataFrame({
'k': ['K0', 'K1', 'K2'],
'age': [1, 2, 3]
})
right5 = pd.DataFrame({
'k': ['K0', 'K0', 'K3'],
'age': [4, 5, 6]
})
left5
right5
pd.merge(left5, right5, on='k', suffixes=['_left5', '_right5'], how='inner')
| age_left5 | k | age_right5 |
---|
0 | 1 | K0 | 4 |
---|
1 | 1 | K0 | 5 |
---|
left5.columns
Index(['age', 'k'], dtype='object')
left5.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
age 3 non-null int64
k 3 non-null object
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes
left5.describe().T
| count | mean | std | min | 25% | 50% | 75% | max |
---|
age | 3.0 | 2.0 | 1.0 | 1.0 | 1.5 | 2.0 | 2.5 | 3.0 |
---|
参考链接
Pandas 合并 merge
PANDAS 数据合并与重塑(join/merge篇)