panda 表合并

import pandas as pd

示例1:依据一组key合并

# 定义资料集并打印出
left1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
right1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
left1
ABkey
0A0B0K0
1A1B1K1
2A2B2K2
3A3B3K3
right1
CDkey
0C0D0K0
1C1D1K1
2C2D2K2
3C3D3K3
pd.merge(left1, right1, on='key')
ABkeyCD
0A0B0K0C0D0
1A1B1K1C1D1
2A2B2K2C2D2
3A3B3K3C3D3

示例2:依据两组key合并

** 合并时有4种方法how = [‘left’, ‘right’, ‘outer’, ‘inner’],预设值how=‘inner’ **

# 定义资料集并打印出
left2 = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
right2 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
left2
ABkey1key2
0A0B0K0K0
1A1B1K0K1
2A2B2K1K0
3A3B3K2K1
right2
CDkey1key2
0C0D0K0K0
1C1D1K1K0
2C2D2K1K0
3C3D3K2K0
pd.merge(left2, right2, on=['key1', 'key2'], how='inner')
ABkey1key2CD
0A0B0K0K0C0D0
1A2B2K1K0C1D1
2A2B2K1K0C2D2
pd.merge(left2, right2, on=['key1', 'key2'], how='outer')
ABkey1key2CD
0A0B0K0K0C0D0
1A1B1K0K1NaNNaN
2A2B2K1K0C1D1
3A2B2K1K0C2D2
4A3B3K2K1NaNNaN
5NaNNaNK2K0C3D3
pd.merge(left2, right2, on=['key1', 'key2'], how='left')
ABkey1key2CD
0A0B0K0K0C0D0
1A1B1K0K1NaNNaN
2A2B2K1K0C1D1
3A2B2K1K0C2D2
4A3B3K2K1NaNNaN
pd.merge(left2, right2, on=['key1', 'key2'], how='right')
ABkey1key2CD
0A0B0K0K0C0D0
1A2B2K1K0C1D1
2A2B2K1K0C2D2
3NaNNaNK2K0C3D3

分割线 与left_index 对比

# 如果 on 和 left_index=True, right_index=True 同时使用,以 left 的 on 为准
pd.merge(left2, right2, on=['key1', 'key2'], left_index=True, right_index=True,  how='right')
ABkey1key2CD
0A0B0K0K0C0D0
1A1B1K0K1C1D1
2A2B2K1K0C2D2
3A3B3K2K1C3D3
pd.merge(left2, right2, left_index=True, right_index=True,  how='right')
ABkey1_xkey2_xCDkey1_ykey2_y
0A0B0K0K0C0D0K0K0
1A1B1K0K1C1D1K1K0
2A2B2K1K0C2D2K1K0
3A3B3K2K1C3D3K2K0
right2.index
RangeIndex(start=0, stop=4, step=1)
left2.index
RangeIndex(start=0, stop=4, step=1)

示例3:Indicator

** indicator=True会将合并的记录放在新的一列 **

# 定义资料集并打印出
left3 = pd.DataFrame({
    'col1': [0, 1],
    'col_left': ['a', 'b']
})
right3 = pd.DataFrame({
    'col1': [1, 2, 2],
    'col_right': [2, 2, 2]
})
left3
col1col_left
00a
11b
right3
col1col_right
012
122
222
# 依据col1进行合并,并启用indicator=True,最后打印出
pd.merge(left3, right3, on='col1', how='outer', indicator=True)
col1col_leftcol_right_merge
00aNaNleft_only
11b2.0both
22NaN2.0right_only
32NaN2.0right_only
# 自定indicator column的名称,并打印出
pd.merge(left3, right3, on='col1', how='outer', indicator='indicator_column')
col1col_leftcol_rightindicator_column
00aNaNleft_only
11b2.0both
22NaN2.0right_only
32NaN2.0right_only

示例4:依据index合并

# 定义资料集并打印出
left4 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2']
    },
    index=['K0', 'K1', 'K2']
)
right4 = pd.DataFrame(
    {
        'C': ['C0', 'C2', 'C3'],
        'D': ['D0', 'D2', 'D3']
    },
    index=['K0', 'K2', 'K3']
)
left4
AB
K0A0B0
K1A1B1
K2A2B2
right4
CD
K0C0D0
K2C2D2
K3C3D3
left4.index
Index(['K0', 'K1', 'K2'], dtype='object')
right4.index
Index(['K0', 'K2', 'K3'], dtype='object')
#依据左右资料集的index进行合并,how='outer',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='outer')
ABCD
K0A0B0C0D0
K1A1B1NaNNaN
K2A2B2C2D2
K3NaNNaNC3D3
#依据左右资料集的index进行合并,how='inner',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
ABCD
K0A0B0C0D0
K2A2B2C2D2
#依据左右资料集的index进行合并,how='inner',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
ABCD
K0A0B0C0D0
K2A2B2C2D2

示例5:解决overlapping的问题

#定义资料集
left5 = pd.DataFrame({
    'k': ['K0', 'K1', 'K2'], 
    'age': [1, 2, 3]
})
right5 = pd.DataFrame({
    'k': ['K0', 'K0', 'K3'], 
    'age': [4, 5, 6]
})
left5
agek
01K0
12K1
23K2
right5
agek
04K0
15K0
26K3
# age 冲突的情况下,可以使用 suffixes 属性来重命名
pd.merge(left5, right5, on='k',  suffixes=['_left5', '_right5'], how='inner')
age_left5kage_right5
01K04
11K05
left5.columns
Index(['age', 'k'], dtype='object')
left5.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
age    3 non-null int64
k      3 non-null object
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes
left5.describe().T
countmeanstdmin25%50%75%max
age3.02.01.01.01.52.02.53.0

参考链接

Pandas 合并 merge

PANDAS 数据合并与重塑(join/merge篇)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值