Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似
一、join:直接通过“索引”链接合并2个DataFrame
x.join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False) -> DataFrame
1、案例01
import numpy as np
import pandas as pd
# pd.join() → 直接通过索引链接
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print("left = \n", left)
print('-' * 100)
print("right = \n", right)
print('-' * 200)
x1 = left.join(right)
print("x1 = \n", x1)
print('-' * 100)
# 等价于:pd.merge(left, right, left_index=True, right_index=True, how='outer')
x2 = left.join(right, how='outer')
print("x2 = \n", x2)
print('-' * 200)
打印结果:
left =
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
----------------------------------------------------------------------------------------------------
right =
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x1 =
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
----------------------------------------------------------------------------------------------------
x2 =
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
2、案例02
import numpy as np
import pandas as pd
# pd.join() → 直接通过索引链接
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A4'],
'B': ['B0', 'B1', 'B2', 'B4']},
index=['K0', 'K1', 'K2', 'K4'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3']},
index=['K0', 'K2', 'K3'])
print("left = \n", left)
print('-' * 100)
print("right = \n", right)
print('-' * 200)
x1 = left.join(right)
print("x1 = \n", x1)
print('-' * 100)
# 等价于:pd.merge(left, right, left_index=True, right_index=True, how='outer')
x2 = left.join(right, how='outer')
print("x2 = \n", x2)
print('-' * 200)
打印结果:
left =
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
K4 A4 B4
----------------------------------------------------------------------------------------------------
right =
C
K0 C0
K2 C2
K3 C3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x1 =
A B C
K0 A0 B0 C0
K1 A1 B1 NaN
K2 A2 B2 C2
K4 A4 B4 NaN
----------------------------------------------------------------------------------------------------
x2 =
A B C
K0 A0 B0 C0
K1 A1 B1 NaN
K2 A2 B2 C2
K3 NaN NaN C3
K4 A4 B4 NaN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
3、x1连接x2的某一列
import pandas as pd
# pd.join() → 直接通过索引链接
df1 = pd.DataFrame({'key': list('bbacaab'),
'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'),
'date2': [11, 2, 33]})
print("df1 = \n", df1)
print('-' * 100)
print("df2 = \n", df2)
print('-' * 200)
# df1只连接df2的date2列
df3 = df2['date2']
print("df3 = \n{0}\n type(df3) = {1}".format(df3, type(df3)))
print('-' * 100)
x2 = df1.join(df2['date2'])
print("x2 = \n", x2)
print('-' * 200)
打印结果:
df1 =
key data1
0 b 1
1 b 3
2 a 2
3 c 4
4 a 5
5 a 9
6 b 7
----------------------------------------------------------------------------------------------------
df2 =
key date2
0 a 11
1 b 2
2 d 33
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
df3 =
0 11
1 2
2 33
Name: date2, dtype: int64
type(df3) = <class 'pandas.core.series.Series'>
----------------------------------------------------------------------------------------------------
x2 =
key data1 date2
0 b 1 11.0
1 b 3 2.0
2 a 2 33.0
3 c 4 NaN
4 a 5 NaN
5 a 9 NaN
6 b 7 NaN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
二、merge:通过“自定义外键”链接合并2个DataFrame
pd.merge(left, right, how=‘inner’, on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, suffixes=(‘_x’, ‘_y’), copy=True, indicator=False)
- 可以指定按照两组数据的共同键值对合并或者左右各自
left
: DataFrameright
: 另一个DataFrameon
: 指定的共同键how
:按照什么方式连接left_on
,right_on
,left_index
,right_index
:当连接键不为相同的列时,可以单独设置左键与右键
Merge method | SQL Join Name | Description |
---|---|---|
left | LEFT OUTER JOIN | Use keys from left frame only |
right | RIGHT OUTER JOIN | Use keys from right frame only |
outer | FULL OUTER JOIN | Use union of keys from both frames |
inner | INNER JOIN | Use intersection of keys from both frames |
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
import pandas as pd
# pd.join() → 直接通过索引链接
df1 = pd.DataFrame({'key': list('bbacaab'),
'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'),
'date2': [11, 2, 33]})
print("df1 = \n", df1)
print('-' * 100)
print("df2 = \n", df2)
print('-' * 200)
# 以相同的键 'key' 为外键
x1 = pd.merge(df1, df2) # 默认 inner方式
print("x1 = \n", x1)
print('-' * 200)
# 开启 left_index=True, right_index=True 之后,默认 suffixes=('_x', '_y')
x2 = pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_x', '_y'))
print("x2 = \n", x2)
print('-' * 200)
# 开启 left_index=True, right_index=True 之后,默认 suffixes=('_x', '_y')
x3 = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', suffixes=('_x', '_y'))
print("x3 = \n", x3)
print('-' * 200)
打印结果:
df1 =
key data1
0 b 1
1 b 3
2 a 2
3 c 4
4 a 5
5 a 9
6 b 7
----------------------------------------------------------------------------------------------------
df2 =
key date2
0 a 11
1 b 2
2 d 33
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x1 =
key data1 date2
0 b 1 2
1 b 3 2
2 b 7 2
3 a 2 11
4 a 5 11
5 a 9 11
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x2 =
key_x data1 key_y date2
0 b 1 a 11
1 b 3 b 2
2 a 2 d 33
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x3 =
key_x data1 key_y date2
0 b 1 a 11.0
1 b 3 b 2.0
2 a 2 d 33.0
3 c 4 NaN NaN
4 a 5 NaN NaN
5 a 9 NaN NaN
6 b 7 NaN NaN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
1、内连接(默认内连接)
inner:默认,取交集
# 默认内连接
result = pd.merge(left, right, on=['key1', 'key2'])
2、左连接
left:按照左表为参考合并,数据缺失范围NaN
result = pd.merge(left, right, how='left', on=['key1', 'key2'])
3、右连接
right:按照右表为参考合并,数据缺失范围NaN
result = pd.merge(left, right, how='right', on=['key1', 'key2'])
4、外链接
outer:取并集,数据缺失范围NaN
result = pd.merge(left, right, how='outer', on=['key1', 'key2'])
5、参数:left_on、right_on、left_index、right_index
当连接键不为相同的列时,单独设置左键&右键
import numpy as np
import pandas as pd
# 参数 left_on, right_on, left_index, right_index → 当连接键不为相同的列时,单独设置左键&右键
df1 = pd.DataFrame({'lkey': list('bbacaab'),
'data1': range(7)})
df2 = pd.DataFrame({'rkey': list('abd'),
'date2': range(3)})
print("df1 = \n", df1)
print('-' * 100)
print("df2 = \n", df2)
print('-' * 200)
# df1以‘lkey’为键,df2以‘rkey’为键
data1 = pd.merge(df1, df2, left_on='lkey', right_on='rkey')
print("data1 = \n", data1)
print('-' * 200)
df1 = pd.DataFrame({'key': list('abcdfeg'),
'data1': range(7)})
df2 = pd.DataFrame({'date2': range(100, 105)},
index=list('abcde'))
# df1以‘key’为键,df2以index为键
# left_index:为True时,第一个df以index为键,默认False
# right_index:为True时,第二个df以index为键,默认False
# 所以 left_on, right_on, left_index, right_index可以相互组合:left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index
data2 = pd.merge(df1, df2, left_on='key', right_index=True)
print("data2 = \n", data2)
print('-' * 200)
打印结果:
df1 =
lkey data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
----------------------------------------------------------------------------------------------------
df2 =
rkey date2
0 a 0
1 b 1
2 d 2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
data1 =
lkey data1 rkey date2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
data2 =
key data1 date2
0 a 0 100
1 b 1 101
2 c 2 102
3 d 3 103
5 e 5 104
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
5、参数:sort
import pandas as pd
# 参数 sort
df1 = pd.DataFrame({'key': list('bbacaab'),
'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'),
'date2': [11, 2, 33]})
print("df1 = \n", df1)
print('-' * 100)
print("df2 = \n", df2)
print('-' * 200)
x1 = pd.merge(df1, df2, on='key', how='outer')
print("x1 = \n", x1)
print('-' * 100)
# sort:按照字典顺序通过 连接键 对结果DataFrame进行排序。默认为False,设置为False会大幅提高性能
x2 = pd.merge(df1, df2, on='key', how='outer', sort=True)
print("x2 = \n", x2)
print('-' * 100)
# 也可直接用Dataframe的排序方法:sort_values,sort_index
x3 = x2.sort_values('data1')
print("x3 = \n", x3)
print('-' * 200)
打印结果:
df1 =
key data1
0 b 1
1 b 3
2 a 2
3 c 4
4 a 5
5 a 9
6 b 7
----------------------------------------------------------------------------------------------------
df2 =
key date2
0 a 11
1 b 2
2 d 33
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x1 =
key data1 date2
0 b 1.0 2.0
1 b 3.0 2.0
2 b 7.0 2.0
3 a 2.0 11.0
4 a 5.0 11.0
5 a 9.0 11.0
6 c 4.0 NaN
7 d NaN 33.0
----------------------------------------------------------------------------------------------------
x2 =
key data1 date2
0 a 2.0 11.0
1 a 5.0 11.0
2 a 9.0 11.0
3 b 1.0 2.0
4 b 3.0 2.0
5 b 7.0 2.0
6 c 4.0 NaN
7 d NaN 33.0
----------------------------------------------------------------------------------------------------
x3 =
key data1 date2
3 b 1.0 2.0
0 a 2.0 11.0
4 b 3.0 2.0
6 c 4.0 NaN
1 a 5.0 11.0
5 b 7.0 2.0
2 a 9.0 11.0
7 d NaN 33.0
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0