import numpy as np
# Build a 3x3 integer matrix and print its basic attributes
# (number of dimensions, shape, element count, element dtype).
array = np.array(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
)
print(array)
# Fixed label typo: 纬度 (latitude) -> 维度 (dimension).
print(f"维度:{array.ndim}")
print(f"形状:{array.shape}")
print(f"大小:{array.size}")
print(f"元素类型:{array.dtype}")
"""
创建array
"""
a = np.array([1, 2, 3], dtype=np.int32)
print(f"a 元素类型 :{a.dtype}") # a 元素类型 :int32
b = np.array([1, 2, 3], dtype=float)
print(f"b 元素类型:{b.dtype}")
print(f"b == {b}")
# b 元素类型:float64
# b == [1. 2. 3.]
# 一维数组
c = np.array([1, 2, 3, 4, 5])
print(f"一维数组c: {c}")
# 二维数组
d = np.array([[1, 2, 3], [4, 5, 6]])
print(f"二维矩阵d:{d}")
zero = np.zeros((2, 3)) # 生成2行3列全为0的矩阵
print(zero)
# [[0. 0. 0.]
# [0. 0. 0.]]
print(f"zero type is: {zero.dtype}") # zero type is: float64
one = np.ones((3, 4)) # 生成3行4列全为1的矩阵
print(one)
"""
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
"""
empty = np.empty((3, 2)) # 生成3行2列全都接近于0的矩阵 不等于0
print(empty)
"""
[[0. 0.]
[0. 0.]
[0. 0.]]
"""
e = np.arange(10)
print(f"e: {e}") # e: [0 1 2 3 4 5 6 7 8 9]
f = np.arange(4, 12)
print(f"f: {f}") # f: [ 4 5 6 7 8 9 10 11]
g = np.arange(1, 100, 2)
print(f"g: {g}")
"""
g: [ 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47
49 51 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95
97 99]
"""
h = np.arange(8).reshape(4, 2) # 生成一个最大数为8的4行2列的二维数组
print(f"h: {h}")
"""
h: [[0 1]
[2 3]
[4 5]
[6 7]]
"""
"""
随机数生成以及矩阵的运算
"""
sample = np.random.random((3, 2)) # 生成3行2列从0到1的随机数
print(f"sample: {sample}")
print(f"sample type is : {type(sample)}") # sample type is : <class 'numpy.ndarray'>
sample2 = np.random.normal(size=(3, 2)) # 生成3行2列符合标准正态分布的随机数
print(f"sample2 : {sample2}")
"""
sample2 : [[-0.27950385 1.33901519]
[-1.01804431 -1.18704597]
[ 0.45076097 0.59535488]]
"""
sample3 = np.random.randint(0, 10, size=(3, 2)) # 生成3行2列从0到10的随机整数
print(f"sample3: {sample3}")
"""
sample3: [[5 8]
[5 3]
[7 6]]
"""
print(np.sum(h)) # 对一个二维数组求和
print(np.min(sample3)) # 求最小值
print(np.max(sample2)) # 求最大值
q = np.sum(sample3, axis=0) # 对列求和 axis=0:代表对横轴操作,也就是第0轴, 在运算的过程中其运算的方向表现为纵向运算
print(f"q: {q}")
"""
sample3: [[4 1]
[1 3]
[2 4]]
q: [7 8]
"""
w = np.sum(sample3, axis=1) # 对行求和 axis=1:代表对纵轴操作,也就是第1轴, 在运算的过程中其运算的方向表现为横向运算
print(f"w: {w}")
"""
sample3: [[6 5]
[6 9]
[7 6]]
w: [11 15 13]
"""
print(f"求最小值的索引: {np.argmin(sample3)}")
"""
sample3: [[8 3]
[7 0]
[2 2]]
求最小值的索引: 3
sample3: [[0 2]
[6 3]
[5 0]]
求最小值的索引: 0
sample3: [[6 4]
[4 8]
[1 5]]
求最小值的索引: 4
"""
print(f"求最大值的索引: {np.argmax(sample3)}")
"""
sample3: [[7 2]
[5 4]
[5 0]]
求最大值的索引: 0
"""
print(f"求平均数: {np.mean(sample3)}")
print(f"求平均数: {sample3.mean()}")
"""
求平均数: 4.333333333333333
求平均数: 4.333333333333333
"""
print(f"求中位数: {np.median(sample3)}")
"""
sample3: [[7 3]
[8 3]
[6 5]]
求中位数: 5.5
"""
kk = np.sqrt(sample3) # 开方
print(f"kk: {kk}")
"""
sample3: [[1 7]
[2 8]
[9 3]]
kk: [[1. 2.64575131]
[1.41421356 2.82842712]
[3. 1.73205081]]
"""
sample4 = np.random.randint(0, 10, size=(1, 10)) # 1行10列 最大10
print(f"sample4: {sample4}")
print(np.sort(sample4)) # 排序
"""
sample4 : [[8 7 2 6 5 2 3 3 7 2]]
[[2 2 2 3 3 5 6 7 7 8]]
"""
print(np.clip(sample4, 2, 7)) # 小于2就变成2 大于7就变成7
"""
sample4: [[2 5 9 7 1 5 2 0 8 9]]
[[0 1 2 2 5 5 7 8 9 9]]
[[2 5 7 7 2 5 2 2 7 7]]
"""
"""
numpy 的索引
"""
import numpy as np
arr1 = np.arange(2, 14)
print(f"arr1: {arr1}") # arr1: [ 2 3 4 5 6 7 8 9 10 11 12 13]
print(f"第二个位置的数据: {arr1[2]}") # 第二个位置的数据: 4
print(f"第一到第四个位置的数据: {arr1[1:4]}") # 第一到第四个位置的数据: [3 4 5]
print(f"前5个数据: {arr1[:5]}") # 前5个数据: [2 3 4 5 6]
print(f"最后两个数据: {arr1[-2:]}") # 最后两个数据: [12 13]
arr2 = arr1.reshape(3, 4)
print(f"arr2: {arr2}")
"""
arr2: [[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
"""
print(f"arr2[1]: {arr2[1]}") # arr2[1]: [6 7 8 9]
print(f"arr2[1][2]: {arr2[1][2]}") # arr2[1][2]: 8
# 迭代行
for x in arr2:
print(f"x: {x}")
"""
x: [2 3 4 5]
x: [6 7 8 9]
x: [10 11 12 13]
"""
# 迭代列
for x in arr2.T:
print(x)
"""
[ 2 6 10]
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
"""
"""
array 合并
"""
arr3 = np.array([1, 2, 3])
arr4 = np.array([4, 5, 6])
arr5 = np.vstack((arr3, arr4)) # 垂直合并
print(f"arr5: {arr5}")
"""
arr5: [[1 2 3]
[4 5 6]]
"""
arr6 = np.hstack((arr3, arr4)) # 水平合并
print(f"arr6: {arr6}") # arr6: [1 2 3 4 5 6]
print(f"arr6形状:{arr6.shape}") # arr6形状:(6,)
arr6 = np.concatenate((arr3, arr4), axis=0) # 合并的array纬度要相同,array形状要匹配 axis=0纵向合并
print(f"arr6: {arr6}") # arr6: [1 2 3 4 5 6]
arr7 = np.concatenate((arr5, arr5), axis=1) # 合并的array纬度要相同,array形状要匹配 axis=0横向合并
print(f"arr7: {arr7}")
"""
arr7: [[1 2 3 1 2 3]
[4 5 6 4 5 6]]
"""
"""
分割
"""
arra = np.arange(12).reshape((3, 4))
print(f"arra: {arra}")
"""
arra: [[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
"""
arrs, arrd = np.split(arra, 2, axis=1) # 水平方向分割,分成2份
print(f"arrs: {arrs}")
print(f"arrd: {arrd}")
"""
arrs: [[0 1]
[4 5]
[8 9]]
arrd: [[ 2 3]
[ 6 7]
[10 11]]
"""
arrf, arrg, arrh = np.split(arra, 3, axis=0) # 垂直方向分割,分成3份
print(f"arrf: {arrf}")
print(f"arrg: {arrg}")
print(f"arrh: {arrh}")
"""
arrf: [[0 1 2 3]]
arrg: [[4 5 6 7]]
arrh: [[ 8 9 10 11]]
"""
arrf, arrg, arrh = np.vsplit(arra, 3) # 垂直分割
print(f"arrf: {arrf}")
print(f"arrg: {arrg}")
print(f"arrh: {arrh}")
"""
arrf: [[0 1 2 3]]
arrg: [[4 5 6 7]]
arrh: [[ 8 9 10 11]]
"""
"""
深拷贝和浅拷贝
"""
arrq = np.array([1, 2, 3])
arrw = arrq # 浅拷贝:共享一块内存
arrw[0] = 10
print(f"arrq: {arrq}") # arrq: [10 2 3] 已经被改变了
arre = arrq.copy()
arre[1] = 100
print(f"arrq: {arrq}") # arrq: [10 2 3] 没有改变
import pandas as pd
import numpy as np # NOTE(review): redundant re-import — numpy is already imported above
s1 = pd.Series([4, 5, 7, -3, 2]) # Series with the default RangeIndex
print(f"s1: {s1}")
"""
s1: 0 4
1 5
2 7
3 -3
4 2
dtype: int64
"""
print(s1.values) # the values: [ 4 5 7 -3 2]
print(s1.index) # the index: RangeIndex(start=0, stop=5, step=1)
s2 = pd.Series([4.0, 5.2, 3.5, 6.7], index=['a', 'b', 'c', 'd']) # explicit string index labels
print(f"s2: {s2}")
"""
s2: a 4.0
b 5.2
c 3.5
d 6.7
dtype: float64
"""
# Look up a value by index label
print(s2['a']) # 4.0
print('b' in s2) # True
print('e' in s2) # False
print(4.0 in s2) # False — `in` tests the index LABELS, not the values
# A Series can be viewed as a fixed-length ordered dict
dict1 = {'apple': 100, 'pen': 3, 'orange': 20}
s3 = pd.Series(dict1) # the dict keys become the index
print(f"s3: {s3}")
"""
s3: apple 100
pen 3
orange 20
dtype: int64
"""
# DataFrame built from a dict of equal-length columns
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [1000, 10000, 100000, 1000000],
    'pay': [10000, 1000, 1000, 100]
}
df1 = pd.DataFrame(data)
print(f"df1: {df1}")
"""
df1: year income pay
0 2014 1000 10000
1 2015 10000 1000
2 2016 100000 1000
3 2017 1000000 100
"""
df2 = pd.DataFrame(np.arange(12).reshape((3, 4))) # default integer row and column labels
print(f"df2: {df2}")
"""
df2: 0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
df3 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=[9, 8, 7, 6]) # custom column labels
print(f"df3: {df3}")
"""
df3: 9 8 7 6
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
# Column labels of the DataFrame
print(f"df1: {df1.columns}") # df1: Index(['year', 'income', 'pay'], dtype='object')
# Row index of the DataFrame
print(f"df1: {df1.index}") # df1: RangeIndex(start=0, stop=4, step=1)
# Underlying values as a 2-D numpy array
print(f"df1 value: {df1.values}")
"""
df1 value: [[ 2014 1000 10000]
[ 2015 10000 1000]
[ 2016 100000 1000]
[ 2017 1000000 100]]
"""
print(df1.describe()) # summary statistics for the numeric columns
"""
describe
year income pay
count 4.000000 4.000000 4.000000
mean 2015.500000 277750.000000 3025.000000
std 1.290994 483570.315466 4669.314725
min 2014.000000 1000.000000 100.000000
25% 2014.750000 7750.000000 775.000000
50% 2015.500000 55000.000000 1000.000000
75% 2016.250000 325000.000000 3250.000000
max 2017.000000 1000000.000000 10000.000000
"""
print("-------------")
print(df1.sort_index(axis=1)) # 列排序
"""
income pay year
0 1000 10000 2014
1 10000 1000 2015
2 100000 1000 2016
3 1000000 100 2017
"""
print(df1.sort_index(axis=0)) # 行排序
"""
year income pay
0 2014 1000 10000
1 2015 10000 1000
2 2016 100000 1000
3 2017 1000000 100
"""
print("============================")
"""
选择数据
"""
dates = pd.date_range('20170101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(f"df4: {df4}")
"""
df4: a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
"""
print(df4['a']) # 将dataframe的列获取为一个series
"""
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: a, dtype: int64
"""
print(df4.a)
"""
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: a, dtype: int64
"""
print(df4[0:2]) # 获取前两行
"""
a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
"""
print(df4['20170102':'20170104']) # 通过标签选择数据范围
"""
a b c d
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
"""
print(df4.loc['20170102']) # 选择某个标签的某一行数据
"""
a 4
b 5
c 6
d 7
Name: 2017-01-02 00:00:00, dtype: int64
"""
print(df4.loc['20170102', ['a', 'b']])
"""
a 4
b 5
Name: 2017-01-02 00:00:00, dtype: int64
"""
print(df4.loc[:, ['a', 'b']])
"""
a b
2017-01-01 0 1
2017-01-02 4 5
2017-01-03 8 9
2017-01-04 12 13
2017-01-05 16 17
2017-01-06 20 21
"""
print('-----------------')
# 通过位置选择数据
print(df4.iloc[2]) # 第3行
"""
-----------------
a 8
b 9
c 10
d 11
Name: 2017-01-03 00:00:00, dtype: int64
"""
###############行 列
print(df4.iloc[1:3, 2:4])
"""
df4: a b c d
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
c d
2017-01-02 6 7
2017-01-03 10 11
"""
print(df4.iloc[[1, 3, 4], [1, 3]])
"""
b d
2017-01-02 5 7
2017-01-04 13 15
2017-01-05 17 19
"""
# ----- Handling missing data -----
dates = np.arange(20170101, 20170105) # plain integers used as row labels (not real datetime values)
dfa = pd.DataFrame(np.arange(12).reshape((4, 3)), index=dates, columns=['A', 'B', 'C'])
print(f"dfa: {dfa}")
"""
A B C
20170101 0 1 2
20170102 3 4 5
20170103 6 7 8
20170104 9 10 11
"""
df_nal = pd.DataFrame(dfa, index=dates, columns=['A', 'B', 'C', 'D', 'E']) # columns D and E don't exist in dfa, so they are filled with NaN
print(f"df_nal: {df_nal}")
"""
A B C D E
20170101 0 1 2 NaN NaN
20170102 3 4 5 NaN NaN
20170103 6 7 8 NaN NaN
20170104 9 10 11 NaN NaN
"""
s_d = pd.Series([3, 4, 5], index=dates[:3]) # covers only the first three row labels
s_e = pd.Series([32, 43, 54], index=dates[1:]) # covers only the last three row labels
df_nal['D'] = s_d # assignment aligns on the index; rows missing from the Series stay NaN
df_nal['E'] = s_e
print(df_nal)
"""
A B C D E
20170101 0 1 2 3.0 NaN
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
20170104 9 10 11 NaN 54.0
"""
dd = df_nal.dropna() # drop rows containing NaN; axis: 0=rows, 1=columns; how: 'any' (default) drops on any NaN, 'all' only if every value is NaN
print(dd)
"""
A B C D E
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
"""
print(df_nal.fillna(value=0)) # replace NaN with 0
"""
A B C D E
20170101 0 1 2 3.0 0.0
20170102 3 4 5 4.0 32.0
20170103 6 7 8 5.0 43.0
20170104 9 10 11 0.0 54.0
"""
print(df_nal.isnull()) # True where the value is NaN
"""
A B C D E
20170101 False False False False True
20170102 False False False False False
20170103 False False False False False
20170104 False False False True False
"""
print(np.any(df_nal.isnull())) # True if there is at least one NaN anywhere
print(np.all(df_nal.isnull())) # True only if EVERY value is NaN
# ----- pandas file reading and writing (left commented out: the CSV file is not part of this repo) -----
# file = pd.read_csv('1960-2019全球GDP数据.csv', encoding='gbk')
# print(f"file: {file}")
"""
file: year GDP rate
0 1960 美国 5.433000e+11
1 1960 英国 7.323397e+10
2 1960 法国 6.222548e+10
3 1960 中国 5.971647e+10
4 1960 日本 4.430734e+10
... ... ... ...
9925 2019 圣多美和普林西比 4.186374e+08
9926 2019 帕劳 2.683549e+08
9927 2019 基里巴斯 1.946472e+08
9928 2019 瑙鲁 1.182234e+08
9929 2019 图瓦卢 4.727146e+07
"""
# file.iloc[0, 0] = '1971' # modify a single cell by integer position
# print(file)
"""
year GDP rate
0 1971 美国 5.433000e+11
1 1960 英国 7.323397e+10
2 1960 法国 6.222548e+10
3 1960 中国 5.971647e+10
4 1960 日本 4.430734e+10
... ... ... ...
9925 2019 圣多美和普林西比 4.186374e+08
9926 2019 帕劳 2.683549e+08
9927 2019 基里巴斯 1.946472e+08
9928 2019 瑙鲁 1.182234e+08
9929 2019 图瓦卢 4.727146e+07
"""
# file.to_csv('1960-2019全球GDP数据.csv') # write back to CSV
# ----- concatenation with pd.concat -----
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'c']) # note the duplicated column label 'c' — pandas allows it
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'b', 'c', 'c'])
df3 = pd.DataFrame(np.arange(24, 36).reshape((3, 4)), columns=['a', 'b', 'c', 'c'])
df4 = pd.concat([df1, df2, df3], axis=0) # stack vertically; original row labels are kept (duplicates appear)
print(f"df4: {df4}")
"""
df4: a b c c
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
0 24 25 26 27
1 28 29 30 31
2 32 33 34 35
"""
df4 = pd.concat([df1, df2, df3], axis=0, ignore_index=True) # stack vertically and renumber the row index 0..n-1
print(f"df4: {df4}")
"""
df4: a b c c
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
5 20 21 22 23
6 24 25 26 27
7 28 29 30 31
8 32 33 34 35
"""
df5 = pd.concat([df1, df2, df3], axis=1) # join horizontally; original column labels are kept
print(f"df5: {df5}")
"""
df5: a b c c a b c c a b c c
0 0 1 2 3 12 13 14 15 24 25 26 27
1 4 5 6 7 16 17 18 19 28 29 30 31
2 8 9 10 11 20 21 22 23 32 33 34 35
"""
df5 = pd.concat([df1, df2, df3], axis=1, ignore_index=True) # join horizontally and renumber the COLUMN labels
print(f"df5: {df5}")
"""
df5: 0 1 2 3 4 5 6 7 8 9 10 11
0 0 1 2 3 12 13 14 15 24 25 26 27
1 4 5 6 7 16 17 18 19 28 29 30 31
2 8 9 10 11 20 21 22 23 32 33 34 35
"""
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'c', 'd', 'e'])
df3 = pd.concat([df1, df2], join='outer', ignore_index=True) # union of the columns; missing cells are filled with NaN
print(f"df3: {df3}")
"""
a b c f d e
0 0 1.0 2 3.0 NaN NaN
1 4 5.0 6 7.0 NaN NaN
2 8 9.0 10 11.0 NaN NaN
3 12 NaN 13 NaN 14.0 15.0
4 16 NaN 17 NaN 18.0 19.0
5 20 NaN 21 NaN 22.0 23.0
"""
df3 = pd.concat([df1, df2], join='inner', ignore_index=True) # intersection of the columns; non-shared columns are dropped
print(f"df3: {df3}")
"""
a c
0 0 2
1 4 6
2 8 10
3 12 13
4 16 17
5 20 21
"""
# ----- database-style joins with pd.merge -----
left = pd.DataFrame({
    'key': ['k1', 'k2', 'k3'],
    'A': ['A1', 'A2', 'A3'],
    'B': ['B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['k1', 'k2', 'k3'],
    'C': ['C1', 'C2', 'C3'],
    'D': ['D1', 'D2', 'D3'],
})
res = pd.merge(left, right, on='key') # inner join on the single column 'key'
print(f"res: {res}")
"""
key A B C D
0 k1 A1 B1 C1 D1
1 k2 A2 B2 C2 D2
2 k3 A3 B3 C3 D3
"""
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1'],
    'key2': ['k1', 'k2', 'k1'],
    'A': ['A1', 'A2', 'A3'],
    'B': ['B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['k1', 'k0', 'k0'],
    'key2': ['k1', 'k1', 'k3'],
    'C': ['C1', 'C2', 'C3'],
    'D': ['D1', 'D2', 'D3'],
})
res = pd.merge(left, right, on=['key1', 'key2'], how='outer') # keep every key pair from BOTH sides (how defaults to 'inner')
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k0 k2 A2 B2 NaN NaN
2 k1 k1 A3 B3 C1 D1
3 k0 k3 NaN NaN C3 D3
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='inner') # only key pairs present in both frames
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k1 k1 A3 B3 C1 D1
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='left') # every row of `left`, matched against `right` where possible
print(res)
"""
key1 key2 A B C D
0 k0 k1 A1 B1 C2 D2
1 k0 k2 A2 B2 NaN NaN
2 k1 k1 A3 B3 C1 D1
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='right') # every row of `right`, matched against `left` where possible
print(res)
"""
key1 key2 A B C D
0 k1 k1 A3 B3 C1 D1
1 k0 k1 A1 B1 C2 D2
2 k0 k3 NaN NaN C3 D3
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True) # adds a '_merge' column describing each row's origin
print(res)
"""
key1 key2 A B C D _merge
0 k0 k1 A1 B1 C2 D2 both
1 k0 k2 A2 B2 NaN NaN left_only
2 k1 k1 A3 B3 C1 D1 both
3 k0 k3 NaN NaN C3 D3 right_only
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator='indicator_column') # custom name for the indicator column
print(res)
"""
key1 key2 A B C D indicator_column
0 k0 k1 A1 B1 C2 D2 both
1 k0 k2 A2 B2 NaN NaN left_only
2 k1 k1 A3 B3 C1 D1 both
3 k0 k3 NaN NaN C3 D3 right_only
"""
# Trailing blog footer (scraping artifact) — kept as comments so the file remains valid Python.
# python进阶 - numpy
# 于 2023-12-07 23:23:46 首次发布