Python Numpy & Pandas

最新推荐文章于 2024-03-14 09:13:15 发布

z2014z

最新推荐文章于 2024-03-14 09:13:15 发布

阅读量1.2k

点赞数

分类专栏： Python 文章标签： python 深度学习

本文链接：https://blog.csdn.net/z2014z/article/details/118006506

版权

Python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

Python Numpy & Pandas

运行下面的示例前，需要先安装 NumPy 和 Pandas（例如：pip install numpy pandas）。

Numpy

基础创建

import numpy as np
# Build a 2x3 integer array from nested lists, choosing the element type via dtype.
# Common dtypes: int16/int32/int64, float16/float32/float64, complex64/complex128.
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24, so an
# explicit fixed-width integer type is used here instead.
array = np.array([[1, 2, 3],
                  [4, 5, 6]], dtype=np.int64)

# A 3x4 matrix in which every element is 0.
array = np.zeros(shape=(3, 4))
print(array)
# output:
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]

# A 3x4 matrix in which every element is 1.
array = np.ones(shape=(3, 4))
print(array)
# output:
# [[1. 1. 1. 1.]
#  [1. 1. 1. 1.]
#  [1. 1. 1. 1.]]

# A 2x3 matrix whose memory is NOT initialised: the printed values are
# whatever happened to be in memory, so they differ from run to run.
array = np.empty(shape=(2, 3))
print(array)
# example output:
# [[6.23042070e-307 1.89146896e-307 1.37961302e-306]
#  [1.05699242e-307 1.11261638e-306 1.24610927e-306]]


# Nine evenly spaced samples from 0 to 2, both end points included.
array = np.linspace(start=0, stop=2, num=9)
print(array)
# output:
# [0.   0.25 0.5  0.75 1.   1.25 1.5  1.75 2.  ]

# The integers 0..14 arranged as 3 rows by 5 columns.
array = np.arange(15).reshape((3, 5))
print(array)
# output:
# [[ 0  1  2  3  4]
#  [ 5  6  7  8  9]
#  [10 11 12 13 14]]

# Basic attributes of the array: rank, shape, element count, element type.
print('number of dim: ', array.ndim)
print('shape: ', array.shape)
print('size: ', array.size)
print('type:', array.dtype)
# output (the integer width shown on the last line depends on the platform):
# number of dim:  2
# shape:  (3, 5)
# size:  15
# type: int32

基础运算

import numpy as np

a = np.array([20, 30, 40, 50])
b = np.arange(4)
print(b)
# output: [0 1 2 3]

# Subtract element by element (same positions in a and b).
c = a - b
print(c)
# output: [20 29 38 47]

# Square every element of b.
print(np.square(b))
# output: [0 1 4 9]

# Apply sin to every element of a, then scale by 10.
print(np.sin(a) * 10)
# output: [ 9.12945251 -9.88031624  7.4511316  -2.62374854]

# Compare every element of a with 35; the result is a boolean array.
print(np.less(a, 35))
# output: [ True  True False False]

矩阵运算

import numpy as np
A = np.array([[1, 1],
              [0, 1]])
B = np.array([[2, 0],
              [3, 4]])

# Element-wise product: each entry is multiplied by the entry at the same position.
print(np.multiply(A, B))
# output:
# [[2 0]
#  [0 4]]

# Matrix product (equivalent to the @ operator).
print(np.matmul(A, B))
# output:
# [[5 4]
#  [3 4]]

# Matrix product again, this time spelled with dot.
print(np.dot(A, B))
# output:
# [[5 4]
#  [3 4]]

# Sum, minimum and maximum of a matrix.
# The data is random, so the numbers below are one example run only.
a = np.random.random((2, 3))
print(a)
# example output:
# [[0.70284491 0.67667991 0.60919611]
#  [0.65004259 0.98493693 0.23953326]]

# Reductions over the whole matrix.
print(np.sum(a))
print(np.min(a))
print(np.max(a))
# example output:
# 3.8632337099338896
# 0.23953325605165765
# 0.9849369271091678

# axis=1 reduces within each row (one result per row);
# axis=0 reduces within each column (one result per column).
print(np.sum(a, axis=1))
print(np.min(a, axis=0))
print(np.max(a, axis=1))
# example output:
# [1.98872094 1.87451277]
# [0.65004259 0.67667991 0.23953326]
# [0.70284491 0.98493693]

索引、均值

import numpy as np

a = np.arange(2, 14).reshape(3, 4)

# Flat (row-major) index of the smallest and of the largest element.
print(a.argmin())
print(a.argmax())
# output:
# 0
# 11

# Mean of all elements, computed two ways.
print(np.mean(a))
print(np.average(a))
# output:
# 7.5
# 7.5

# Median of all elements.
print(np.median(a))
# output: 7.5

# Running (cumulative) sum over the flattened array.
print(a.cumsum())
# output:
# [ 2  5  9 14 20 27 35 44 54 65 77 90]

# Difference between neighbouring elements within each row.
print(np.diff(a))
# output:
# [[1 1 1]
#  [1 1 1]
#  [1 1 1]]

# Positions of the non-zero elements: an array of row indices followed by
# an array of column indices (a dtype suffix may be printed on some platforms).
print(a.nonzero())
# output:
# (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

# Sort each row in ascending order (a is already sorted, so it prints unchanged).
print(np.sort(a))
# output:
# [[ 2  3  4  5]
#  [ 6  7  8  9]
#  [10 11 12 13]]

# Transpose, spelled two ways; both print the same matrix.
print(a.transpose())
print(a.T)
# output (printed twice):
# [[ 2  6 10]
#  [ 3  7 11]
#  [ 4  8 12]
#  [ 5  9 13]]

# Matrix product of the transpose with the original.
print(a.T @ a)
# output:
# [[140 158 176 194]
#  [158 179 200 221]
#  [176 200 224 248]
#  [194 221 248 275]]

# clip(min, max): values below min become min, values above max become max.
print(a.clip(5, 9))
# output:
# [[5 5 5 5]
#  [6 7 8 9]
#  [9 9 9 9]]

矩阵合并

import numpy as np

a = np.array([1] * 3)
b = np.array([2] * 3)

# Vertical stack: the two 1-D arrays become the rows of a 2-D array.
c = np.vstack([a, b])
print(c)
print(a.shape, c.shape)
# output:
# [[1 1 1]
#  [2 2 2]]
# (3,) (2, 3)

# Horizontal stack: the two 1-D arrays are joined end to end.
d = np.hstack([a, b])
print(d)
print(a.shape, d.shape)
# output:
# [1 1 1 2 2 2]
# (3,) (6,)

# Indexing with np.newaxis adds an axis: each (3,) array becomes a (3, 1) column.
a = np.array([1] * 3)[:, np.newaxis]
b = np.array([2] * 3)[:, np.newaxis]

print(a)
# output:
# [[1]
#  [1]
#  [1]]

# concatenate joins any number of arrays; axis selects the direction.
# axis=0 stacks the columns on top of each other, giving shape (12, 1).
e = np.concatenate([a, b, b, a], axis=0)
print(e)
# output (one single-element row per line):
# [[1]
#  [1]
#  [1]
#  [2]
#  [2]
#  [2]
#  [2]
#  [2]
#  [2]
#  [1]
#  [1]
#  [1]]

# axis=1 places the columns side by side, giving shape (3, 4).
e = np.concatenate([a, b, b, a], axis=1)
print(e)
# output:
# [[1 2 2 1]
#  [1 2 2 1]
#  [1 2 2 1]]

Pandas

基本操作

import numpy as np
import pandas as pd

# A Series is a 1-D labelled array; the NaN entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)
# output:
# 0    1.0
# 1    3.0
# 2    5.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64

# Six consecutive days starting on 2021-01-01.
dates = pd.date_range(start="20210101", periods=6)
print(dates)
# output:
# DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
#                '2021-01-05', '2021-01-06'],
#               dtype='datetime64[ns]', freq='D')

# A 6x4 DataFrame of random values: dates label the rows, A-D the columns.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
print(df)
# example output (the values are random):
#                    A         B         C         D
# 2021-01-01 -0.817100  0.113590  0.401565 -1.007716
# 2021-01-02  1.241367  0.816412 -0.460833  0.275248
# 2021-01-03  1.780875 -1.638779  0.895724  1.522088
# 2021-01-04  0.872075 -0.136323 -0.828276 -1.705793
# 2021-01-05  0.939255 -0.459394 -0.331591  0.107772
# 2021-01-06  0.187501 -1.024592 -1.118402 -2.061401

# Without explicit labels, rows and columns are numbered from 0.
df2 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df2)
# output:
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Build a DataFrame from a dict: each key is a column, and scalar values
# are repeated for every row.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    }
)
print(df2)
# output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

# Data type of every column.
print(df2.dtypes)
# output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Row labels.
print(df2.index)
# output on pandas < 2.0 (newer versions print `Index([0, 1, 2, 3], dtype='int64')`):
# Int64Index([0, 1, 2, 3], dtype='int64')

# Column labels.
print(df2.columns)
# output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# The underlying values as a NumPy array.
print(df2.values)
# output:
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

# Summary statistics; only the numeric columns (A, C, D) are reported.
print(df2.describe())
# output:
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# Transpose rows and columns (the original note says this suits numeric data).
print(df2.transpose())
# output:
#                      0  ...                    3
# A                    1  ...                    1
# B  2013-01-02 00:00:00  ...  2013-01-02 00:00:00
# C                    1  ...                    1
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo
#
# [6 rows x 4 columns]

# Sorting.
# By column label, descending.
print(df2.sort_index(axis="columns", ascending=False))
# output:
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2013-01-02  1.0
# 1  foo  train  3  1.0 2013-01-02  1.0
# 2  foo   test  3  1.0 2013-01-02  1.0
# 3  foo  train  3  1.0 2013-01-02  1.0

# By row label, descending.
print(df2.sort_index(axis="index", ascending=False))
# output:
#      A          B    C  D      E    F
# 3  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 0  1.0 2013-01-02  1.0  3   test  foo

# By the values of column E.
print(df2.sort_values(by="E"))
# output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

选择数据

import numpy as np
import pandas as pd


# Sample data: 0..23 as 6 rows x 4 columns, rows labelled by date.
# Integer dtypes in the outputs below print as int32 or int64 depending on the platform.
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
# output:
#              A   B   C   D
# 2021-01-01   0   1   2   3
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23

# Select one column, two equivalent ways (both print the same Series).
print(df['A'])
print(df.A)
# output (printed twice):
# 2021-01-01     0
# 2021-01-02     4
# 2021-01-03     8
# 2021-01-04    12
# 2021-01-05    16
# 2021-01-06    20
# Freq: D, Name: A, dtype: int32

# Row slicing: by position (end excluded) and by label (end included).
print(df[0:3])
print(df['20210102':'20210104'])
# output:
#             A  B   C   D
# 2021-01-01  0  1   2   3
# 2021-01-02  4  5   6   7
# 2021-01-03  8  9  10  11
#              A   B   C   D
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15

# Select by label: loc
print(df.loc['20210102'])
print(df.loc[:, ['A', 'B']])
print(df.loc['20210102', ['A', 'B']])
# output of the three prints, in order:
# A    4
# B    5
# C    6
# D    7
# Name: 2021-01-02 00:00:00, dtype: int32
#
#              A   B
# 2021-01-01   0   1
# 2021-01-02   4   5
# 2021-01-03   8   9
# 2021-01-04  12  13
# 2021-01-05  16  17
# 2021-01-06  20  21
#
# A    4
# B    5
# Name: 2021-01-02 00:00:00, dtype: int32

# Select by position: iloc
print(df.iloc[3])
# output:
# A    12
# B    13
# C    14
# D    15
# Name: 2021-01-04 00:00:00, dtype: int32
print(df.iloc[3, 1])
# output: 13
print(df.iloc[3:5, 0:2])
# output:
#              A   B
# 2021-01-04  12  13
# 2021-01-05  16  17
print(df.iloc[[1, 2, 4], [0, 2]])
# output:
#              A   C
# 2021-01-02   4   6
# 2021-01-03   8  10
# 2021-01-05  16  18

# Mixed selection: rows by position, columns by label.
# DataFrame.ix did this but was deprecated in pandas 0.20 and removed in 1.0;
# turn the row positions into labels with df.index[...] and use loc instead.
print(df.loc[df.index[:3], ['A', 'C']])
# output:
#             A   C
# 2021-01-01  0   2
# 2021-01-02  4   6
# 2021-01-03  8  10

# Boolean indexing: keep only the rows where column A is greater than 0.
print(df[df.A > 0])
# output:
#              A   B   C   D
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23

赋值

import numpy as np
import pandas as pd


# Sample data: 0..23 as 6 rows x 4 columns, rows labelled by date.
dates = pd.date_range("20210101", periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list("ABCD"))
print(df)
# output:
#              A   B   C   D
# 2021-01-01   0   1   2   3
# 2021-01-02   4   5   6   7
# 2021-01-03   8   9  10  11
# 2021-01-04  12  13  14  15
# 2021-01-05  16  17  18  19
# 2021-01-06  20  21  22  23

# Assign one cell by position (row 2, column 2).
df.iloc[2, 2] = 1111
# Assign one cell by label.
df.loc['2021-01-03', 'D'] = 2222
# Conditional assignment: set A to 0 wherever A > 0.
# A single .loc call is used instead of the chained form `df.A[df.A > 0] = 0`,
# which may write to a temporary copy (SettingWithCopyWarning) and has no
# effect on df when copy-on-write is enabled.
df.loc[df.A > 0, 'A'] = 0
# Add a new column filled with NaN.
df['F'] = np.nan
# Add a new column from a Series; values are aligned on the date index.
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20210101', periods=6))
print(df)
# output:
#             A   B     C     D   F  G
# 2021-01-01  0   1     2     3 NaN  1
# 2021-01-02  0   5     6     7 NaN  2
# 2021-01-03  0   9  1111  2222 NaN  3
# 2021-01-04  0  13    14    15 NaN  4
# 2021-01-05  0  17    18    19 NaN  5
# 2021-01-06  0  21    22    23 NaN  6

处理丢失数据

import pandas as pd
import numpy as np

# Sample data: 0..23 as 6 rows x 4 columns, rows labelled by date.
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# NaN is a float, so the columns that will hold it are cast to float first.
# Writing NaN straight into an integer column relies on silent upcasting,
# which recent pandas versions deprecate.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
# output:
#              A     B     C   D
# 2021-01-01   0   NaN   2.0   3
# 2021-01-02   4   5.0   NaN   7
# 2021-01-03   8   9.0  10.0  11
# 2021-01-04  12  13.0  14.0  15
# 2021-01-05  16  17.0  18.0  19
# 2021-01-06  20  21.0  22.0  23

# Drop rows: axis=0 works on rows; how='any' drops a row if it has any NaN
# (how='all' would require every value in the row to be NaN).
print(df.dropna(axis=0, how='any'))
# output:
#              A     B     C   D
# 2021-01-03   8   9.0  10.0  11
# 2021-01-04  12  13.0  14.0  15
# 2021-01-05  16  17.0  18.0  19
# 2021-01-06  20  21.0  22.0  23

# Drop columns: axis=1 works on columns; a column with any NaN is dropped.
print(df.dropna(axis=1, how='any'))
# output:
#              A   D
# 2021-01-01   0   3
# 2021-01-02   4   7
# 2021-01-03   8  11
# 2021-01-04  12  15
# 2021-01-05  16  19
# 2021-01-06  20  23

# Replace every NaN with 0.
print(df.fillna(value=0))
# output:
#              A     B     C   D
# 2021-01-01   0   0.0   2.0   3
# 2021-01-02   4   5.0   0.0   7
# 2021-01-03   8   9.0  10.0  11
# 2021-01-04  12  13.0  14.0  15
# 2021-01-05  16  17.0  18.0  19
# 2021-01-06  20  21.0  22.0  23

# Boolean mask marking which cells are missing.
print(pd.isnull(df))
# output:
#                 A      B      C      D
# 2021-01-01  False   True  False  False
# 2021-01-02  False  False   True  False
# 2021-01-03  False  False  False  False
# 2021-01-04  False  False  False  False
# 2021-01-05  False  False  False  False
# 2021-01-06  False  False  False  False

# For large tables the mask is too big to read; collapse it to one flag.
# True means the table contains at least one missing value.
print(df.isnull().values.any())
# output: True

导入导出数据

支持的数据格式（以下为常用的几种）:

CSV 推荐使用

HDF5

Excel

Gotchas

import pandas as pd

# Load a table from a CSV file in the current working directory.
data = pd.read_csv(filepath_or_buffer='student.csv')
print(data)

# Write the same table back out in pickle format.
data.to_pickle(path='student.pickle')

数据合并

concat

import pandas as pd
import numpy as np

# Concatenation with ignore_index.
# Three 3x4 frames with the same columns, filled with 0.0, 1.0 and 2.0.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
# df1:
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
#
# df2:
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
#
# df3:
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# Stack the frames row-wise; ignore_index=True renumbers the rows 0..8
# instead of repeating each frame's own 0..2 labels.
res = pd.concat(objs=[df1, df2, df3], axis=0, ignore_index=True)
# res:
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# The join argument of concat: 'outer' or 'inner'.
# The two frames overlap only partly, in both columns (b, c, d) and row labels (2, 3).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
# output:
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0

print(df2)
# output:
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# join='outer' keeps every column; cells a frame does not have become NaN.
res = pd.concat([df1, df2], axis=0, join='outer')
print(res)
# output:
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

# join='inner' keeps only the columns both frames share.
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
# output:
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

# Side-by-side concatenation restricted to df1's row labels; rows that df2
# lacks are filled with NaN. The join_axes argument that used to do this was
# removed in pandas 1.0, so the combined frame is reindexed to df1.index.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
# output:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# Appending rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to add rows and is used throughout.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# Append one frame and renumber the rows 0..5.
res = pd.concat([df1, df2], ignore_index=True)
# Append several frames at once; the original row labels are kept and
# columns a frame does not have are filled with NaN.
res = pd.concat([df1, df2, df3])
print(res)
# output:
#      a    b    c    d    e
# 0  0.0  0.0  0.0  0.0  NaN
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 0  1.0  1.0  1.0  1.0  NaN
# 1  1.0  1.0  1.0  1.0  NaN
# 2  1.0  1.0  1.0  1.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

# Append a single Series as one new row: to_frame().T turns it into a
# one-row frame whose columns are the Series' index labels.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
# output:
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0

merge

import pandas as pd

# Merge two DataFrames on one or more key columns (similar to a database join).
# Simple example: both frames share a single 'key' column.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
# output:
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
print(right)
# output:
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3

# on='key' matches rows whose 'key' values are equal.
res = left.merge(right, on='key')
print(res)
# output:
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3

# Merging on two key columns at once.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
# output:
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3
print(right)
# output:
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3

# how='inner' (the default) keeps only rows whose (key1, key2) pair
# appears in both frames.
res = left.merge(right, on=['key1', 'key2'], how='inner')
print(res)
# output:
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# how is one of 'left', 'right', 'outer', 'inner'.
# 'outer' keeps every row from both frames; missing values become NaN.
# (The row order of an outer merge may differ between pandas versions.)
res = left.merge(right, on=['key1', 'key2'], how='outer')
print(res)
# output:
#   key1 key2    A    B    C    D
# 0   K0   K0   A0   B0   C0   D0
# 1   K0   K1   A1   B1  NaN  NaN
# 2   K1   K0   A2   B2   C1   D1
# 3   K1   K0   A2   B2   C2   D2
# 4   K2   K1   A3   B3  NaN  NaN
# 5   K2   K0  NaN  NaN   C3   D3

# 'left' keeps every key pair found in the left frame.
res = left.merge(right, on=['key1', 'key2'], how='left')
print(res)
# output:
#   key1 key2   A   B    C    D
# 0   K0   K0  A0  B0   C0   D0
# 1   K0   K1  A1  B1  NaN  NaN
# 2   K1   K0  A2  B2   C1   D1
# 3   K1   K0  A2  B2   C2   D2
# 4   K2   K1  A3  B3  NaN  NaN

# 'right' keeps every key pair found in the right frame.
res = left.merge(right, on=['key1', 'key2'], how='right')
print(res)
# output:
#   key1 key2    A    B   C   D
# 0   K0   K0   A0   B0  C0  D0
# 1   K1   K0   A2   B2  C1  D1
# 2   K1   K0   A2   B2  C2  D2
# 3   K2   K0  NaN  NaN  C3  D3

# indicator: add a column telling which frame(s) each result row came from.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
# output:
#    col1 col_left
# 0     0        a
# 1     1        b
print(df2)
# output:
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# indicator=True names the extra column '_merge'.
res = df1.merge(df2, on='col1', how='outer', indicator=True)
print(res)
# output:
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Passing a string instead gives the indicator column that name.
res = df1.merge(df2, on='col1', how='outer', indicator='indicator_column')
print(res)
# output:
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only

# Merging on the row labels (index) instead of on a column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
# output:
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
print(right)
# output:
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# left_index / right_index: use each frame's index as its join key.
# 'outer' keeps every label from either frame.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(res)
# output:
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# 'inner' keeps only the labels present in both frames.
res = left.merge(right, how='inner', left_index=True, right_index=True)
print(res)
# output:
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2


# Overlapping column names: both frames have an 'age' column, so the merge
# appends a suffix to each side to keep them apart.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
print(boys)
# output:
#     k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(girls)
# output:
#     k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6

# Inner merge: only k values found in both frames survive.
res = boys.merge(girls, on='k', how='inner', suffixes=('_boy', '_girl'))
print(res)
# output:
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5

# Outer merge: every k value is kept; missing ages become NaN, which
# turns the age columns into floats.
res = boys.merge(girls, on='k', how='outer', suffixes=('_boy', '_girl'))
print(res)
# output:
#     k  age_boy  age_girl
# 0  K0      1.0       4.0
# 1  K0      1.0       5.0
# 2  K1      2.0       NaN
# 3  K2      3.0       NaN
# 4  K3      NaN       6.0

绘图

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Plotting with pandas (drawn through matplotlib).

# Series: 1000 random steps, accumulated into a running sum, drawn as a line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()

# DataFrame: four columns of random steps, each accumulated into a running sum.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
).cumsum()
# Available plot methods include:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# Scatter B against A, then draw C against A on the same axes by passing ax.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)

# Display the figures.
plt.show()