dataframe的concatenate 与 merge

import numpy as np
import pandas as pd
import matplotlib as mpl
#concatenate 函数
df1 = pd.DataFrame(np.random.randn(4,3),index=list("abcd"),columns=list("ABC"))
df1
ABC
a0.2834560.6054280.145223
b1.782473-1.0431450.844106
c0.2363972.063387-0.573410
d0.2046410.5619901.709260
df2 = pd.DataFrame(np.random.randn(3,3),index=list("abd"),columns=list("CDE"))
df2
CDE
a0.044766-1.580057-0.512200
b-1.1876900.870686-1.442228
d0.3900940.222911-0.807610
pd.concat(
    objs:Union[Iterable[~FrameOrSeries], Mapping[collections.abc.Hashable, ~FrameOrSeries]],
    axis=0,
    join='outer',
    ignore_index:bool=False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity:bool=False,
    sort:bool=False,
    copy:bool=True,
)
pd.concat([df1,df2],axis=0) #axis=0表示按行堆叠
ABCDE
a0.2834560.6054280.145223NaNNaN
b1.782473-1.0431450.844106NaNNaN
c0.2363972.063387-0.573410NaNNaN
d0.2046410.5619901.709260NaNNaN
aNaNNaN0.044766-1.580057-0.512200
bNaNNaN-1.1876900.870686-1.442228
dNaNNaN0.3900940.222911-0.807610
pd.concat([df1,df2],axis=0,keys=["X","Y"]) #keys真的好用,相当于给dataframe命名;当axis=1时相当于给columns命名
ABCDE
Xa0.2834560.6054280.145223NaNNaN
b1.782473-1.0431450.844106NaNNaN
c0.2363972.063387-0.573410NaNNaN
d0.2046410.5619901.709260NaNNaN
YaNaNNaN0.044766-1.580057-0.512200
bNaNNaN-1.1876900.870686-1.442228
dNaNNaN0.3900940.222911-0.807610
pd.concat([df1,df2],axis=0,ignore_index=True) #index 消除了
ABCDE
00.2834560.6054280.145223NaNNaN
11.782473-1.0431450.844106NaNNaN
20.2363972.063387-0.573410NaNNaN
30.2046410.5619901.709260NaNNaN
4NaNNaN0.044766-1.580057-0.512200
5NaNNaN-1.1876900.870686-1.442228
6NaNNaN0.3900940.222911-0.807610
pd.concat([df1,df2],axis=0,join="inner") #如果concatenate方式为inner,则只取重叠的数据列
C
a0.145223
b0.844106
c-0.573410
d1.709260
a0.044766
b-1.187690
d0.390094
#换个轴试一下
pd.concat([df1,df2],axis=1) #算是左右合并吧,注意默认的join方式是outer
ABCCDE
a0.2834560.6054280.1452230.044766-1.580057-0.512200
b1.782473-1.0431450.844106-1.1876900.870686-1.442228
c0.2363972.063387-0.573410NaNNaNNaN
d0.2046410.5619901.7092600.3900940.222911-0.807610
pd.concat([df1,df2],axis=1,join="inner") #仅把共有的行左右合并了
ABCCDE
a0.2834560.6054280.1452230.044766-1.580057-0.512200
b1.782473-1.0431450.844106-1.1876900.870686-1.442228
d0.2046410.5619901.7092600.3900940.222911-0.807610
#concat与reindex联用
pd.concat([df1,df2.reindex(df1.index)],axis=1)  #reindex先把df1与df2共有的部分提取出来,然后在左右合并
ABCCDE
a0.2834560.6054280.1452230.044766-1.580057-0.512200
b1.782473-1.0431450.844106-1.1876900.870686-1.442228
c0.2363972.063387-0.573410NaNNaNNaN
d0.2046410.5619901.7092600.3900940.222911-0.807610
df1.append(df2)  #append方法其实是concat的实例,axis=0 ,join=outer
ABCDE
a0.2834560.6054280.145223NaNNaN
b1.782473-1.0431450.844106NaNNaN
c0.2363972.063387-0.573410NaNNaN
d0.2046410.5619901.709260NaNNaN
aNaNNaN0.044766-1.580057-0.512200
bNaNNaN-1.1876900.870686-1.442228
dNaNNaN0.3900940.222911-0.807610
#merge()函数
pd.merge(
    left,
    right,
    how="inner",
    on=None,
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)
#其实最重要的问题是搞懂,左右dataframe merge时关键列是一对一,还是一对多或者多对一或者多对多
#上述四种情况下merge函数是如何处理的呢?
left = pd.DataFrame({"key1": ["K0", "K0", "K1", "K2"], "key2": ["K0", "K1", "K0", "K1"],"A": ["A0", "A1", "A2", "A3"],"B": ["B0", "B1", "B2", "B3"]})
left
key1key2AB
0K0K0A0B0
1K0K1A1B1
2K1K0A2B2
3K2K1A3B3
right = pd.DataFrame({"key1": ["K0", "K1", "K1", "K2"],"key2": ["K0", "K0", "K0", "K0"],"C": ["C0", "C1", "C2", "C3"],"D": ["D0", "D1", "D2", "D3"]})
right
key1key2CD
0K0K0C0D0
1K1K0C1D1
2K1K0C2D2
3K2K0C3D3
pd.merge(left,right,on=["key1","key2"]) #how="inner" ,注意这里的的一对多的关系,个人理解how="inner"会把两个dataframe的keys的交际列出来
key1key2ABCD
0K0K0A0B0C0D0
1K1K0A2B2C1D1
2K1K0A2B2C2D2
pd.merge(left,right,on=["key1","key2"],how="outer") #how="outer" ,个人理解how="outer"会把两个dataframe的keys都列出来
key1key2ABCD
0K0K0A0B0C0D0
1K0K1A1B1NaNNaN
2K1K0A2B2C1D1
3K1K0A2B2C2D2
4K2K1A3B3NaNNaN
5K2K0NaNNaNC3D3
pd.merge(left,right,on=["key1","key2"],how="left")#how="left",应该是key以左边的为主,而且一对多的行也会列出来,但是右边多的key则不会列出来
key1key2ABCD
0K0K0A0B0C0D0
1K0K1A1B1NaNNaN
2K1K0A2B2C1D1
3K1K0A2B2C2D2
4K2K1A3B3NaNNaN
pd.merge(left,right,on=["key1","key2"],how="right")#how="right",应该是key以右边的为主,而且一对多的行也会列出来,但是左边多的key则不会列出来
key1key2ABCD
0K0K0A0B0C0D0
1K1K0A2B2C1D1
2K1K0A2B2C2D2
3K2K0NaNNaNC3D3
#Here is another example with duplicate join keys in DataFrames:
left = pd.DataFrame({"A": [1, 2], "B": [2, 2]})
right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]})
result = pd.merge(left, right, on="B", how="outer")
print(left)
print(right)
print(result)
   A  B
0  1  2
1  2  2
   A  B
0  4  2
1  5  2
2  6  2
   A_x  B  A_y
0    1  2    4
1    1  2    5
2    1  2    6
3    2  2    4
4    2  2    5
5    2  2    6
#the merge indicator
df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})
df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})
pd.merge(df1, df2, on="col1", how="outer", indicator=True)
col1col_leftcol_right_merge
00aNaNleft_only
11b2.0both
22NaN2.0right_only
32NaN2.0right_only
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值