数据合并是经常要用的操作。pandas的join和merge 非常好用。
1. pandas 的DataFrame构建数组
2. join:(2,4) join (3,3) = (2,7)
(3,3) join (2,4) = (3,7),左表有而右表没有的行,其右表各列用 NaN 补齐
3. merge:根据指定列上的相同值合并(默认 how="inner" 取交集,how="outer" 取并集)
4.并集交集补集的概念
1. pandas 的DataFrame构建数组
# Sample frames for the join examples.

# df1: 2x4 frame of ones, rows "a"/"b", columns "A".."D".
df1 = pd.DataFrame(
    np.ones((2, 4)),
    index=list("ab"),
    columns=list("ABCD"),
)
# print(df1) would show:
#      A    B    C    D
# a  1.0  1.0  1.0  1.0
# b  1.0  1.0  1.0  1.0

# df2: 3x3 frame of zeros, rows "a".."c", columns "X".."Z".
df2 = pd.DataFrame(
    np.zeros((3, 3)),
    index=["a", "b", "c"],
    columns=["X", "Y", "Z"],
)
# print(df2) would show:
#      X    Y    Z
# a  0.0  0.0  0.0
# b  0.0  0.0  0.0
# c  0.0  0.0  0.0

# df3: the integers 0..8 laid out row by row, default integer index,
# columns "f", "a", "x".
df3 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["f", "a", "x"])
print(df3)
# Output:
#    f  a  x
# 0  0  1  2
# 1  3  4  5
# 2  6  7  8
2. join:(2,4) join (3,3) = (2,7)
(3,3) join (2,4) = (3,7),左表有而右表没有的行,其右表各列用 NaN 补齐
# (2, 4) joined with (3, 3): rows are taken from the calling frame's index
# ("a", "b"), so the result has shape (2, 7).
df3 = df1.join(df2, how="left")
# print(df3) would show:
#      A    B    C    D    X    Y    Z
# a  1.0  1.0  1.0  1.0  0.0  0.0  0.0
# b  1.0  1.0  1.0  1.0  0.0  0.0  0.0

# (3, 3) joined with (2, 4): the calling frame has an extra row "c" with no
# partner in df1, so its A..D cells are filled with NaN; shape is (3, 7).
df4 = df2.join(df1, how="left")
print(df4)
# Output:
#      X    Y    Z    A    B    C    D
# a  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# b  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# c  0.0  0.0  0.0  NaN  NaN  NaN  NaN
3. merge:根据指定列上的相同值合并(默认 how="inner" 取交集,how="outer" 取并集)
df1.merge(df3, on="a"):在 df1 中找列名为 "a" 的列的值,在 df3 中也找列名为 "a" 的列的值,两边的值相同时把整行合并(默认 how="inner",只保留两边都有的值,即交集)。how="outer" 取全部(并集);how="right" 以 df3 的 "a" 值为准;how="left" 以 df1 的 "a" 值为准。对应写法:df1.merge(df3, on="a", how="outer")、df1.merge(df3, on="a", how="right")、df1.merge(df3, on="a", how="left")。
def fun_merge():
    """Walk through ``DataFrame.merge`` on the shared column ``"a"``.

    Builds a 2x4 frame of ones (``df1``) and a 3x3 frame holding 0..8
    (``df3``), changes one key of ``df1`` to 100, and prints the results of
    merging with ``how="left"`` and ``how="right"``. The earlier steps of
    the walkthrough (inner and outer merges) are kept as comments together
    with the output they produced.

    Returns:
        None. The two merged frames are printed to stdout.
    """
    # 1. Build the inputs.
    df1 = pd.DataFrame(np.ones((2, 4)), index=list("AB"), columns=list("abcd"))
    # df1:
    #      a    b    c    d
    # A  1.0  1.0  1.0  1.0
    # B  1.0  1.0  1.0  1.0
    df3 = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=list("fax"))
    # df3:
    #    f  a  x
    # 0  0  1  2
    # 1  3  4  5
    # 2  6  7  8

    # merge matches rows on column values (here column "a"), not on the index.
    #
    # Case 1: the two "a" columns share no value -> the result is empty.
    #   With df3 built from np.zeros((3, 3)) instead, df1.merge(df3, on="a")
    #   gives:
    #     Empty DataFrame
    #     Columns: [a, b, c, d, f, x]
    #     Index: []
    #
    # Case 2: df1 is all ones and df3 holds 0..8, so df3 has exactly one row
    #   with a == 1 (f == 0, x == 2). Both rows of df1 have a == 1, hence two
    #   result rows and 4 + 3 - 1 = 6 columns. df1.merge(df3, on="a") gives:
    #        a    b    c    d  f  x
    #     0  1.0  1.0  1.0  1.0  0  2
    #     1  1.0  1.0  1.0  1.0  0  2
    #
    # Case 3: after df3.loc[1, "a"] = 1 there are two matching rows on each
    #   side, so (2, 4) merge (3, 3) = (4, 6). df1.merge(df3, on="a") gives:
    #        a    b    c    d  f  x
    #     0  1.0  1.0  1.0  1.0  0  2
    #     1  1.0  1.0  1.0  1.0  3  5
    #     2  1.0  1.0  1.0  1.0  0  2
    #     3  1.0  1.0  1.0  1.0  3  5

    # 4. Give row "A" a key that does not occur in df3, so the outer/left/right
    #    variants below produce different results.
    df1.loc["A", "a"] = 100
    # df1 is now:
    #        a    b    c    d
    # A  100.0  1.0  1.0  1.0
    # B    1.0  1.0  1.0  1.0
    #
    # df1.merge(df3, on="a", how="outer") keeps the keys of both frames
    # (row order may differ between pandas versions):
    #        a    b    c    d    f    x
    # 0  100.0  1.0  1.0  1.0  NaN  NaN
    # 1    1.0  1.0  1.0  1.0  0.0  2.0
    # 2    4.0  NaN  NaN  NaN  3.0  5.0
    # 3    7.0  NaN  NaN  NaN  6.0  8.0

    # 5. how="left" keeps the "a" values of df1; how="right" keeps those of df3.
    left_merged = df1.merge(df3, on="a", how="left")
    right_merged = df1.merge(df3, on="a", how="right")
    print(left_merged)
    print(right_merged)
    # Output:
    #        a    b    c    d    f    x
    # 0  100.0  1.0  1.0  1.0  NaN  NaN
    # 1    1.0  1.0  1.0  1.0  0.0  2.0
    #      a    b    c    d  f  x
    # 0  1.0  1.0  1.0  1.0  0  2
    # 1  4.0  NaN  NaN  NaN  3  5
    # 2  7.0  NaN  NaN  NaN  6  8
4.并集交集补集的概念
假设有三个集合:A = {1,2,3,4,5},B = {3,4,5,6,7},C = {1,2,3,4,5,6,7,8,9}。交集:A 交 B 为 {3,4,5},即两个集合共同拥有的那部分元素。并集:A 并 B 并 C 为 {1,2,3,4,5,6,7,8,9},即所有集合中出现过的元素的全体。补集:A 在 C 中的补集为 {6,7,8,9},即集合 C 中不属于 A 的元素。