merge
#导入包
import pandas as pd
import numpy as np
#定义一个类,对融合数据进行显示
class display(object):
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p >{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return 'n'.join(self.template.format(a, eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return 'nn'.join(a + 'n' + repr(eval(a))
for a in self.args)
1)根据字段来融合
(1)左侧和右侧的相同内容字段名字一致且只有一个相同字段
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
display('df1','df2','pd.merge(df1,df2,on="employee")') #也可以写成pd.merge(df1,df2)
(2)左侧和右侧的相同内容字段名字不一致
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
display('df1','df3','pd.merge(df1,df3,left_on="employee",right_on="name")')
1)设置索引
(1)索引相同
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
df1a=df1.set_index('employee')
df2a=df2.set_index('employee')
display('df1a','df2a','pd.merge(df1a,df2a,left_index=True,right_index=True)')
(2)索引不同
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
df1a=df1.set_index('employee')
df3a=df3.set_index('name')
display('df1a','df3a','pd.merge(df1a,df3a,left_index=True,right_index=True)')
2)根据连接方式
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
(1)inner #默认
display('df6','df7','pd.merge(df6,df7,how="inner")')
(2)outer
display('df6','df7','pd.merge(df6,df7,how="outer")')
(3)left
display('df6','df7','pd.merge(df6,df7,how="left")')
(4)right
display('df6','df7','pd.merge(df6,df7,how="right")')
3)列重叠
(1)未添加suffixes
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
display('df8','df9','pd.merge(df8,df9,on="name")')
(2)添加suffixes
display('df8','df9','pd.merge(df8,df9,on="name",suffixes=["_yuwen","_shuxue"])')
总结:
1.字段
(1)左侧和右侧的相同内容字段名字一致且只有一个相同字段
pd.merge(df1,df2)或pd.merge(df1,df2,on='x')
(2)左侧和右侧的相同内容字段名字不一致
pd.merge(df1,df2,left_on='x',right_on='y')
2.索引
不适用列进行融合,可以选择索引进行融合
pd.merge(df1,df2,left_index=True,right_index=True)
3. 不同的融合方式
pd.merge(df1,df2,how='outer|inner|left|right')
4列重叠
pd.merge(df1,df2,on='',suffixes=['_x','_y','_z'])