DataFrame的级联&合并操作
pd.concat pd.append
pandas 使用pd.concat函数与np.concatenate 函数相似,只是多了一些参数
objs
axis=0
keys
join = 'outer' / 'inner':表示的是级联的方式,outer会将所有的项进行级联(忽略匹配与不匹配),而inner只会将匹配的项级联到一起,不匹配的不级联
ingnore_index = False
import numpy as np
import pandas as pd
from pandas import DataFrame
df1 = DataFrame(data = np.random.randint(0,100,size=(5,3)),columns = ['A','B','C'])
df2 = DataFrame(data = np.random.randint(0,100,size=(5,3)),columns = ['A','D','C'])
df1
| A | B | C |
---|
0 | 21 | 88 | 16 |
---|
1 | 17 | 43 | 51 |
---|
2 | 59 | 50 | 4 |
---|
3 | 25 | 77 | 19 |
---|
4 | 5 | 96 | 96 |
---|
df2
| A | D | C |
---|
0 | 9 | 73 | 90 |
---|
1 | 89 | 20 | 73 |
---|
2 | 27 | 49 | 67 |
---|
3 | 40 | 10 | 56 |
---|
4 | 64 | 38 | 8 |
---|
pd.concat((df1,df2),axis=1)
| A | B | C | A | D | C |
---|
0 | 21 | 88 | 16 | 9 | 73 | 90 |
---|
1 | 17 | 43 | 51 | 89 | 20 | 73 |
---|
2 | 59 | 50 | 4 | 27 | 49 | 67 |
---|
3 | 25 | 77 | 19 | 40 | 10 | 56 |
---|
4 | 5 | 96 | 96 | 64 | 38 | 8 |
---|
pd.concat((df1,df2),axis=0,join = 'inner')
| A | C |
---|
0 | 21 | 16 |
---|
1 | 17 | 51 |
---|
2 | 59 | 4 |
---|
3 | 25 | 19 |
---|
4 | 5 | 96 |
---|
0 | 9 | 90 |
---|
1 | 89 | 73 |
---|
2 | 27 | 67 |
---|
3 | 40 | 56 |
---|
4 | 64 | 8 |
---|
"""
merge与concat的区别在于,merge需要依据某个共同列来进行合并
使用pd.merge()合并时,会自动根据两者相同的column名称的那一列,作为key来进行合并
"""
df1 = DataFrame({'employee':['Bob','Jake','Lisa'],'group':['Accounting','Engineering','Engineering'],})
df1
| employee | group |
---|
0 | Bob | Accounting |
---|
1 | Jake | Engineering |
---|
2 | Lisa | Engineering |
---|
df2 = DataFrame({'employee':['Lisa','Bob','Jake'],'hire_date':[2004,2008,2012]})
df2
| employee | hire_date |
---|
0 | Lisa | 2004 |
---|
1 | Bob | 2008 |
---|
2 | Jake | 2012 |
---|
pd.merge(df1,df2,on='employee')
| employee | group | hire_date |
---|
0 | Bob | Accounting | 2008 |
---|
1 | Jake | Engineering | 2012 |
---|
2 | Lisa | Engineering | 2004 |
---|
df3 = DataFrame({'employee':['Jake','Lisa'],
'group':['Accounting','Engineering'],
'hire_date':[2004,2016]})
df3
| employee | group | hire_date |
---|
0 | Jake | Accounting | 2004 |
---|
1 | Lisa | Engineering | 2016 |
---|
df4 = DataFrame({'group':['Accounting','Engineering','Engineering'],
'supervisor':['Carly','Guido','Steve']})
df4
| group | supervisor |
---|
0 | Accounting | Carly |
---|
1 | Engineering | Guido |
---|
2 | Engineering | Steve |
---|
pd.merge(df3,df4)
| employee | group | hire_date | supervisor |
---|
0 | Jake | Accounting | 2004 | Carly |
---|
1 | Lisa | Engineering | 2016 | Guido |
---|
2 | Lisa | Engineering | 2016 | Steve |
---|
df5 = DataFrame({'group':['Accounting','Engineering','HR'],
'supervisor':['Carly','Guido','Steve']})
df5
| group | supervisor |
---|
0 | Accounting | Carly |
---|
1 | Engineering | Guido |
---|
2 | HR | Steve |
---|
pd.merge(df1,df5)
| employee | group | supervisor |
---|
0 | Bob | Accounting | Carly |
---|
1 | Jake | Engineering | Guido |
---|
2 | Lisa | Engineering | Guido |
---|
pd.merge(df1,df5,how='outer')
| employee | group | supervisor |
---|
0 | Bob | Accounting | Carly |
---|
1 | Jake | Engineering | Guido |
---|
2 | Lisa | Engineering | Guido |
---|
3 | NaN | HR | Steve |
---|
pd.merge(df1,df5,how='left')
| employee | group | supervisor |
---|
0 | Bob | Accounting | Carly |
---|
1 | Jake | Engineering | Guido |
---|
2 | Lisa | Engineering | Guido |
---|
df2 = DataFrame({'employee':['Jack','Bob','Jack'],
'group':['Accounting','sell','CEO'],
'hire_date':[2003,2007,2012]
})
pd.merge(df1,df2)
| employee | group | hire_date |
---|
0 | Jack | Accounting | 2003 |
---|
pd.merge(df1,df2,on='group')
| employee_x | group | hire_date_x | employee_y | hire_date_y |
---|
0 | Jack | Accounting | 2003 | Jack | 2003 |
---|
df5 = DataFrame({'name':['Lisa','Bob','Bill'],
'hire_date':[1998,2016,2007]})
df5
| name | hire_date |
---|
0 | Lisa | 1998 |
---|
1 | Bob | 2016 |
---|
2 | Bill | 2007 |
---|
pd.merge(df1,df5,left_on='employee',right_on='name')
| employee | group | hire_date_x | name | hire_date_y |
---|
0 | Bob | Engineering | 2007 | Bob | 2016 |
---|