Pandas Data Processing 3: Merging Data (join, merge), Aggregation and Grouping (groupby)

import pandas as pd
import numpy as np

def make_df(cols, ind):
    """Quickly build an example DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)
# An example DataFrame
make_df('ABC', range(3))
    A   B   C
0  A0  B0  C0
1  A1  B1  C1
2  A2  B2  C2
# Concatenating NumPy arrays
x = [[1, 2], [3, 4]] 
np.concatenate([x, x], axis=1)
array([[1, 2, 1, 2],
       [3, 4, 3, 4]])
# Pandas concatenation: pd.concat provides simple concatenation
# Series
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3]) 
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6]) 
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
df1 = make_df('AB', [1, 2]) 
df2 = make_df('AB', [3, 4]) 
print(df1); print(df2); print(pd.concat([df1, df2]))
    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
# By default, DataFrames are concatenated row by row (the default is axis=0). Like np.concatenate(), pd.concat also accepts an axis argument
df3 = make_df('AB', [0, 1]) 
df4 = make_df('CD', [0, 1]) 
print(df3); print(df4); print(pd.concat([df3, df4], axis=0))
print(pd.concat([df3, df4], axis=1))  # concatenate along columns
    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
     A    B    C    D
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
0  NaN  NaN   C0   D0
1  NaN  NaN   C1   D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
# Concatenation preserves the original indices, even if they are duplicated!
x = make_df('AB', [0, 1]) 
y = make_df('AB', [2, 3]) 
y.index = x.index  # give y the same index as x
print(x); print(y); print(pd.concat([x, y]))
    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3
# To check whether the result of pd.concat() contains duplicate index values, set the verify_integrity parameter
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')
# Sometimes the index does not matter and can simply be ignored; this is done with the ignore_index parameter. Setting it to True makes the result use a fresh integer index.
print(x); print(y); print(pd.concat([x, y], ignore_index=True))
    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
# Another way to handle duplicate indices is to use the keys parameter to label each data source, so the result carries a hierarchical (multi-level) index
print(x); print(y); print(pd.concat([x, y], keys=['x', 'y']))
    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3
# Join-like concatenation
# By default, entries for which no data is available are filled with NaN
df5 = make_df('ABC', [1, 2]) 
df6 = make_df('BCD', [3, 4]) 
print(df5); print(df6); print(pd.concat([df5, df6]))
    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4
# By default the columns are combined as a union of the inputs (join='outer'); join='inner' keeps only the intersection of the columns
print(df5); print(df6); 
print(pd.concat([df5, df6], join='inner'))
    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4
# df1.append(df2) gives the same result as pd.concat([df1, df2])
print(df1); print(df2); print(df1.append(df2))
    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
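# Note: DataFrame.append() has since been deprecated (pandas 1.4) and removed (pandas 2.0); on recent versions use pd.concat([df1, df2]) instead.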
#merge
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'], 'group': ['Accounting', 'Engineering', 'Engineering', 'HR']}) 
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'], 'hire_date': [2004, 2008, 2012, 2014]}) 
print(df1); print(df2)
df3 = pd.merge(df1, df2)  # pd.merge() notices that both DataFrames have an 'employee' column and automatically uses it as the join key
df3
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
# merge: specifying the merge key
# The simplest way is to set the on parameter to a column name or to a list of column names
print(df1); print(df2); print(pd.merge(df1, df2, on='employee'))
# This option only works when the two DataFrames share the given column name
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
# The left_on and right_on parameters
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 'salary': [70000, 80000, 120000, 90000]}) 
print(df1); print(df3); 
print(pd.merge(df1, df3, left_on="employee", right_on="name"))
# The result contains a redundant column, which can be dropped with DataFrame's drop() method
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000
  employee        group  salary
0      Bob   Accounting   70000
1     Jake  Engineering   80000
2     Lisa  Engineering  120000
3      Sue           HR   90000
# The left_index and right_index parameters: setting left_index and/or right_index in pd.merge() makes the merge use the index as the key
df1a = df1.set_index('employee') 
df2a = df2.set_index('employee') 
print(df1a); print(df2a)
print(pd.merge(df1a, df2a, left_index=True, right_index=True))
                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014
# join: DataFrame implements a join() method, which merges on the index
print(df1a); print(df2a); print(df1a.join(df2a))
                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_date
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_date
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014
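# The index and column keywords can also be mixed; for instance, joining the index-keyed df1a
# with the column-keyed df3 (a sketch that goes slightly beyond the example above):
print(pd.merge(df1a, df3, left_index=True, right_on='name'))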
# Inner and outer joins
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'], 'food': ['fish', 'beans', 'bread']},  columns=['name', 'food']) 
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],  'drink': ['wine', 'beer']},  columns=['name', 'drink']) 
print(df6); print(df7); print(pd.merge(df6, df7))  # defaults to an inner join
print(pd.merge(df6, df7,how='outer'))

# A left join and a right join keep all the rows of the left and of the right table, respectively (a how='right' sketch follows the output below)
print(pd.merge(df6, df7, how='left'))
    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
     name drink
0    Mary  wine
1  Joseph  beer
   name   food drink
0  Mary  bread  wine
     name   food drink
0   Peter   fish   NaN
1    Paul  beans   NaN
2    Mary  bread  wine
3  Joseph    NaN  beer
    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine
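# how='right' is symmetric: every row of the right table is kept, so here Joseph would
# appear with food set to NaN (a quick sketch):
print(pd.merge(df6, df7, how='right'))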
# Overlapping column names: the suffixes parameter
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 'rank': [1, 2, 3, 4]}) 
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'], 'rank': [3, 1, 4, 2]}) 
print(df8); print(df9); print(pd.merge(df8, df9, on="name"))
print(pd.merge(df8, df9, on="name", suffixes=["_L", "_R"]))
   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2
   name  rank_L  rank_R
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2
# Aggregation and grouping
rng = np.random.RandomState(42) 
ser = pd.Series(rng.rand(5))
# DataFrame aggregation functions operate on each column by default
df = pd.DataFrame({'A': rng.rand(5),  'B': rng.rand(5)})
print(df)
print(df.mean())
print(df.mean(axis=1))  # aggregate within each row instead
print(df.describe())  # print a set of common summary statistics
          A         B
0  0.155995  0.020584
1  0.058084  0.969910
2  0.866176  0.832443
3  0.601115  0.212339
4  0.708073  0.181825
A    0.477888
B    0.443420
dtype: float64
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64
              A         B
count  5.000000  5.000000
mean   0.477888  0.443420
std    0.353125  0.426952
min    0.058084  0.020584
25%    0.155995  0.181825
50%    0.601115  0.212339
75%    0.708073  0.832443
max    0.866176  0.969910
# GroupBy: split, apply, combine
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'], 'data': range(6)}, columns=['key', 'data'])
print(df)
df.groupby('key')  # a DataFrameGroupBy object; computation is deferred ('lazy evaluation') until an aggregation is applied
print(df.groupby('key').sum())
  key  data
0   A     0
1   B     1
2   C     2
3   A     3
4   B     4
5   C     5
     data
key      
A       3
B       5
C       7
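# A GroupBy object also supports column indexing, which returns a grouped Series; a minimal sketch:
print(df.groupby('key')['data'].sum())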
# Iterating over groups
for (key, group) in df.groupby('key'): 
    print('------------')
    print(key)
    print(group)
------------
A
  key  data
0   A     0
3   A     3
------------
B
  key  data
1   B     1
4   B     4
------------
C
  key  data
2   C     2
5   C     5
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
df
  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
# aggregate() supports more complex operations
print(df.groupby('key').aggregate(['min', np.median, max]))
print(df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'}))
    data1            data2           
      min median max   min median max
key                                  
A       0    1.5   3     3    4.0   5
B       1    2.5   4     0    3.5   7
C       2    3.5   5     3    6.0   9
     data1  data2
key              
A        0      5
B        1      7
C        2      9
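# Newer pandas versions (0.25+) also accept "named aggregation" keywords when you want to
# control the output column names; a sketch under that assumption:
print(df.groupby('key').aggregate(data1_min=('data1', 'min'), data2_max=('data2', 'max')))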
# Filtering
def filter_func(x):
    return x['data2'].std() > 4  # keep groups whose data2 standard deviation exceeds 4
print(df); print(df.groupby('key').std()); 
print(df.groupby('key').filter(filter_func))
  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9
# Transformation
df.groupby('key').transform(lambda x: x - x.mean())  # subtract the group mean
   data1  data2
0   -1.5    1.0
1   -1.5   -3.5
2   -1.5   -3.0
3    1.5   -1.0
4    1.5    3.5
5    1.5    3.0
# The apply() method lets you apply an arbitrary function to each group
def norm_by_data2(x):
    # x is the DataFrame of one group
    x['data1'] /= x['data2'].sum()  # normalize data1 by the group's data2 total
    return x
print(df); print(df.groupby('key').apply(norm_by_data2))
  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9
# A list, array, Series, or index can serve as the grouping key, as long as its length matches the DataFrame, for example:
L = [0, 1, 0, 1, 2, 0] 
print(df); print(df.groupby(L).sum())
  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   data1  data2
0      7     17
1      4      3
2      4      7
# A dict or Series can map index values to group names
df2 = df.set_index('key') 
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'} 
print(df2); print(df2.groupby(mapping).sum())
     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
           data1  data2
consonant     12     19
vowel          3      8
# Any Python function: similar to the dict mapping above, you can pass any Python function to groupby; it is applied to the index values and its return values define the groups
# A list of valid keys: any of the preceding key choices can be combined to group on multiple keys, which yields a result with a multi-level index
print(df2)
print(df2.groupby(str.lower).mean())
df2.groupby([str.lower, mapping]).mean()
     data1  data2
key              
A        0      5
B        1      0
C        2      3
A        3      3
B        4      7
C        5      9
   data1  data2
a    1.5    4.0
b    2.5    3.5
c    3.5    6.0
             data1  data2
a vowel        1.5    4.0
b consonant    2.5    3.5
c consonant    3.5    6.0
