pandas groupby学习2——聚合

'''
groupby 聚合练习
'''
import pandas as pd
import numpy as np

df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.arange(5),
                   'data2' : np.arange(1,6)})

gp1=df.groupby('key1')
'''
gp1内容:
a
  key1 key2  data1  data2
0    a  one      0      1
1    a  two      1      2
4    a  one      4      5
b
  key1 key2  data1  data2
2    b  one      2      3
3    b  two      3      4
'''
def minus(group):
    return group.max()-group.min()
#print(gp1.data1.agg(minus))
'''
key1
a    4
b    1
'''
content=pd.read_csv('example/tips.csv')
content['tip_pct']=content['tip']/content['total_bill']
grouped=content.groupby(['day', 'smoker'])
grouped_pct=grouped['tip_pct']
#print(grouped_pct.agg('mean'))  #计算grouped_pct(也即grouped的'tip_pct'列)的平均值(mean)
'''
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
'''
#print(grouped_pct.agg([('foo', 'mean'), ('bar', np.std)]))
#也可以传入一个列表,包括两个元组,每个元组第一个参数('foo'和'bar')表示group中的列名,第二个参数('mean'和np.std)是计算方法
'''
                  foo       bar
day  smoker                    
Fri  No      0.151650  0.028123
     Yes     0.174783  0.051293
Sat  No      0.158048  0.039767
     Yes     0.147906  0.061375
Sun  No      0.160113  0.042347
     Yes     0.187250  0.154134
Thur No      0.160298  0.038774
     Yes     0.163863  0.039389
'''
functions = ['count', 'mean', 'max']    #可以应用于全部列的函数列表
result = grouped['tip_pct', 'total_bill'].agg(functions)    #对'tip_pct', 'total_bill'列用functions列表中的函数计算
'''
            tip_pct                     total_bill                  
              count      mean       max      count       mean    max
day  smoker                                                         
Fri  No           4  0.151650  0.187735          4  18.420000  22.75
     Yes         15  0.174783  0.263480         15  16.813333  40.17
Sat  No          45  0.158048  0.291990         45  19.661778  48.33
     Yes         42  0.147906  0.325733         42  21.276667  50.81
Sun  No          57  0.160113  0.252672         57  20.506667  48.17
     Yes         19  0.187250  0.710345         19  24.120000  45.35
Thur No          45  0.160298  0.266312         45  17.113111  41.19
     Yes         17  0.163863  0.241255         17  19.190588  43.11
'''

def top(group,n=5,column='tip_pct'):
    return group.sort_values(by=column,ascending=False)[:n]
top(content,n=6)
'''
     total_bill   tip smoker  day    time  size   tip_pct
172        7.25  5.15    Yes  Sun  Dinner     2  0.710345
178        9.60  4.00    Yes  Sun  Dinner     2  0.416667
67         3.07  1.00    Yes  Sat  Dinner     1  0.325733
232       11.61  3.39     No  Sat  Dinner     2  0.291990
183       23.17  6.50    Yes  Sun  Dinner     4  0.280535
109       14.31  4.00    Yes  Sat  Dinner     2  0.279525
'''

smkgroup=content.groupby('smoker')
smkgroup.apply(top) #按照smoker类型分组,并求出各分组中tip_pct排名前五的项,apply(fuc)实现对各分组都应用定义的函数fuc
'''
            total_bill   tip smoker   day    time  size   tip_pct
smoker                                                           
No     232       11.61  3.39     No   Sat  Dinner     2  0.291990
       149        7.51  2.00     No  Thur   Lunch     2  0.266312
       51        10.29  2.60     No   Sun  Dinner     2  0.252672
       185       20.69  5.00     No   Sun  Dinner     5  0.241663
       88        24.71  5.85     No  Thur   Lunch     2  0.236746
Yes    172        7.25  5.15    Yes   Sun  Dinner     2  0.710345
       178        9.60  4.00    Yes   Sun  Dinner     2  0.416667
       67         3.07  1.00    Yes   Sat  Dinner     1  0.325733
       183       23.17  6.50    Yes   Sun  Dinner     4  0.280535
       109       14.31  4.00    Yes   Sat  Dinner     2  0.279525
'''
smkgroup.apply(top,n=3, column='total_bill')  #可以在函数名后加可以传递给函数的参数
'''
            total_bill    tip smoker  day    time  size   tip_pct
smoker                                                           
No     212       48.33   9.00     No  Sat  Dinner     4  0.186220
       59        48.27   6.73     No  Sat  Dinner     4  0.139424
       156       48.17   5.00     No  Sun  Dinner     6  0.103799
Yes    170       50.81  10.00    Yes  Sat  Dinner     3  0.196812
       182       45.35   3.50    Yes  Sun  Dinner     3  0.077178
       102       44.30   2.50    Yes  Sat  Dinner     3  0.056433
'''

frame=pd.DataFrame({'data1':np.arange(10),
                    'data2':np.arange(5,15)})
quartiles=pd.cut(frame.data1,3) #分成三等分进行分组,显示各个元素处在哪个分组
'''
0    (-0.009, 3.0]
1    (-0.009, 3.0]
2    (-0.009, 3.0]
3    (-0.009, 3.0]
4       (3.0, 6.0]
5       (3.0, 6.0]
6       (3.0, 6.0]
7       (6.0, 9.0]
8       (6.0, 9.0]
9       (6.0, 9.0]
'''

s=frame['data1']
s[::2]=np.nan
'''
s:
0    NaN
1    1.0
2    NaN
3    3.0
4    NaN
5    5.0
6    NaN
7    7.0
8    NaN
9    9.0
'''
s1=s.fillna(s.mean())   #用s.mean(s中除去空值的平均值)填充空值
'''
0    5.0
1    1.0
2    5.0
3    3.0
4    5.0
5    5.0
6    5.0
7    7.0
8    5.0
9    9.0
'''

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data=pd.Series(np.arange(8),index=states)
data[::2]=np.nan
'''
Ohio          NaN
New York      1.0
Vermont       NaN
Florida       3.0
Oregon        NaN
Nevada        5.0
California    NaN
Idaho         7.0
'''
gp=data.groupby(group_key)  #用列表group_key作为groupby分类参数,可以将列表中相同项对应的数据源分为一组
'''
East
Ohio        NaN
New York    1.0
Vermont     NaN
Florida     3.0
dtype: float64
West
Oregon        NaN
Nevada        5.0
California    NaN
Idaho         7.0
'''
gp.mean()
'''
East    2.0
West    6.0
'''
fill_mean=lambda g:g.fillna(g.mean())
gp.apply(fill_mean)
'''
Ohio          2.0
New York      1.0
Vermont       2.0
Florida       3.0
Oregon        6.0
Nevada        5.0
California    6.0
Idaho         7.0
'''
fill_values = {'East': 0.5, 'West': -1}
fill=lambda g:g.fillna(fill_values[g.name])    #groupby后各分组有一个默认自带的name参数,表示各分组名
gp.apply(fill)
'''
Ohio          0.5
New York      1.0
Vermont       0.5
Florida       3.0
Oregon       -1.0
Nevada        5.0
California   -1.0
Idaho         7.0
'''

df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.arange(8),
                   'weights':[0.1]*4+[0.3]*4})
'''
  category  data  weights
0        a     0      0.1
1        a     1      0.1
2        a     2      0.1
3        a     3      0.1
4        b     4      0.3
5        b     5      0.3
6        b     6      0.3
7        b     7      0.3
'''
group=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])    #求group各分组的加权平均数
print(group.apply(get_wavg))
'''
category
a    1.5
b    5.5
'''

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值