'''
groupby 聚合练习
'''
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.arange(5),
'data2' : np.arange(1,6)})
gp1=df.groupby('key1')
'''
gp1内容:
a
key1 key2 data1 data2
0 a one 0 1
1 a two 1 2
4 a one 4 5
b
key1 key2 data1 data2
2 b one 2 3
3 b two 3 4
'''
def minus(group):
return group.max()-group.min()
#print(gp1.data1.agg(minus))
'''
key1
a 4
b 1
'''
content=pd.read_csv('example/tips.csv')
content['tip_pct']=content['tip']/content['total_bill']
grouped=content.groupby(['day', 'smoker'])
grouped_pct=grouped['tip_pct']
#print(grouped_pct.agg('mean')) #计算grouped_pct(也即grouped的'tip_pct'列)的平均值(mean)
'''
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
'''
#print(grouped_pct.agg([('foo', 'mean'), ('bar', np.std)]))
#也可以传入一个列表,包括两个元组,每个元组第一个参数('foo'和'bar')表示group中的列名,第二个参数('mean'和np.std)是计算方法
'''
foo bar
day smoker
Fri No 0.151650 0.028123
Yes 0.174783 0.051293
Sat No 0.158048 0.039767
Yes 0.147906 0.061375
Sun No 0.160113 0.042347
Yes 0.187250 0.154134
Thur No 0.160298 0.038774
Yes 0.163863 0.039389
'''
functions = ['count', 'mean', 'max'] #可以应用于全部列的函数列表
result = grouped['tip_pct', 'total_bill'].agg(functions) #对'tip_pct', 'total_bill'列用functions列表中的函数计算
'''
tip_pct total_bill
count mean max count mean max
day smoker
Fri No 4 0.151650 0.187735 4 18.420000 22.75
Yes 15 0.174783 0.263480 15 16.813333 40.17
Sat No 45 0.158048 0.291990 45 19.661778 48.33
Yes 42 0.147906 0.325733 42 21.276667 50.81
Sun No 57 0.160113 0.252672 57 20.506667 48.17
Yes 19 0.187250 0.710345 19 24.120000 45.35
Thur No 45 0.160298 0.266312 45 17.113111 41.19
Yes 17 0.163863 0.241255 17 19.190588 43.11
'''
def top(group,n=5,column='tip_pct'):
return group.sort_values(by=column,ascending=False)[:n]
top(content,n=6)
'''
total_bill tip smoker day time size tip_pct
172 7.25 5.15 Yes Sun Dinner 2 0.710345
178 9.60 4.00 Yes Sun Dinner 2 0.416667
67 3.07 1.00 Yes Sat Dinner 1 0.325733
232 11.61 3.39 No Sat Dinner 2 0.291990
183 23.17 6.50 Yes Sun Dinner 4 0.280535
109 14.31 4.00 Yes Sat Dinner 2 0.279525
'''
smkgroup=content.groupby('smoker')
smkgroup.apply(top) #按照smoker类型分组,并求出各分组中tip_pct排名前五的项,apply(fuc)实现对各分组都应用定义的函数fuc
'''
total_bill tip smoker day time size tip_pct
smoker
No 232 11.61 3.39 No Sat Dinner 2 0.291990
149 7.51 2.00 No Thur Lunch 2 0.266312
51 10.29 2.60 No Sun Dinner 2 0.252672
185 20.69 5.00 No Sun Dinner 5 0.241663
88 24.71 5.85 No Thur Lunch 2 0.236746
Yes 172 7.25 5.15 Yes Sun Dinner 2 0.710345
178 9.60 4.00 Yes Sun Dinner 2 0.416667
67 3.07 1.00 Yes Sat Dinner 1 0.325733
183 23.17 6.50 Yes Sun Dinner 4 0.280535
109 14.31 4.00 Yes Sat Dinner 2 0.279525
'''
smkgroup.apply(top,n=3, column='total_bill') #可以在函数名后加可以传递给函数的参数
'''
total_bill tip smoker day time size tip_pct
smoker
No 212 48.33 9.00 No Sat Dinner 4 0.186220
59 48.27 6.73 No Sat Dinner 4 0.139424
156 48.17 5.00 No Sun Dinner 6 0.103799
Yes 170 50.81 10.00 Yes Sat Dinner 3 0.196812
182 45.35 3.50 Yes Sun Dinner 3 0.077178
102 44.30 2.50 Yes Sat Dinner 3 0.056433
'''
frame=pd.DataFrame({'data1':np.arange(10),
'data2':np.arange(5,15)})
quartiles=pd.cut(frame.data1,3) #分成三等分进行分组,显示各个元素处在哪个分组
'''
0 (-0.009, 3.0]
1 (-0.009, 3.0]
2 (-0.009, 3.0]
3 (-0.009, 3.0]
4 (3.0, 6.0]
5 (3.0, 6.0]
6 (3.0, 6.0]
7 (6.0, 9.0]
8 (6.0, 9.0]
9 (6.0, 9.0]
'''
s=frame['data1']
s[::2]=np.nan
'''
s:
0 NaN
1 1.0
2 NaN
3 3.0
4 NaN
5 5.0
6 NaN
7 7.0
8 NaN
9 9.0
'''
s1=s.fillna(s.mean()) #用s.mean(s中除去空值的平均值)填充空值
'''
0 5.0
1 1.0
2 5.0
3 3.0
4 5.0
5 5.0
6 5.0
7 7.0
8 5.0
9 9.0
'''
states = ['Ohio', 'New York', 'Vermont', 'Florida',
'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data=pd.Series(np.arange(8),index=states)
data[::2]=np.nan
'''
Ohio NaN
New York 1.0
Vermont NaN
Florida 3.0
Oregon NaN
Nevada 5.0
California NaN
Idaho 7.0
'''
gp=data.groupby(group_key) #用列表group_key作为groupby分类参数,可以将列表中相同项对应的数据源分为一组
'''
East
Ohio NaN
New York 1.0
Vermont NaN
Florida 3.0
dtype: float64
West
Oregon NaN
Nevada 5.0
California NaN
Idaho 7.0
'''
gp.mean()
'''
East 2.0
West 6.0
'''
fill_mean=lambda g:g.fillna(g.mean())
gp.apply(fill_mean)
'''
Ohio 2.0
New York 1.0
Vermont 2.0
Florida 3.0
Oregon 6.0
Nevada 5.0
California 6.0
Idaho 7.0
'''
fill_values = {'East': 0.5, 'West': -1}
fill=lambda g:g.fillna(fill_values[g.name]) #groupby后各分组有一个默认自带的name参数,表示各分组名
gp.apply(fill)
'''
Ohio 0.5
New York 1.0
Vermont 0.5
Florida 3.0
Oregon -1.0
Nevada 5.0
California -1.0
Idaho 7.0
'''
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
'b', 'b', 'b', 'b'],
'data': np.arange(8),
'weights':[0.1]*4+[0.3]*4})
'''
category data weights
0 a 0 0.1
1 a 1 0.1
2 a 2 0.1
3 a 3 0.1
4 b 4 0.3
5 b 5 0.3
6 b 6 0.3
7 b 7 0.3
'''
group=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights']) #求group各分组的加权平均数
print(group.apply(get_wavg))
'''
category
a 1.5
b 5.5
'''