# Demo frame: two random numeric columns plus two string key columns.
df = pd.DataFrame({
    'data1': np.random.rand(5),
    'data2': np.random.rand(5),
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
})

# Per-group means indexed by key1. numeric_only=True leaves out the string
# column key2, which cannot be averaged.
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)

# Group, then merge: every row gains the means of its key1 group.
# Column names present on both sides receive the default _x / _y suffixes,
# and add_prefix('mean_') then puts a prefix on every column label.
merged = pd.merge(df, k_mean, left_on='key1', right_index=True)
print(merged.add_prefix('mean_'))

# Mean of the numeric columns within each key2 group.
print(df.groupby('key2').mean(numeric_only=True))

# transform puts, at every row position of data1 / data2, the mean of the
# key2 group that row belongs to. Only the numeric columns are selected
# because strings cannot take part in the calculation.
print(df.groupby('key2')[['data1', 'data2']].transform('mean'))
----------------------------------------------------------------------
      data1     data2 key1 key2
0  0.003727  0.390301    a  one
1  0.744777  0.130300    a  two
2  0.887207  0.679309    b  one
3  0.448585  0.169208    b  two
4  0.448045  0.993775    a  one
         data1     data2
key1
a     0.398850  0.504792
b     0.667896  0.424258
   mean_data1_x  mean_data2_x mean_key1 mean_key2  mean_data1_y  mean_data2_y
0      0.003727      0.390301         a       one      0.398850      0.504792
1      0.744777      0.130300         a       two      0.398850      0.504792
4      0.448045      0.993775         a       one      0.398850      0.504792
2      0.887207      0.679309         b       one      0.667896      0.424258
3      0.448585      0.169208         b       two      0.667896      0.424258
----------------------------------------------------------------------
         data1     data2
key2
one   0.446326  0.687795
two   0.596681  0.149754
      data1     data2
0  0.446326  0.687795
1  0.596681  0.149754
2  0.446326  0.687795
3  0.596681  0.149754
4  0.446326  0.687795
一般化Groupby方法:apply
# Demo frame: two random numeric columns plus two string key columns.
df = pd.DataFrame({
    'data1': np.random.rand(5),
    'data2': np.random.rand(5),
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
})

# apply runs the given function directly on each group; the anonymous
# function here reports the summary statistics of every group.
print(df.groupby('key1').apply(lambda x: x.describe()))


def f_df1(d, n):
    """Return the first ``n`` rows of ``d`` after sorting it by index."""
    return d.sort_index()[:n]


def f_df2(d, k1):
    """Return the ``k1`` column of ``d``."""
    return d[k1]


# Arguments for the applied function are written straight after it; they can
# also be passed by keyword, e.g. .apply(f_df1, n=2).
print(df.groupby('key1').apply(f_df1, 2))
# Picking one column out of each group gives a Series with a hierarchical index.
print(df.groupby('key1').apply(f_df2, 'data2'))
# ----------------------------------------------------------------------
               data1     data2
key1
a    count  3.000000  3.000000
     mean   0.561754  0.233470
     std    0.313439  0.337209
     min    0.325604  0.026906
     25%    0.383953  0.038906
     50%    0.442303  0.050906
     75%    0.679829  0.336753
     max    0.917355  0.622599
b    count  2.000000  2.000000
     mean   0.881906  0.547206
     std    0.079357  0.254051
     min    0.825791  0.367564
     25%    0.853849  0.457385
     50%    0.881906  0.547206
     75%    0.909963  0.637026
     max    0.938020  0.726847
           data1     data2 key1 key2
key1
a    0  0.325604  0.050906    a  one
     1  0.917355  0.622599    a  two
b    2  0.825791  0.726847    b  one
     3  0.938020  0.367564    b  two
key1
a     0    0.050906
      1    0.622599
      4    0.026906
b     2    0.726847
      3    0.367564
# Demo frame: two random numeric columns and one string grouping key.
df = pd.DataFrame({
    'data1': np.random.rand(8),
    'data2': np.random.rand(8),
    'key': list('aabbabab'),
})
print(df)

# For every row, the mean of data1 / data2 within that row's key group.
# The string name 'mean' is used in place of np.mean; pandas resolves it to
# the group-wise mean.
df_ = df.groupby('key').transform('mean')
# Print the frame computed above (the earlier code referred to df1, a name
# this snippet never defines).
print(df_)
# Attach the group means next to the raw values; the overlapping column
# names coming from df_ get the '_mean' suffix.
print(df.join(df_, rsuffix='_mean'))
# ----------------------------------------------------------------------
创建df为:
      data1     data2 key
0  0.298507  0.481976   a
1  0.761940  0.472252   a
2  0.639491  0.964618   b
3  0.238919  0.756300   b
4  0.231337  0.617190   a
5  0.681743  0.906671   b
6  0.601592  0.798819   a
7  0.071703  0.109292   b
------
      data1     data2
0  0.560657  0.575840
1  0.560657  0.575840
2  0.497225  0.201286
3  0.497225  0.201286
4  0.560657  0.575840
5  0.497225  0.201286
6  0.560657  0.575840
7  0.497225  0.201286
求均值且合并之后结果为:
      data1     data2 key  data1_mean  data2_mean
0  0.298507  0.481976   a    0.473344    0.592559
1  0.761940  0.472252   a    0.473344    0.592559
2  0.639491  0.964618   b    0.407964    0.684220
3  0.238919  0.756300   b    0.407964    0.684220
4  0.231337  0.617190   a    0.473344    0.592559
5  0.681743  0.906671   b    0.407964    0.684220
6  0.601592  0.798819   a    0.473344    0.592559
7  0.071703  0.109292   b    0.407964    0.684220