一、一般化Groupby方法:apply
import numpy as np
import pandas as pd
# Generalized GroupBy method: apply
# Demo frame: two columns of random floats plus two string key columns.
columns = {
    'data1': np.random.rand(5),
    'data2': np.random.rand(5),
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
}
df = pd.DataFrame(columns)

# apply runs the supplied function on each group.
# Here an anonymous function produces the summary statistics of every
# key1 group.
grouped_by_key1 = df.groupby('key1')
x1 = grouped_by_key1.apply(lambda group: group.describe())
print("x1 = df.groupby('key1').apply(lambda x: x.describe()) = \n", x1)
wide_rule = '-' * 200
print(wide_rule)
# f_df1: return the first n rows of the data after sorting by index
def f_df1(d, n):
    """Return the first ``n`` rows of ``d`` after sorting it by its index.

    Args:
        d: A pandas object (here, the sub-frame of one group).
        n: Number of leading rows to keep.

    Returns:
        The index-sorted data truncated to its first ``n`` rows.
    """
    # .iloc makes the slice positional regardless of the index dtype.
    return d.sort_index().iloc[:n]
# f_df2: return column k1 of the grouped table; when used with
# groupby(...).apply the combined result is a Series with a hierarchical index
def f_df2(d, k1):
    """Select column ``k1`` from ``d``.

    Args:
        d: A pandas DataFrame (here, the sub-frame of one group).
        k1: Label of the column to select.

    Returns:
        The result of ``d[k1]``.
    """
    return d[k1]
# Separator lines reused between the printed sections.
long_rule = '-' * 200
short_rule = '-' * 50

# Show the source frame before running the grouped examples.
print("df = \n", df)
print(long_rule)

# Extra positional arguments given to apply are forwarded to the function,
# so each key1 group is processed as f_df1(group, 2).
by_key1 = df.groupby('key1')
x1 = by_key1.apply(f_df1, 2)
print("x1 = df.groupby('key1').apply(f_df1, 2) = \n", x1)
print(long_rule)

# Same mechanism with f_df2: pick the 'data2' column out of every group.
x2 = by_key1.apply(f_df2, 'data2')
print("x2 = df.groupby('key1').apply(f_df2, 'data2') = \n", x2)
print(short_rule)
print("type(x2) = ", type(x2))
print(long_rule)
打印结果:
x1 = df.groupby('key1').apply(lambda x: x.describe()) =
data1 data2
key1
a count 3.000000 3.000000
mean 0.776583 0.498758
std 0.357791 0.257671
min 0.363582 0.225764
25% 0.668674 0.379274
50% 0.973767 0.532783
75% 0.983083 0.635255
max 0.992400 0.737726
b count 2.000000 2.000000
mean 0.593805 0.531914
std 0.041973 0.451099
min 0.564126 0.212939
25% 0.578965 0.372426
50% 0.593805 0.531914
75% 0.608644 0.691402
max 0.623484 0.850889
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
df =
data1 data2 key1 key2
0 0.363582 0.225764 a one
1 0.973767 0.737726 a two
2 0.623484 0.212939 b one
3 0.564126 0.850889 b two
4 0.992400 0.532783 a one
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x1 = df.groupby('key1').apply(f_df1, 2) =
data1 data2 key1 key2
key1
a 0 0.363582 0.225764 a one
1 0.973767 0.737726 a two
b 2 0.623484 0.212939 b one
3 0.564126 0.850889 b two
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
x2 = df.groupby('key1').apply(f_df2, 'data2') =
key1
a 0 0.225764
1 0.737726
4 0.532783
b 2 0.212939
3 0.850889
Name: data2, dtype: float64
--------------------------------------------------
type(x2) = <class 'pandas.core.series.Series'>
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0