#encoding:utf8
import numpy as np
import pandas as pd
'''
内置聚合函数
自定义聚合函数
一次性应该多个聚合函数
不同列应用不同聚合函数
重置索引
'''
df = pd.DataFrame(
{
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randint(1,10,5),
'data2':np.random.randint(1,10,5),
}
)
print(df)
'''
data1 data2 key1 key2
0 1 8 a one
1 7 5 a two
2 1 6 b one
3 9 7 b two
4 5 4 a one
'''
#内置聚合函数
#求和
print(df.groupby('key1').sum())
'''
data1 data2
key1
a 21 19
b 6 4
'''
#求平均值
print(df.groupby('key1').mean())
'''
data1 data2
key1
a 3.333333 5.0
b 3.500000 4.0
'''
#求最小值
print(df.groupby('key1').min())
'''
data1 data2 key2
key1
a 1 3 one
b 4 1 one
'''
#求最大值
print(df.groupby('key1').max())
'''
data1 data2 key2
key1
a 9 9 two
b 4 4 two
'''
#展示一些列的聚合运算结果
print(df.groupby('key1').describe())
'''
data1 data2
key1
a count 3.000000 3.000000
mean 4.666667 5.333333
std 0.577350 2.081666
min 4.000000 3.000000
25% 4.500000 4.500000
50% 5.000000 6.000000
75% 5.000000 6.500000
max 5.000000 7.000000
b count 2.000000 2.000000
mean 6.500000 8.500000
std 3.535534 0.707107
min 4.000000 8.000000
25% 5.250000 8.250000
50% 6.500000 8.500000
75% 7.750000 8.750000
max 9.000000 9.000000
'''
#自定义聚合函数
grouped = df.groupby('key1')
#聚合结果最大值减去最小值
def peak_range(s):
return s.max() - s.min()
print(df)
print(grouped.agg(peak_range))
'''
data1 data2 key1 key2
0 7 2 a one
1 2 8 a two
2 7 8 b one
3 7 2 b two
4 1 7 a one
data1 data2
key1
a 6 6
b 0 6
'''
#还可以传如多个聚合函数进行聚合运算
print(grouped.agg(['std','mean','sum',peak_range]))
'''
data1 data2
std mean sum peak_range std mean sum peak_range
key1
a 1.732051 5.0 15 3 2.516611 6.666667 20 5
b 2.121320 2.5 5 3 2.121320 2.500000 5 3
'''
print(grouped.agg(['std','mean','sum',('range',peak_range)]))
'''
data1 data2
std mean sum range std mean sum range
key1
a 3.785939 3.666667 11 7 2.081666 7.333333 22 4
b 1.414214 5.000000 10 2 0.707107 1.500000 3 1
'''
#不同的列使用不同的聚合函数
d = {'data1':'mean','data2':'sum'}
print(grouped.agg(d))
'''
data1 data2
key1
a 5.0 19
b 5.5 17
'''
d = {'data1':['mean','sum'],'data2':['sum',('range',peak_range)]}
print(grouped.agg(d))
'''
data2 data1
sum range mean sum
key1
a 14 7 5.333333 16
b 9 5 5.500000 11
'''
print(grouped.agg(d).reset_index())
'''
key1 data2 data1
sum range mean sum
0 a 16 7 7 21
1 b 15 3 5 10
'''
#高级应用
df = pd.DataFrame(
{
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randint(1,10,5),
'data2':np.random.randint(1,10,5),
}
)
k1_mean = df.groupby('key1').mean()
print(k1_mean)
'''
data1 data2
key1
a 5.0 3.333333
b 4.0 8.500000
'''
#首先修改聚合之后的列显示名称
k1_mean = df.groupby('key1').mean().add_prefix('mean_')
print(k1_mean)
'''
mean_data1 mean_data2
key1
a 7.666667 6.0
b 8.000000 5.0
'''
#使用合并函数
print(pd.merge(df,k1_mean,left_on='key1',right_index=True))
'''
data1 data2 key1 key2 mean_data1 mean_data2
0 5 7 a one 4.0 6.666667
1 2 8 a two 4.0 6.666667
4 5 5 a one 4.0 6.666667
2 6 8 b one 7.5 8.000000
3 9 8 b two 7.5 8.000000
'''
#另外的方式
k1_mean = df.groupby('key1').transform(np.mean).add_prefix('mean_')
print(k1_mean)
'''
mean_data1 mean_data2
0 6.666667 7.0
1 6.666667 7.0
2 5.500000 3.5
3 5.500000 3.5
4 6.666667 7.0
'''
#合并
df[k1_mean.columns] = k1_mean
print(df)
'''
data1 data2 key1 key2 mean_data1 mean_data2
0 9 3 a one 7 1.666667
1 7 1 a two 7 1.666667
2 7 5 b one 7 6.500000
3 7 8 b two 7 6.500000
4 5 1 a one 7 1.666667
'''
df = pd.DataFrame(
np.random.randint(1,10,(5,5)),
columns=list('abcde'),
index=['Alice','Bob','Candy','Dark','Emily']
)
print(df)
'''
a b c d e
Alice 8 1 4 8 1
Bob 6 8 2 7 1
Candy 3 2 3 2 6
Dark 8 9 2 9 8
Emily 7 9 6 3 4
'''
def demean(s):
return s - s.mean()
key = ['one','one','two','one','two']
demeaned = df.groupby(key).transform(demean)
print(demeaned)
'''
a b c d e
Alice 1.0 -1.0 -3.0 0.666667 2.0
Bob 2.0 0.0 0.0 1.666667 -3.0
Candy -2.5 3.0 1.5 0.500000 -0.5
Dark -3.0 1.0 3.0 -2.333333 1.0
Emily 2.5 -3.0 -1.5 -0.500000 0.5
'''
#误差
print(demeaned.groupby(key).mean())
'''
a b c d e
one 2.960595e-16 2.960595e-16 2.960595e-16 0.0 0.0
two 0.000000e+00 0.000000e+00 0.000000e+00 0.0 0.0
'''
#输出某一列排序的最大的前两行
df = pd.DataFrame(
{
'key1':['a','a','b','b','a','a','a','b','b','a'],
'key2':['one','two','one','two','one','one','two','one','two','one'],
'data1':np.random.randint(1,10,10),
'data2':np.random.randint(1,10,10),
}
)
print(df)
'''
data1 data2 key1 key2
0 1 2 a one
1 5 8 a two
2 7 9 b one
3 8 3 b two
4 9 8 a one
5 7 8 a one
6 6 7 a two
7 2 7 b one
8 7 8 b two
9 4 4 a one
'''
def top(g,n=2,column='data1'):
return g.sort_values(by=column,ascending=False)[:n]
print(df)
print(top(df))
print(top(df,n=3))
'''
data1 data2 key1 key2
0 1 4 a one
1 3 5 a two
2 1 6 b one
3 8 7 b two
4 7 7 a one
5 6 4 a one
6 4 1 a two
7 4 8 b one
8 4 3 b two
9 6 4 a one
data1 data2 key1 key2
3 8 7 b two
4 7 7 a one
data1 data2 key1 key2
3 8 7 b two
4 7 7 a one
5 6 4 a one
'''
print(df)
print(df.groupby('key1').apply(top))
'''
data1 data2 key1 key2
0 5 9 a one
1 5 2 a two
2 4 9 b one
3 9 3 b two
4 3 1 a one
5 7 5 a one
6 3 4 a two
7 4 3 b one
8 4 6 b two
9 9 6 a one
data1 data2 key1 key2
key1
a 9 9 6 a one
5 7 5 a one
b 3 9 3 b two
2 4 9 b one
'''
#对于Nan值采用分组的平均值进行填充
states = [
'成都','贵州','重庆','西藏',
'广州','杭州','福州','海南'
]
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8),index=states)
data[['重庆','福州','海南']]=np.NaN
print(data)
'''
成都 -0.415079
贵州 0.276800
重庆 NaN
西藏 0.112333
广州 -1.029535
杭州 2.415340
福州 NaN
海南 NaN
'''
print(data.groupby(group_key).mean())
'''
East 1.641739
West -0.894022
'''
def fill_mean(g):
return g.fillna(g.mean())
print(data.groupby(group_key).apply(fill_mean))
'''
成都 -0.171079
贵州 0.577168
重庆 -0.355028
西藏 -1.471175
广州 -0.149200
杭州 0.193913
福州 0.022356
海南 0.022356
'''