import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Demo frame for the groupby examples: two categorical key columns plus two
# columns of five random integers drawn from [0, 10).
df1 = pd.DataFrame(
    dict(
        key1=list('aabba'),
        key2='one,two,one,two,one'.split(','),
        data1=np.random.randint(0, 10, size=5),
        data2=np.random.randint(0, 10, size=5),
    )
)
df1
key1 key2 data1 data2
0 a one 7 2
1 a two 4 5
2 b one 3 7
3 b two 9 8
4 a one 1 2
# Select the data1 column as a Series; it is the object grouped in the
# Series.groupby examples that follow.
s1=df1['data1']
Series.groupby
按照某个 Series 的值进行分组,返回一个分组对象(SeriesGroupBy)
# Group s1 by the values of df1's key1 column; this yields a SeriesGroupBy
# object rather than computed values.
s1.groupby(df1['key1'])
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017FF3763F28>
# Converting the grouped object with list() exposes its structure: each
# element is a (group key, group data) tuple.
list(s1.groupby(df1.key1))
[('a',
0 7
1 4
4 1
Name: data1, dtype: int32),
('b',
2 3
3 9
Name: data1, dtype: int32)]
分组对象可以直接调用聚合(统计)函数,例如 sum、mean、count
# Sum of data1 within each key1 group.
s1.groupby(df1.key1).sum()
key1
a 12
b 12
Name: data1, dtype: int32
# Mean of data1 within each key1 group.
# NOTE(review): the pasted output shows dtype int32; current pandas versions
# return float64 for mean() -- confirm against the pandas version in use.
s1.groupby(df1.key1).mean()
key1
a 4
b 6
Name: data1, dtype: int32
# Number of data1 entries in each key1 group.
s1.groupby(df1.key1).count()
key1
a 3
b 2
Name: data1, dtype: int64
DataFrame.groupby
df1
key1 key2 data1 data2
0 a one 7 2
1 a two 4 5
2 b one 3 7
3 b two 9 8
4 a one 1 2
# Group the frame by a Series of key values (here df1's own key2 column).
df1.groupby(df1.key2)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017FF379FEB8>
# list() shows the (group key, sub-DataFrame) pairs produced by the grouping.
list(df1.groupby(df1.key2))
[('one',
key1 key2 data1 data2
0 a one 7 2
2 b one 3 7
4 a one 1 2),
('two',
key1 key2 data1 data2
1 a two 4 5
3 b two 9 8)]
# When the grouping key is one of the frame's own columns, passing just the
# column name is enough.
df1.groupby('key2')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017FF37B5668>
# An independent Series can also serve as the grouping key; here rows 0-1
# fall in group 'c' and rows 2-4 in group 'd'.
df1.groupby(Series(['c','c','d','d','d']))
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017FF37B5EF0>
# list() shows which rows of df1 land in each group of the external key Series.
list(df1.groupby(Series(['c','c','d','d','d'])))
[('c',
key1 key2 data1 data2
0 a one 7 2
1 a two 4 5),
('d',
key1 key2 data1 data2
2 b one 3 7
3 b two 9 8
4 a one 1 2)]
# Aggregation functions can be called directly on the grouped object.
# numeric_only=True restricts the sum to data1/data2; without it, newer pandas
# versions also "sum" (concatenate) the string columns key1 and key2.
df1.groupby(Series(['c','c','d','d','d'])).sum(numeric_only=True)
data1 data2
c 11 7
d 13 17
# max() takes the per-group maximum of every column, including the string
# columns key1 and key2.
df1.groupby(Series(['c','c','d','d','d'])).max()
key1 key2 data1 data2
c a two 7 5
d b two 9 8
使用多个依据进行分组
DataFrame.groupby([Series1,Series2,…])
如果作为分组依据的 Series(mapper)来自 DataFrame 自身的列,可以直接使用列名称
DataFrame.groupby([columnName1,columnName2,…])
df1.groupby([df1.key1,df1.key2])
<pandas.