pandas中groupby的相关笔记
import numpy as np
import pandas as pd
df= pd. DataFrame( np. random. randint( 0 , 15 , ( 5 , 2 ) ) , columns= [ 'nI' , 'nII' ] )
df
df. sum ( ) , df. sum ( axis= 1 )
(nI 35
nII 36
dtype: int64,
0 22
1 10
2 11
3 25
4 3
dtype: int64)
df. mean( ) , df. mean( axis= 1 )
(nI 7.0
nII 7.2
dtype: float64,
0 11.0
1 5.0
2 5.5
3 12.5
4 1.5
dtype: float64)
df. describe( )
nI nII count 5.000000 5.000000 mean 7.000000 7.200000 std 4.949747 6.379655 min 1.000000 0.000000 25% 3.000000 1.000000 50% 8.000000 9.000000 75% 10.000000 12.000000 max 13.000000 14.000000
df= pd. DataFrame( { 'key' : [ 'A' , 'B' , 'C' , 'A' , 'B' , 'C' ] , 'value' : [ 5 , 4 , 3 , 2 , 1 , 0 ] } )
df
key value 0 A 5 1 B 4 2 C 3 3 A 2 4 B 1 5 C 0
df. groupby( 'key' )
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027414C02610>
df. groupby( 'key' ) [ 'value' ]
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002741CBD8610>
df. groupby( 'key' ) . sum ( )
df. groupby( 'key' ) [ 'value' ] . sum ( )
key
A 7
B 5
C 3
Name: value, dtype: int64
# Iterating a DataFrameGroupBy yields (group key, sub-DataFrame) pairs;
# print each sub-DataFrame to show how the rows were split by 'key'.
for key, value in df.groupby('key'):
    print('%s' % value)
key value
0 A 5
3 A 2
key value
1 B 4
4 B 1
key value
2 C 3
5 C 0
# Selecting the 'value' column first gives a SeriesGroupBy, so each
# iteration yields (group key, sub-Series) instead of a sub-DataFrame.
for key, value in df.groupby('key')['value']:
    print(value)
0 5
3 2
Name: value, dtype: int64
1 4
4 1
Name: value, dtype: int64
2 3
5 0
Name: value, dtype: int64
groupby:
延迟计算(lazy evaluation),有点像yield。 split-apply-combine,如下图 :
对于多层索引,可在 DataFrame.groupby() 中使用 level 参数。 可使用 axis 参数改变 split 方向,但是传递的 label 必须是列名(而不是索引),不能是行名。 可使用 group_keys=False 来去掉调用 apply() 方法时的分组名(group keys)。
df2= pd. DataFrame( { 'key' : [ 'A' , 'B' , 'C' , 'A' , 'B' , 'C' ] , 'data1' : np. random. randint( 2 , 15 , ( 6 , ) ) , 'data2' : np. random. ranf( ( 6 , ) ) } )
df2
key data1 data2 0 A 12 0.774106 1 B 4 0.260684 2 C 13 0.349387 3 A 6 0.651588 4 B 14 0.645364 5 C 4 0.974801
df2. groupby( 'key' ) . aggregate( [ 'min' , np. median, max ] )
data1 data2 min median max min median max key A 6 9.0 12 0.651588 0.712847 0.774106 B 4 9.0 14 0.260684 0.453024 0.645364 C 4 8.5 13 0.349387 0.662094 0.974801
df2. groupby( 'key' ) . aggregate( { 'data1' : min , 'data2' : 'max' } )
data1 data2 key A 6 0.774106 B 4 0.645364 C 4 0.974801
def fun_filter(x):
    """Decide whether a group is kept, based on the spread of its ``data1`` column.

    Prints the object it receives so the notes can show what is passed in
    for each group, then returns the comparison result.

    Args:
        x: A group that supports ``x['data1']`` returning an object with a
            ``.std()`` method (a sub-DataFrame when used with
            ``groupby(...).filter``).

    Returns:
        The result of ``x['data1'].std() > 6`` -- truthy when the standard
        deviation of ``data1`` exceeds 6.
    """
    print(x)
    return x['data1'].std() > 6
df2. groupby( 'key' ) . filter ( fun_filter)
key data1 data2
0 A 12 0.774106
3 A 6 0.651588
key data1 data2
1 B 4 0.260684
4 B 14 0.645364
key data1 data2
2 C 13 0.349387
5 C 4 0.974801
key data1 data2 1 B 4 0.260684 2 C 13 0.349387 4 B 14 0.645364 5 C 4 0.974801
df2. groupby( 'key' ) . transform( lambda x: x- 1 )
data1 data2 0 11 -0.225894 1 3 -0.739316 2 12 -0.650613 3 5 -0.348412 4 13 -0.354636 5 3 -0.025199
def fun_t(x):
    """Identity function that prints its argument before returning it.

    Used in these notes to reveal what object is handed to the callable
    on each call; the argument is returned unchanged.

    Args:
        x: Any object.

    Returns:
        The same object ``x`` that was passed in.
    """
    print(x)
    return x
df2. groupby( 'key' ) . transform( fun_t)
0 12.0
3 6.0
Name: data1, dtype: float64
0 12
3 6
Name: data1, dtype: int32
0 0.774106
3 0.651588
Name: data2, dtype: float64
data1 data2
0 12 0.774106
3 6 0.651588
data1 data2
1 4 0.260684
4 14 0.645364
data1 data2
2 13 0.349387
5 4 0.974801
data1 data2 0 12 0.774106 1 4 0.260684 2 13 0.349387 3 6 0.651588 4 14 0.645364 5 4 0.974801
df. groupby( 'key' ) . transform( np. mean)
value 0 3.5 1 2.5 2 1.5 3 3.5 4 2.5 5 1.5
df2. groupby( 'key' ) . apply ( lambda x: x[ 'data1' ] - x[ 'data2' ] )
key
A 0 11.225894
3 5.348412
B 1 3.739316
4 13.354636
C 2 12.650613
5 3.025199
dtype: float64
任意分组
这里的关键是要理解分组键的概念以及映射的含义。
df2
key data1 data2 0 A 12 0.774106 1 B 4 0.260684 2 C 13 0.349387 3 A 6 0.651588 4 B 14 0.645364 5 C 4 0.974801
L= [ 'first3' , 'first3' , 'first3' , 'mid2' , 'mid2' , 'lastone' ]
L2= [ 1 , 1 , 2 , 1 , 2 , 1 ]
df2. groupby( L) . sum ( )
data1 data2 first3 29 1.384176 lastone 4 0.974801 mid2 20 1.296952
df2. groupby( [ L, L2] ) . sum ( )
data1 data2 first3 1 16 1.034789 2 13 0.349387 lastone 1 4 0.974801 mid2 1 6 0.651588 2 14 0.645364
df2
key data1 data2 0 A 12 0.774106 1 B 4 0.260684 2 C 13 0.349387 3 A 6 0.651588 4 B 14 0.645364 5 C 4 0.974801
df2. groupby( 'key' ) . apply ( lambda x: np. sum ( x, axis= 1 ) )
key
A 0 12.774106
3 6.651588
B 1 4.260684
4 14.645364
C 2 13.349387
5 4.974801
dtype: float64
参考文档