import pandas as pd
import numpy as np
# Build a five-row demo frame: two string key columns plus two columns of
# random integers drawn from [1, 10).
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randint(low=1, high=10, size=5),
        "data2": np.random.randint(low=1, high=10, size=5),
    }
)
df
   data1  data2 key1 key2
0      9      5    a  one
1      2      6    a  two
2      2      9    b  one
3      7      4    b  two
4      4      7    a  one
# Per-group totals. Restrict to numeric columns: recent pandas versions no
# longer drop the string column "key2" by default and would concatenate it.
df.groupby("key1").sum(numeric_only=True)
# Per-group summary statistics (count, mean, std, min, quartiles, max).
df.groupby("key1").describe()
               data1     data2
key1
a    count  3.000000  3.000000
     mean   5.000000  6.000000
     std    3.605551  1.000000
     min    2.000000  5.000000
     25%    3.000000  5.500000
     50%    4.000000  6.000000
     75%    6.500000  6.500000
     max    9.000000  7.000000
b    count  2.000000  2.000000
     mean   4.500000  6.500000
     std    3.535534  3.535534
     min    2.000000  4.000000
     25%    3.250000  5.250000
     50%    4.500000  6.500000
     75%    5.750000  7.750000
     max    7.000000  9.000000
# Group by "key1" and keep only the numeric columns. The aggregations applied
# to `grouped` in this file (std, mean, and the subtraction in peak_range) are
# not defined for the string column "key2"; pandas >= 2.0 raises on such
# columns instead of silently dropping them.
grouped = df.groupby("key1")[["data1", "data2"]]
def peak_range(s):
    """Return the spread (maximum minus minimum) of the values in ``s``.

    Args:
        s: Object exposing ``max()`` and ``min()``. The group-by aggregations
            in this file pass one column of one group at a time.

    Returns:
        ``s.max() - s.min()``.
    """
    # Show which type the aggregation machinery hands to the function.
    print(type(s))
    return s.max() - s.min()
# Apply the custom aggregation to each column of each group.
grouped.aggregate(peak_range)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
# Several aggregations at once: built-ins by name plus the custom function,
# which is labelled with its own name ("peak_range") in the result.
grouped.aggregate(["std", "mean", "sum", peak_range])
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
         data1                          data2
           std mean sum peak_range        std mean sum peak_range
key1
a     3.605551  5.0  15          7   1.000000  6.0  18          2
b     3.535534  4.5   9          5   3.535534  6.5  13          5
# A (label, function) tuple renames the custom aggregation's column to "range".
grouped.aggregate(["std", "mean", "sum", ("range", peak_range)])
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
         data1                     data2
           std mean sum range        std mean sum range
key1
a     3.605551  5.0  15     7   1.000000  6.0  18     2
b     3.535534  4.5   9     5   3.535534  6.5  13     5
# Map each column to its own aggregation: mean of data1, sum of data2.
d = {"data1": "mean", "data2": "sum"}
grouped.aggregate(d)
      data1  data2
key1
a       5.0     18
b       4.5     13
# A column may map to a list of aggregations; data1 gets both its mean and
# the custom spread (labelled "range"), data2 only its sum.
d = {
    "data1": ["mean", ("range", peak_range)],
    "data2": "sum",
}
grouped.aggregate(d)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
     data1       data2
      mean range   sum
key1
a      5.0     7    18
b      4.5     5    13
# Same aggregation, with the group key moved from the index into a column.
grouped.aggregate(d).reset_index()
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
  key1 data1       data2
        mean range   sum
0    a   5.0     7    18
1    b   4.5     5    13
# Rebuild the demo frame with fresh random integers in [1, 10).
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randint(low=1, high=10, size=5),
        "data2": np.random.randint(low=1, high=10, size=5),
    }
)
df
   data1  data2 key1 key2
0      6      2    a  one
1      1      4    a  two
2      1      7    b  one
3      6      4    b  two
4      4      7    a  one
# Per-group means, with each column renamed to "mean_<column>".
# numeric_only=True keeps the string column "key2" out of the mean, which
# recent pandas versions would otherwise reject with a TypeError.
kl_mean = df.groupby("key1").mean(numeric_only=True).add_prefix("mean_")
kl_mean
      mean_data1  mean_data2
key1
a       3.666667    4.333333
b       3.500000    5.500000
# Attach the group means to every row: match df's "key1" column against the
# index of kl_mean.
df.merge(kl_mean, left_on="key1", right_index=True)
   data1  data2 key1 key2  mean_data1  mean_data2
0      6      2    a  one    3.666667    4.333333
1      1      4    a  two    3.666667    4.333333
4      4      7    a  one    3.666667    4.333333
2      1      7    b  one    3.500000    5.500000
3      6      4    b  two    3.500000    5.500000
# Broadcast each group's mean back onto the original rows (the result keeps
# df's row index), then rename the columns to "mean_<column>".
# - Only the numeric columns are selected: a mean over the string column
#   "key2" raises on pandas >= 2.0.
# - The string name "mean" is used because passing the np.mean callable to
#   transform is deprecated in recent pandas.
kl_mean = (
    df.groupby("key1")[["data1", "data2"]]
    .transform("mean")
    .add_prefix("mean_")
)
kl_mean
   mean_data1  mean_data2
0    3.666667    4.333333
1    3.666667    4.333333
2    3.500000    5.500000
3    3.500000    5.500000
4    3.666667    4.333333
# Add the broadcast group means to df as new columns; rows are aligned on the
# shared row index.
df[kl_mean.columns.tolist()] = kl_mean
df
   data1  data2 key1 key2  mean_data1  mean_data2
0      6      2    a  one    3.666667    4.333333
1      1      4    a  two    3.666667    4.333333
2      1      7    b  one    3.500000    5.500000
3      6      4    b  two    3.500000    5.500000
4      4      7    a  one    3.666667    4.333333
# 5x5 frame of random integers in [1, 10), with named rows and columns.
df = pd.DataFrame(
    np.random.randint(low=1, high=10, size=(5, 5)),
    index=["alice", "bob", "candy", "dark", "emily"],
    columns=["a", "b", "c", "d", "e"],
)
df
       a  b  c  d  e
alice  9  8  5  4  2
bob    2  8  3  3  6
candy  9  4  4  4  8
dark   7  6  8  1  7
emily  2  4  5  7  1
def demean(s):
    """Return ``s`` with its own mean subtracted from every value.

    Args:
        s: Object exposing ``mean()`` and supporting subtraction of the
            result, such as a pandas Series.

    Returns:
        ``s - s.mean()``.
    """
    return s - s.mean()
# Ad-hoc group labels, one per row of df in row order.
key = ["one", "one", "two", "one", "two"]
# Subtract each group's column means from that group's rows.
demeaned = df.groupby(by=key).transform(demean)
demeaned
         a         b         c         d    e
alice  3.0  0.666667 -0.333333  1.333333 -3.0
bob   -4.0  0.666667 -2.333333  0.333333  1.0
candy  3.5  0.000000 -0.500000 -1.500000  3.5
dark   1.0 -1.333333  2.666667 -1.666667  2.0
emily -3.5  0.000000  0.500000  1.500000 -3.5
states = [
    "ohio", "new york", "vermont", "florida",
    "oregon", "nevada", "california", "idaho",
]
# First four states are labelled "east", last four "west".
group_key = ["east"] * 4 + ["west"] * 4
# Draw one random integer in [0, 8) per state. Calling randint(8) without a
# size returns a single scalar that is broadcast to every state, which is why
# the output recorded below shows the same value everywhere.
# dtype=float so the NaN assignment that follows needs no int -> float upcast.
data = pd.Series(
    np.random.randint(8, size=len(states)),
    index=states,
    dtype=float,
)
# Mark three states as missing.
data[["vermont", "nevada", "idaho"]] = np.nan
data
ohio 7.0
new york 7.0
vermont NaN
florida 7.0
oregon 7.0
nevada NaN
california 7.0
idaho NaN
dtype: float64
# Mean of the non-missing values in each region.
data.groupby(group_key).agg("mean")
east 7.0
west 7.0
dtype: float64
# Replace each missing value with the mean of its own region.
# `transform` returns a result on the original (flat) state index; `apply`
# with a same-shaped result prepends the group label on pandas >= 2.0.
data.groupby(group_key).transform(lambda g: g.fillna(g.mean()))
ohio 7.0
new york 7.0
vermont 7.0
florida 7.0
oregon 7.0
nevada 7.0
california 7.0
idaho 7.0
dtype: float64