import numpy as np
import pandas as pd
# Load the sample table, using the ID column as the index.
df = pd.read_csv(r'D:\study\pandas\data\table.csv', index_col='ID')
df.head()
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
一、SAC过程
内涵:SAC指的是分组操作中的split-apply-combine过程。其中split指基于某一些规则,将数据拆成若干组;apply是指对每一组独立地使用函数;combine指将每一组的结果组合成某一类数据结构。
apply过程 在该过程中,我们实际往往会遇到四类问题: 整合(Aggregation)——即分组计算统计量(如求均值、求每组元素个数) 变换(Transformation)——即分组对每个单元的数据进行操作(如元素标准化) 过滤(Filtration)——即按照某些规则筛选出一些组(如选出组内某一指标小于50的组) 综合问题——即前面提及的三种问题的混合
二、groupby函数
# Group by a single column, then pull out one group.
grouped_single = df.groupby('School')
grouped_single.get_group('S_1').head()
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
# Group by several columns; get_group then takes a tuple key.
grouped_mul = df.groupby(['School', 'Class'])
grouped_mul.get_group(('S_2', 'C_4'))
School Class Gender Address Height Weight Math Physics ID 2401 S_2 C_4 F street_2 192 62 45.3 A 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
grouped_single.size()
School
S_1 15
S_2 20
dtype: int64
grouped_mul.size()
School Class
S_1 C_1 5
C_2 5
C_3 5
S_2 C_1 5
C_2 5
C_3 5
C_4 5
dtype: int64
grouped_single.ngroups
2
grouped_mul.ngroups
7
# A GroupBy object is iterable, yielding (group key, sub-DataFrame) pairs.
for name, group in grouped_single:
    print(name)
    display(group.head())
S_1
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
S_2
School Class Gender Address Height Weight Math Physics ID 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A
df.set_index(['Gender', 'School']).groupby(level=1, axis=0).get_group('S_1').head()
Class Address Height Weight Math Physics Gender School M S_1 C_1 street_1 173 63 34.0 A+ F S_1 C_1 street_2 192 73 32.5 B+ M S_1 C_1 street_2 186 82 87.2 B+ F S_1 C_1 street_2 167 81 80.4 B- S_1 C_1 street_4 159 64 84.8 B+
print([attr for attr in dir(grouped_single) if not attr.startswith('_')])
['Address', 'Class', 'Gender', 'Height', 'Math', 'Physics', 'School', 'Weight', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']
grouped_single.head(2)
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+
grouped_single.first()
Class Gender Address Height Weight Math Physics School S_1 C_1 M street_1 173 63 34.0 A+ S_2 C_1 M street_7 174 84 83.3 C
# Grouping key can be any array aligned with the rows, not only a column.
df.groupby(np.random.choice(['a', 'b', 'c'], df.shape[0])).get_group('a').head()
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1202 S_1 C_2 F street_4 176 94 63.5 B- 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B-
df[:5].groupby(lambda x: print(x)).head(0)
1101
1102
1103
1104
1105
School Class Gender Address Height Weight Math Physics ID
# Label rows at even positional index as 奇数行 (1-based odd rows); equivalent
# to the original double-negated test `not loc % 2 == 1`.
df.groupby(lambda x: '奇数行' if df.index.get_loc(x) % 2 == 0 else '偶数行').groups
{'偶数行': Int64Index([1102, 1104, 1201, 1203, 1205, 1302, 1304, 2101, 2103, 2105, 2202,
2204, 2301, 2303, 2305, 2402, 2404],
dtype='int64', name='ID'),
'奇数行': Int64Index([1101, 1103, 1105, 1202, 1204, 1301, 1303, 1305, 2102, 2104, 2201,
2203, 2205, 2302, 2304, 2401, 2403, 2405],
dtype='int64', name='ID')}
# Composite group key: the (Gender, School) index label plus a pass/fail tag
# derived from that group's mean Math score.
math_score = df.set_index(['Gender', 'School'])['Math'].sort_index()
grouped_score = df.set_index(['Gender', 'School']).sort_index().groupby(
    lambda x: (x, '均分及格' if math_score[x].mean() > 60 else '均分不及格'))
for name, _ in grouped_score:
    print(name)
(('F', 'S_1'), '均分及格')
(('F', 'S_2'), '均分及格')
(('M', 'S_1'), '均分及格')
(('M', 'S_2'), '均分不及格')
df.groupby(['Gender', 'School'])['Math'].mean() >= 60
Gender School
F S_1 True
S_2 True
M S_1 True
S_2 False
Name: Math, dtype: bool
df.groupby(['Gender', 'School'])[['Math', 'Height']].mean()
Math Height Gender School F S_1 64.100000 173.125000 S_2 66.427273 173.727273 M S_1 63.342857 178.714286 S_2 51.155556 172.000000
# Bin Math scores and count how many fall into each interval.
bins = [0, 40, 60, 80, 90, 100]
cuts = pd.cut(df['Math'], bins=bins)
df.groupby(cuts)['Math'].count()
Math
(0, 40] 7
(40, 60] 10
(60, 80] 9
(80, 90] 7
(90, 100] 2
Name: Math, dtype: int64
三、聚合、过滤和变换
# sem() is std / sqrt(n) — verify the identity per group.
group_m = grouped_single['Math']
group_m.std().values / np.sqrt(group_m.count().values) == group_m.sem().values
array([ True, True])
group_m.agg(['sum', 'mean', 'std'])
sum mean std School S_1 956.2 63.746667 23.077474 S_2 1191.1 59.555000 17.589305
group_m.agg([('rename_sum', 'sum'), ('rename_mean', 'mean')])
rename_sum rename_mean School S_1 956.2 63.746667 S_2 1191.1 59.555000
grouped_mul.agg({'Math': ['mean', 'max'], 'Height': 'var'})
Math Height mean max var School Class S_1 C_1 63.78 87.2 183.3 C_2 64.30 97.0 132.8 C_3 63.16 87.7 179.2 S_2 C_1 58.56 83.3 54.7 C_2 62.80 85.4 256.0 C_3 63.06 95.5 205.7 C_4 53.80 67.7 300.2
grouped_single['Math'].agg(lambda x: print(x.head(), '间隔'))
1101 34.0
1102 32.5
1103 87.2
1104 80.4
1105 84.8
Name: Math, dtype: float64 间隔
2101 83.3
2102 50.6
2103 52.5
2104 72.2
2105 34.2
Name: Math, dtype: float64 间隔
School
S_1 NaN
S_2 NaN
Name: Math, dtype: float64
grouped_single['Math'].agg(lambda x: x.max() - x.min())
School
S_1 65.5
S_2 62.8
Name: Math, dtype: float64
def R1(s):
    """Range of a Series: max minus min."""
    return s.max() - s.min()


def R2(s):
    """Max minus median of a Series."""
    return s.max() - s.median()
# NamedAgg on a SeriesGroupBy: `column` is ignored, only aggfunc matters.
grouped_single['Math'].agg(
    min_score1=pd.NamedAgg(column='col1', aggfunc=R1),
    max_score1=pd.NamedAgg(column='col2', aggfunc='max'),
    range_score2=pd.NamedAgg(column='col3', aggfunc=R2),
).head()
min_score1 max_score1 range_score2 School S_1 65.5 97.0 33.5 S_2 62.8 95.5 39.4
def f(s, low, high):
    """True if at least one element of s lies in [low, high] (inclusive)."""
    in_range = s.between(low, high)
    return in_range.max()
# Extra positional args after the function are forwarded to it.
grouped_single['Math'].agg(f, 50, 52)
School
S_1 False
S_2 True
Name: Math, dtype: bool
def f_test(s, low, high):
    """True if at least one element of s falls within [low, high]."""
    return s.between(low, high).max()


def agg_f(f_mul, name, *args, **kwargs):
    """Bind extra arguments to f_mul and give the wrapper a display name,
    so agg() can show a meaningful column label."""
    wrapper = lambda x: f_mul(x, *args, **kwargs)
    wrapper.__name__ = name
    return wrapper
new_f = agg_f(f_test, 'at_least_one_in_50_52', 50, 52)
grouped_single['Math'].agg([new_f, 'mean']).head()
at_least_one_in_50_52 mean School S_1 False 63.746667 S_2 True 59.555000
grouped_single[['Math', 'Physics']].filter(lambda x: (x['Math'] > 32).all()).head()
Math Physics ID 2101 83.3 C 2102 50.6 B+ 2103 52.5 B- 2104 72.2 B+ 2105 34.2 A
grouped_single[['Math', 'Height']].transform(lambda x: x - x.min()).head()
Math Height ID 1101 2.5 14 1102 1.0 33 1103 55.7 27 1104 48.9 8 1105 53.3 0
grouped_single[['Math', 'Height']].transform(lambda x: x.mean()).head()
Math Height ID 1101 63.746667 175.733333 1102 63.746667 175.733333 1103 63.746667 175.733333 1104 63.746667 175.733333 1105 63.746667 175.733333
grouped_single[['Math', 'Height']].transform(lambda x: (x - x.mean()) / x.std()).head()
Math Height ID 1101 -1.288991 -0.214991 1102 -1.353990 1.279460 1103 1.016287 0.807528 1104 0.721627 -0.686923 1105 0.912289 -1.316166
# Inject NaNs at random positions to demonstrate group-wise imputation.
df_nan = df[['Math', 'School']].copy().reset_index()
df_nan.loc[np.random.randint(0, df.shape[0], 25), ['Math']] = np.nan
df_nan.head()
ID Math School 0 1101 NaN S_1 1 1102 NaN S_1 2 1103 NaN S_1 3 1104 80.4 S_1 4 1105 84.8 S_1
# Fill each school's missing Math scores with that school's mean.
df_nan.groupby('School').transform(
    lambda x: x.fillna(x.mean())).join(df.reset_index()['School']).head()
ID Math School 0 1101 73.0875 S_1 1 1102 73.0875 S_1 2 1103 73.0875 S_1 3 1104 80.4000 S_1 4 1105 84.8000 S_1
四、apply函数
df.groupby('School').apply(lambda x: print(x.head(1)))
School Class Gender Address Height Weight Math Physics
ID
1101 S_1 C_1 M street_1 173 63 34.0 A+
School Class Gender Address Height Weight Math Physics
ID
2101 S_2 C_1 M street_7 174 84 83.3 C
df[['School', 'Math', 'Height']].groupby('School').apply(lambda x: x.max())
School Math Height School S_1 S_1 97.0 195 S_2 S_2 95.5 194
df[['School', 'Math', 'Height']].groupby('School').apply(lambda x: x - x.min()).head()
Math Height ID 1101 2.5 14.0 1102 1.0 33.0 1103 55.7 27.0 1104 48.9 8.0 1105 53.3 0.0
# apply may return a DataFrame per group; results are concatenated.
df[['School', 'Math', 'Height']].groupby('School').apply(
    lambda x: pd.DataFrame({
        'col1': x['Math'] - x['Math'].max(),
        'col2': x['Math'] - x['Math'].min(),
        'col3': x['Height'] - x['Height'].max(),
        'col4': x['Height'] - x['Height'].min(),
    })).head()
col1 col2 col3 col4 ID 1101 -63.0 2.5 -22 14 1102 -64.5 1.0 -3 33 1103 -9.8 55.7 -9 27 1104 -16.6 48.9 -28 8 1105 -12.2 53.3 -36 0
from collections import OrderedDict


def f(df):
    """Per-group summary Series: Math sum, Weight variance, Height mean."""
    stats = OrderedDict([
        ('M_sum', df['Math'].sum()),
        ('W_var', df['Weight'].var()),
        ('H_mean', df['Height'].mean()),
    ])
    return pd.Series(stats)
grouped_single.apply(f)
M_sum W_var H_mean School S_1 956.2 117.428571 175.733333 S_2 1191.1 181.081579 172.950000
五、问题与练习
【问题一】 什么是fillna的前向/后向填充,如何实现? df.fillna(method='ffill') 前向填充:pad/ffill(用前一个非缺失值填充) 后向填充:backfill/bfill(用后一个非缺失值填充) 【问题二】 下面的代码实现了什么功能?请仿照设计一个它的groupby版本。 In [41]: s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0]) s1 = s.cumsum() result = s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0)
【问题三】 如何计算组内0.25分位数与0.75分位数?要求显示在同一张表上。 【问题四】 既然索引已经能够选出某些符合条件的子集,那么filter函数的设计有什么意义? 【问题五】 整合、变换、过滤三者在输入输出和功能上有何异同? 【问题六】 在带参数的多函数聚合时,有办法能够绕过wrap技巧实现同样功能吗?
# Exercise data set: diamonds.
df = pd.read_csv(r'D:\study\pandas\data\Diamonds.csv')
df.head()
carat color depth price 0 0.23 E 61.5 326 1 0.21 E 59.8 326 2 0.23 E 56.9 327 3 0.29 I 62.4 334 4 0.31 J 63.3 335
# Price range among diamonds heavier than one carat.
df_r = df.query('carat>1')['price']
df_r.max() - df_r.min()
17561
# Bin depth by its own quintiles.
bins = df['depth'].quantile(np.linspace(0, 1, 6)).tolist()
cuts = pd.cut(df['depth'], bins=bins)
df['cuts'] = cuts
df.head()
carat color depth price cuts 0 0.23 E 61.5 326 (60.8, 61.6] 1 0.21 E 59.8 326 (43.0, 60.8] 2 0.23 E 56.9 327 (43.0, 60.8] 3 0.29 I 62.4 334 (62.1, 62.7] 4 0.31 J 63.3 335 (62.7, 79.0]
color_result = df.groupby('cuts')['color'].describe()
color_result
count unique top freq cuts (43.0, 60.8] 11294 7 E 2259 (60.8, 61.6] 11831 7 G 2593 (61.6, 62.1] 10403 7 G 2247 (62.1, 62.7] 10137 7 G 2193 (62.7, 79.0] 10273 7 G 2000
# Compare the most frequent color per bin with the color that has the
# highest mean price-per-carat in that bin.
df['均重价格'] = df['price'] / df['carat']
color_result['top'] == [
    i[1]
    for i in df.groupby(['cuts', 'color'])['均重价格'].mean().groupby(['cuts']).idxmax().values
]
cuts
(43.0, 60.8] False
(60.8, 61.6] False
(61.6, 62.1] False
(62.1, 62.7] True
(62.7, 79.0] True
Name: top, dtype: bool
df = df.drop(columns='均重价格')
cuts = pd.cut(df['carat'], bins=[0, 0.5, 1, 1.5, 2, np.inf])
df['cuts'] = cuts
df.head()
def f(nums):
    """Length of the longest strictly increasing contiguous run in nums.

    Returns 0 for an empty list; a single element counts as a run of 1.
    """
    if not nums:
        return 0
    best = streak = 1
    for prev, cur in zip(nums, nums[1:]):
        # Extend the current run on a strict increase, otherwise restart it.
        streak = streak + 1 if prev < cur else 1
        best = max(best, streak)
    return best
# For each carat bin, sort by depth and report the longest strictly
# increasing price run.
for name, group in df.groupby('cuts'):
    ordered = group.sort_values(by='depth')
    print(name, f(ordered['price'].tolist()))
(0.0, 0.5] 8
(0.5, 1.0] 8
(1.0, 1.5] 7
(1.5, 2.0] 11
(2.0, inf] 7
# Per-color simple linear regression price ~ carat via the normal
# equations: beta = (X X^T)^{-1} X y, with X = [1; carat].
for name, group in df[['carat', 'price', 'color']].groupby('color'):
    L1 = np.array([np.ones(group.shape[0]), group['carat']]).reshape(2, group.shape[0])
    L2 = group['price']
    result = (np.linalg.inv(L1.dot(L1.T)).dot(L1)).dot(L2).reshape(2, 1)
    print('当颜色为%s时,截距项为: %f,回归系数为:%f' % (name, result[0], result[1]))
当颜色为D时,截距项为: -2361.017152,回归系数为:8408.353126
当颜色为E时,截距项为: -2381.049600,回归系数为:8296.212783
当颜色为F时,截距项为: -2665.806191,回归系数为:8676.658344
当颜色为G时,截距项为: -2575.527643,回归系数为:8525.345779
当颜色为H时,截距项为: -2460.418046,回归系数为:7619.098320
当颜色为I时,截距项为: -2878.150356,回归系数为:7761.041169
当颜色为J时,截距项为: -2920.603337,回归系数为:7094.192092
# Exercise data set: drug reports.
df = pd.read_csv(r'D:\study\pandas\data\Drugs.csv')
df.head()
YYYY State COUNTY SubstanceName DrugReports 0 2010 VA ACCOMACK Propoxyphene 1 1 2010 OH ADAMS Morphine 9 2 2010 PA ADAMS Methadone 2 3 2010 VA ALEXANDRIA CITY Heroin 5 4 2010 PA ALLEGHENY Hydromorphone 5
# For each year, find the county with the most reports and check whether
# its state is also the state with the most reports that year.
idx = pd.IndexSlice
for i in range(2010, 2018):
    county = df.groupby(['COUNTY', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    state = df.query('COUNTY == "%s"' % county)['State'].iloc[0]
    state_true = df.groupby(['State', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    if state == state_true:
        print('在%d年,%s县的报告书最多,它所属的州%s也是报告数最多的' % (i, county, state))
    else:
        print('在%d年,%s县的报告书最多,但它所属的州%s不是报告数最多的,%s州报告书最多' % (i, county, state, state_true))
在2010年,PHILADELPHIA县的报告书最多,它所属的州PA也是报告数最多的
在2011年,PHILADELPHIA县的报告书最多,但它所属的州PA不是报告数最多的,OH州报告书最多
在2012年,PHILADELPHIA县的报告书最多,但它所属的州PA不是报告数最多的,OH州报告书最多
在2013年,PHILADELPHIA县的报告书最多,但它所属的州PA不是报告数最多的,OH州报告书最多
在2014年,PHILADELPHIA县的报告书最多,但它所属的州PA不是报告数最多的,OH州报告书最多
在2015年,PHILADELPHIA县的报告书最多,但它所属的州PA不是报告数最多的,OH州报告书最多
在2016年,HAMILTON县的报告书最多,它所属的州OH也是报告数最多的
在2017年,HAMILTON县的报告书最多,它所属的州OH也是报告数最多的
# State with the largest 2014->2015 increase in Heroin reports.
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['SubstanceName'] == 'Heroin')]
df_add = df_b.groupby(['YYYY', 'State']).sum()
(df_add.loc[2015] - df_add.loc[2014]).idxmax()
DrugReports OH
dtype: object
# Within Ohio: substance with the largest absolute and relative growth.
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['State'] == 'OH')]
df_add = df_b.groupby(['YYYY', 'SubstanceName']).sum()
display((df_add.loc[2015] - df_add.loc[2014]).idxmax())
display((df_add.loc[2015] / df_add.loc[2014]).idxmax())
DrugReports Heroin
dtype: object
DrugReports Acetyl fentanyl
dtype: object