import numpy as np
import pandas as pd
df= pd. read_csv( 'E:\jupyter Notebook\天池比赛\pandas学习\joyful-pandas-master\data\\table.csv' , index_col= 'ID' )
df= df. drop( columns= 'Unnamed: 0' )
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 1101 to 2405
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 School 35 non-null object
1 Class 35 non-null object
2 Gender 35 non-null object
3 Address 35 non-null object
4 Height 35 non-null int64
5 Weight 35 non-null int64
6 Math 35 non-null float64
7 Physics 35 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 2.5+ KB
SAC过程
内涵
SAC指的是分组操作中的split-apply-combine过程。其中split指基于某一些规则,将数据拆成若干组;apply是指对每一组独立地使用函数;combine指将每一组的结果组合成某一类数据结构
apply过程
在该过程中,我们实际往往会遇到四类问题: 整合(Aggregation)——即分组计算统计量(如求均值、求每组元素个数) 变换(Transformation)——即分组对每个单元的数据进行操作(如元素标准化) 过滤(Filtration)——即按照某些规则筛选出一些组(如选出组内某一指标小于50的组) 综合问题——即前面提及的三种问题的混合
group函数
分组函数的基本内容
根据某一列分组
grouped_single= df. groupby( 'School' )
grouped_single
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002277196F748>
grouped_single. get_group( 'S_1' ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
根据某几列分组
grouped_mul= df. groupby( [ 'School' , 'Class' ] )
grouped_mul. get_group( ( 'S_2' , 'C_4' ) )
School Class Gender Address Height Weight Math Physics ID 2401 S_2 C_4 F street_2 192 62 45.3 A 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
组容量与组数
grouped_single. size( )
School
S_1 15
S_2 20
dtype: int64
grouped_mul. size( )
School Class
S_1 C_1 5
C_2 5
C_3 5
S_2 C_1 5
C_2 5
C_3 5
C_4 5
dtype: int64
grouped_single. ngroups
2
grouped_mul. ngroups
7
组的遍历
for name, group in grouped_single:
print ( name)
display( group. head( ) )
S_1
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
S_2
School Class Gender Address Height Weight Math Physics ID 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A
level参数(用于多级索引)和axis参数
df. set_index( [ 'Gender' , 'School' , 'Class' ] ) . groupby( level= 1 , axis= 0 ) . get_group( 'S_1' ) . head( )
Address Height Weight Math Physics Gender School Class M S_1 C_1 street_1 173 63 34.0 A+ F S_1 C_1 street_2 192 73 32.5 B+ M S_1 C_1 street_2 186 82 87.2 B+ F S_1 C_1 street_2 167 81 80.4 B- C_1 street_4 159 64 84.8 B+
groupby对象的特点
查看所有可调用的方法
print ( [ attr for attr in dir ( grouped_single) if not attr. startswith( '_' ) ] )
['Address', 'Class', 'Gender', 'Height', 'Math', 'Physics', 'School', 'Weight', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']
分组对象的head和first
对分组对象使用head函数,返回的是每组的前几行,而不是数据集的前几行
grouped_single. head( 3 )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B-
first显示的是以分组为索引的每组的第一行信息
grouped_single. first( )
Class Gender Address Height Weight Math Physics School S_1 C_1 M street_1 173 63 34.0 A+ S_2 C_1 M street_7 174 84 83.3 C
分组依据
对于groupby函数而言,分组的依据比较自由,只要是与数据框长度相同的列表即可,同时支持函数型分组
df. groupby( np. random. choice( [ 'a' , 'b' , 'c' ] , df. shape[ 0 ] ) ) . get_group( 'a' ) . head( 20 )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1203 S_1 C_2 M street_6 160 53 58.8 A+ 1204 S_1 C_2 F street_5 162 63 33.8 B 1205 S_1 C_2 F street_6 167 63 68.4 B- 1301 S_1 C_3 M street_4 161 68 31.5 B+ 1303 S_1 C_3 M street_7 188 82 49.7 B 2101 S_2 C_1 M street_7 174 84 83.3 C 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A 2201 S_2 C_2 M street_5 193 100 39.1 B 2204 S_2 C_2 M street_1 175 74 47.2 B- 2301 S_2 C_3 F street_4 157 78 72.3 B+ 2303 S_2 C_3 F street_7 190 99 65.9 C 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+
从原理上说,利用函数的时候,传入的对象就是索引,因此根据这一特性可以做一些复杂的操作
df[ : 5 ] . groupby( lambda x: print ( x) ) . head( 0 )
1101
1102
1103
1104
1105
School Class Gender Address Height Weight Math Physics ID
df. groupby( lambda x: '奇数行' if df. index. get_loc( x) % 2 != 1 else '偶数行' ) . groups
{'偶数行': Int64Index([1102, 1104, 1201, 1203, 1205, 1302, 1304, 2101, 2103, 2105, 2202,
2204, 2301, 2303, 2305, 2402, 2404],
dtype='int64', name='ID'),
'奇数行': Int64Index([1101, 1103, 1105, 1202, 1204, 1301, 1303, 1305, 2102, 2104, 2201,
2203, 2205, 2302, 2304, 2401, 2403, 2405],
dtype='int64', name='ID')}
如果是多层索引,那么lambda表达式中的输入就是元组,下面实现的功能为查看两所学校中男女学生的均分是否及格 此处只是演示groupby的用法,实际操作不会这样写
math_sorce= df. set_index( [ 'Gender' , 'School' ] ) [ 'Math' ] . sort_index( )
math_sorce
Gender School
F S_1 32.5
S_1 80.4
S_1 84.8
S_1 63.5
S_1 33.8
S_1 68.4
S_1 87.7
S_1 61.7
S_2 50.6
S_2 72.2
S_2 68.5
S_2 85.4
S_2 72.3
S_2 65.9
S_2 95.5
S_2 45.3
S_2 59.7
S_2 67.7
S_2 47.6
M S_1 34.0
S_1 87.2
S_1 97.0
S_1 58.8
S_1 31.5
S_1 49.7
S_1 85.2
S_2 83.3
S_2 52.5
S_2 34.2
S_2 39.1
S_2 73.8
S_2 47.2
S_2 32.7
S_2 48.9
S_2 48.7
Name: Math, dtype: float64
grouped_sorce= df. set_index( [ 'Gender' , 'School' ] ) . sort_index( ) . groupby( lambda x: ( x, '均分及格' if math_sorce[ x] . mean( ) >= 60 else '均分不及格' ) )
for name, _ in grouped_sorce: print ( name)
(('F', 'S_1'), '均分及格')
(('F', 'S_2'), '均分及格')
(('M', 'S_1'), '均分及格')
(('M', 'S_2'), '均分不及格')
groupby和[]操作
df. groupby( [ 'Gender' , 'School' ] ) [ 'Math' ] . mean( ) >= 60
Gender School
F S_1 True
S_2 True
M S_1 True
S_2 False
Name: Math, dtype: bool
df. groupby( [ 'Gender' , 'School' ] ) [ [ 'Math' , 'Height' ] ] . mean( )
Math Height Gender School F S_1 64.100000 173.125000 S_2 66.427273 173.727273 M S_1 63.342857 178.714286 S_2 51.155556 172.000000
连续型变量分组
bins= [ 0 , 40 , 60 , 80 , 90 , 100 ]
cuts= pd. cut( df[ 'Math' ] , bins= bins)
df. groupby( cuts) [ 'Math' ] . count( )
Math
(0, 40] 7
(40, 60] 10
(60, 80] 9
(80, 90] 7
(90, 100] 2
Name: Math, dtype: int64
聚合、过滤和变换
聚合(Aggregation)
常用的聚合函数
所谓聚合就是把一堆数,变成一个标量,因此mean/sum/size/count/std/var/sem/describe/first/last/nth/min/max都是聚合函数 为了熟悉操作,不妨验证标准误sem函数,它的计算公式是: $\frac{\text{组内标准差}}{\sqrt{\text{组容量}}}$ ,下面进行验证:
group_m= grouped_single[ 'Math' ]
group_m. std( ) . values/ np. sqrt( group_m. count( ) . values) == group_m. sem( ) . values
array([ True, True])
group_m. std( ) . values/ np. sqrt( group_m. count( ) . values)
array([5.95857818, 3.93308821])
group_m. sem( ) . values
array([5.95857818, 3.93308821])
group_m. std( ) . values
array([23.07747407, 17.58930521])
同时使用多个聚合函数
group_m. agg( [ 'sum' , 'mean' , 'std' ] )
sum mean std School S_1 956.2 63.746667 23.077474 S_2 1191.1 59.555000 17.589305
指定哪些列使用哪些函数
grouped_mul. agg( { 'Math' : { 'mean' , 'max' } , 'Height' : 'var' } )
Math Height max mean var School Class S_1 C_1 87.2 63.78 183.3 C_2 97.0 64.30 132.8 C_3 87.7 63.16 179.2 S_2 C_1 83.3 58.56 54.7 C_2 85.4 62.80 256.0 C_3 95.5 63.06 205.7 C_4 67.7 53.80 300.2
使用自定义函数
grouped_single[ 'Math' ] . agg( lambda x: print ( x. head( ) , '间隔' ) )
1101 34.0
1102 32.5
1103 87.2
1104 80.4
1105 84.8
Name: Math, dtype: float64 间隔
2101 83.3
2102 50.6
2103 52.5
2104 72.2
2105 34.2
Name: Math, dtype: float64 间隔
School
S_1 None
S_2 None
Name: Math, dtype: object
grouped_single[ 'Math' ] . agg( lambda x: x. max ( ) - x. min ( ) )
School
S_1 65.5
S_2 62.8
Name: Math, dtype: float64
利用NamedAgg函数进行多个聚合
注意:不支持lambda函数,只能通过def函数实现
def R1(x):
    """Peak-to-peak range: difference between the largest and smallest value of x."""
    lo, hi = x.min(), x.max()
    return hi - lo
def R2(x):
    """Gap between the maximum of x and its median."""
    top = x.max()
    return top - x.median()
grouped_single[ 'Math' ] . head( )
ID
1101 34.0
1102 32.5
1103 87.2
1104 80.4
1105 84.8
2101 83.3
2102 50.6
2103 52.5
2104 72.2
2105 34.2
Name: Math, dtype: float64
grouped_single[ 'Math' ] . agg( min_score1= pd. NamedAgg( column= 'col1' , aggfunc= R1) ,
max_score2= pd. NamedAgg( column= 'col2' , aggfunc= 'max' ) ,
range_score2= pd. NamedAgg( column= 'col3' , aggfunc= R2) )
min_score1 max_score2 range_score2 School S_1 65.5 97.0 33.5 S_2 62.8 95.5 39.4
带参数的聚合函数
1. 判断是否组内数学分数至少有一个值在50-52之间
def f(s, high, low):
    """Return True if any value of s lies in the closed interval [low, high].

    Callers pass the upper bound first (e.g. ``agg(f, 52, 50)``), so the
    bounds must be swapped before calling ``Series.between``, whose signature
    is ``between(left, right)``.
    Bug fix: the original called ``s.between(high, low)``, which builds an
    empty interval (left > right) and therefore always returned False.
    """
    return s.between(low, high).max()
grouped_single[ 'Math' ] . agg( f, 52 , 50 )
School
S_1 False
S_2 False
Name: Math, dtype: bool
2. 如果需要使用多个函数,并且其中至少有一个带参数,则使用wrap技巧:
def f_test(s, high, low):
    """Return True if any value of s falls within the closed range [low, high].

    The wrap example calls this as ``f_test(x, 52, 50)`` — upper bound first —
    so the arguments are reordered here to match ``Series.between(left, right)``.
    Bug fix: the original ``s.between(high, low)`` produced an empty interval
    and always evaluated to False.
    """
    return s.between(low, high).max()
def agg_f(f_mul, name, *args, **kwargs):
    """Wrap a parameterised aggregation function for use in ``agg([...])``.

    Freezes ``*args``/``**kwargs`` into ``f_mul`` and labels the wrapper with
    ``name`` so the resulting aggregation column gets a meaningful header.
    Bug fix: the original assigned ``wrapper._name_`` (single underscores),
    a meaningless attribute, so pandas fell back to the default and the
    output column was labelled 'wrapper'.  The correct attribute is
    ``__name__``.
    """
    def wrapper(x):
        # Apply the wrapped function with the frozen extra arguments.
        return f_mul(x, *args, **kwargs)
    wrapper.__name__ = name
    return wrapper
new_f= agg_f( f_test, 'at_least_one_in_50-52' , 52 , 50 )
grouped_single[ 'Math' ] . agg( [ new_f, 'mean' ] ) . head( )
wrapper mean School S_1 False 63.746667 S_2 False 59.555000
过滤
- filter函数是用来筛选某些组的(通过筛选的组会保留其全部行),因此传入的函数的返回值应是一个布尔标量
grouped_single[ [ 'Math' , 'Physics' ] ] . filter ( lambda x: ( x[ 'Math' ] > 32 ) . all ( ) )
Math Physics ID 2101 83.3 C 2102 50.6 B+ 2103 52.5 B- 2104 72.2 B+ 2105 34.2 A 2201 39.1 B 2202 68.5 B+ 2203 73.8 A+ 2204 47.2 B- 2205 85.4 B 2301 72.3 B+ 2302 32.7 A 2303 65.9 C 2304 95.5 A- 2305 48.9 B 2401 45.3 A 2402 48.7 B 2403 59.7 B+ 2404 67.7 B 2405 47.6 B
变换
传入对象
transform函数中传入的对象是组内的列,并且返回值需要与列长完全一致
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x: x- x. min ( ) )
Math Height ID 1101 2.5 14 1102 1.0 33 1103 55.7 27 1104 48.9 8 1105 53.3 0 1201 65.5 29 1202 32.0 17 1203 27.3 1 1204 2.3 3 1205 36.9 8 1301 0.0 2 1302 56.2 16 1303 18.2 29 1304 53.7 36 1305 30.2 28 2101 50.6 19 2102 17.9 6 2103 19.8 2 2104 39.5 4 2105 1.5 15 2201 6.4 38 2202 35.8 39 2203 41.1 0 2204 14.5 20 2205 52.7 28 2301 39.6 2 2302 0.0 16 2303 33.2 35 2304 62.8 9 2305 16.2 32 2401 12.6 37 2402 16.0 11 2403 27.0 3 2404 35.0 5 2405 14.9 38
如果返回了标量值,那么组内的所有元素会被广播这个值
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x: x. mean( ) )
Math Height ID 1101 63.746667 175.733333 1102 63.746667 175.733333 1103 63.746667 175.733333 1104 63.746667 175.733333 1105 63.746667 175.733333 1201 63.746667 175.733333 1202 63.746667 175.733333 1203 63.746667 175.733333 1204 63.746667 175.733333 1205 63.746667 175.733333 1301 63.746667 175.733333 1302 63.746667 175.733333 1303 63.746667 175.733333 1304 63.746667 175.733333 1305 63.746667 175.733333 2101 59.555000 172.950000 2102 59.555000 172.950000 2103 59.555000 172.950000 2104 59.555000 172.950000 2105 59.555000 172.950000 2201 59.555000 172.950000 2202 59.555000 172.950000 2203 59.555000 172.950000 2204 59.555000 172.950000 2205 59.555000 172.950000 2301 59.555000 172.950000 2302 59.555000 172.950000 2303 59.555000 172.950000 2304 59.555000 172.950000 2305 59.555000 172.950000 2401 59.555000 172.950000 2402 59.555000 172.950000 2403 59.555000 172.950000 2404 59.555000 172.950000 2405 59.555000 172.950000
grouped_single. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A
利用变换方法进行组内标准化
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x: ( x- x. mean( ) ) / x. std( ) )
Math Height ID 1101 -1.288991 -0.214991 1102 -1.353990 1.279460 1103 1.016287 0.807528 1104 0.721627 -0.686923 1105 0.912289 -1.316166 1201 1.440943 0.964839 1202 -0.010689 0.020975 1203 -0.214350 -1.237510 1204 -1.297658 -1.080200 1205 0.201640 -0.686923 1301 -1.397322 -1.158855 1302 1.037953 -0.057681 1303 -0.608674 0.964839 1304 0.929622 1.515426 1305 -0.088687 0.886183 2101 1.349968 0.073242 2102 -0.509116 -0.833560 2103 -0.401096 -1.112576 2104 0.718903 -0.973068 2105 -1.441501 -0.205774 2201 -1.162923 1.398568 2202 0.508548 1.468322 2203 0.809867 -1.252084 2204 -0.702415 0.142996 2205 1.469359 0.701028 2301 0.724588 -1.112576 2302 -1.526780 -0.136020 2303 0.360731 1.189306 2304 2.043571 -0.624298 2305 -0.605766 0.980044 2401 -0.810436 1.328814 2402 -0.617136 -0.484790 2403 0.008244 -1.042822 2404 0.463065 -0.903314 2405 -0.679674 1.398568
利用变换方法进行组内缺失值的均值补充
df_nan = df[ [ 'Math' , 'School' ] ] . copy( ) . reset_index( )
df_nan. loc[ np. random. randint( 0 , df. shape[ 0 ] , 25 ) , [ 'Math' ] ] = np. nan
df_nan
ID Math School 0 1101 NaN S_1 1 1102 NaN S_1 2 1103 NaN S_1 3 1104 NaN S_1 4 1105 NaN S_1 5 1201 NaN S_1 6 1202 NaN S_1 7 1203 58.8 S_1 8 1204 33.8 S_1 9 1205 NaN S_1 10 1301 31.5 S_1 11 1302 NaN S_1 12 1303 49.7 S_1 13 1304 85.2 S_1 14 1305 61.7 S_1 15 2101 NaN S_2 16 2102 NaN S_2 17 2103 52.5 S_2 18 2104 NaN S_2 19 2105 NaN S_2 20 2201 39.1 S_2 21 2202 68.5 S_2 22 2203 NaN S_2 23 2204 47.2 S_2 24 2205 85.4 S_2 25 2301 72.3 S_2 26 2302 NaN S_2 27 2303 65.9 S_2 28 2304 95.5 S_2 29 2305 NaN S_2 30 2401 NaN S_2 31 2402 NaN S_2 32 2403 59.7 S_2 33 2404 67.7 S_2 34 2405 NaN S_2
df_nan. groupby( 'School' ) . transform( lambda x: x. fillna( x. mean( ) ) ) . join( df. reset_index( ) [ 'School' ] )
ID Math School 0 1101 53.45 S_1 1 1102 53.45 S_1 2 1103 53.45 S_1 3 1104 53.45 S_1 4 1105 53.45 S_1 5 1201 53.45 S_1 6 1202 53.45 S_1 7 1203 58.80 S_1 8 1204 33.80 S_1 9 1205 53.45 S_1 10 1301 31.50 S_1 11 1302 53.45 S_1 12 1303 49.70 S_1 13 1304 85.20 S_1 14 1305 61.70 S_1 15 2101 65.38 S_2 16 2102 65.38 S_2 17 2103 52.50 S_2 18 2104 65.38 S_2 19 2105 65.38 S_2 20 2201 39.10 S_2 21 2202 68.50 S_2 22 2203 65.38 S_2 23 2204 47.20 S_2 24 2205 85.40 S_2 25 2301 72.30 S_2 26 2302 65.38 S_2 27 2303 65.90 S_2 28 2304 95.50 S_2 29 2305 65.38 S_2 30 2401 65.38 S_2 31 2402 65.38 S_2 32 2403 59.70 S_2 33 2404 67.70 S_2 34 2405 65.38 S_2
apply函数
apply函数的灵活性
对于传入值而言,从下面的打印内容可以看到是以分组的表传入apply中:
df. groupby( 'School' ) . apply ( lambda x: print ( x. head( ) ) )
School Class Gender Address Height Weight Math Physics
ID
1101 S_1 C_1 M street_1 173 63 34.0 A+
1102 S_1 C_1 F street_2 192 73 32.5 B+
1103 S_1 C_1 M street_2 186 82 87.2 B+
1104 S_1 C_1 F street_2 167 81 80.4 B-
1105 S_1 C_1 F street_4 159 64 84.8 B+
School Class Gender Address Height Weight Math Physics
ID
2101 S_2 C_1 M street_7 174 84 83.3 C
2102 S_2 C_1 F street_6 161 61 50.6 B+
2103 S_2 C_1 M street_4 157 61 52.5 B-
2104 S_2 C_1 F street_5 159 97 72.2 B+
2105 S_2 C_1 M street_4 170 81 34.2 A
df[ [ 'School' , 'Math' , 'Height' ] ] . groupby( 'School' ) . apply ( lambda x: x. max ( ) )
School Math Height School S_1 S_1 97.0 195 S_2 S_2 95.5 194
列表返回值
df[ [ 'School' , 'Math' , 'Height' ] ] . groupby( 'School' ) . apply ( lambda x: x- x. max ( ) )
Math Height ID 1101 -63.0 -22.0 1102 -64.5 -3.0 1103 -9.8 -9.0 1104 -16.6 -28.0 1105 -12.2 -36.0 1201 0.0 -7.0 1202 -33.5 -19.0 1203 -38.2 -35.0 1204 -63.2 -33.0 1205 -28.6 -28.0 1301 -65.5 -34.0 1302 -9.3 -20.0 1303 -47.3 -7.0 1304 -11.8 0.0 1305 -35.3 -8.0 2101 -12.2 -20.0 2102 -44.9 -33.0 2103 -43.0 -37.0 2104 -23.3 -35.0 2105 -61.3 -24.0 2201 -56.4 -1.0 2202 -27.0 0.0 2203 -21.7 -39.0 2204 -48.3 -19.0 2205 -10.1 -11.0 2301 -23.2 -37.0 2302 -62.8 -23.0 2303 -29.6 -4.0 2304 0.0 -30.0 2305 -46.6 -7.0 2401 -50.2 -2.0 2402 -46.8 -28.0 2403 -35.8 -36.0 2404 -27.8 -34.0 2405 -47.9 -1.0
DataFrame返回值
df[ [ 'School' , 'Math' , 'Height' ] ] . groupby( 'School' ) . apply ( lambda x: pd. DataFrame( { 'col1' : x[ 'Math' ] - x[ 'Math' ] . max ( ) , 'col2' : x[ 'Math' ] - x[ 'Math' ] . min ( ) } ) )
col1 col2 ID 1101 -63.0 2.5 1102 -64.5 1.0 1103 -9.8 55.7 1104 -16.6 48.9 1105 -12.2 53.3 1201 0.0 65.5 1202 -33.5 32.0 1203 -38.2 27.3 1204 -63.2 2.3 1205 -28.6 36.9 1301 -65.5 0.0 1302 -9.3 56.2 1303 -47.3 18.2 1304 -11.8 53.7 1305 -35.3 30.2 2101 -12.2 50.6 2102 -44.9 17.9 2103 -43.0 19.8 2104 -23.3 39.5 2105 -61.3 1.5 2201 -56.4 6.4 2202 -27.0 35.8 2203 -21.7 41.1 2204 -48.3 14.5 2205 -10.1 52.7 2301 -23.2 39.6 2302 -62.8 0.0 2303 -29.6 33.2 2304 0.0 62.8 2305 -46.6 16.2 2401 -50.2 12.6 2402 -46.8 16.0 2403 -35.8 27.0 2404 -27.8 35.0 2405 -47.9 14.9
用apply同时统计多个指标
from collections import OrderedDict
def f(group):
    """Compute several summary statistics for one group in a single pass.

    Returns a Series holding the Math total, the Weight variance and the
    Height mean, so that ``groupby.apply`` produces one labelled row per group.
    """
    summary = OrderedDict(
        [('M_sum', group['Math'].sum()),
         ('W_var', group['Weight'].var()),
         ('H_mean', group['Height'].mean())])
    return pd.Series(summary)
grouped_single. apply ( f) . head( )
M_sum W_var H_mean School S_1 956.2 117.428571 175.733333 S_2 1191.1 181.081579 172.950000
问题与练习
问题
什么是fillna的前向/后向填充,如何实现?
前向填充(ffill)使用缺失值之前最近的有效值进行填充,后向填充(bfill)使用缺失值之后最近的有效值进行填充
下面的代码实现了什么功能?请仿照设计一个它的groupby版本。
s = pd. Series ( [ 0 , 1 , 1 , 0 , 1 , 1 , 1 , 0 ] )
s1 = s. cumsum( )
result = s. mul( s1) . diff( ) . where( lambda x: x < 0 ) . ffill( ) . add( s1, fill_value = 0 )
result
File "<ipython-input-169-c345ec6719c0>", line 2
s1 = s.cumsum()?
^
SyntaxError: invalid syntax
s = pd. Series ( [ 0 , 1 , 1 , 0 , 1 , 1 , 1 , 0 ] )
s1 = s. cumsum( )
result=