import numpy as np
import pandas as pd
df = pd. read_csv( 'C:/Users/admin/Desktop/joyful-pandas-master/joyful-pandas-master/data/table.csv' , index_col = 'ID' )
df. head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
一、SAC过程
1.1、内涵
SAC指的是分组操作中的split-apply-combine过程
split指基于某一些规则,将数据拆成若干组, apply是指对每一组独立地使用函数, combine指将每一组的结果组合成某一类数据结构
1.2、apply过程
整合(Aggregation)——即分组计算统计量(如求均值、求每组元素个数) 变换(Transformation)——即分组对每个单元的数据进行操作(如元素标准化) 过滤(Filtration)——即按照某些规则筛选出一些组(如选出组内某一指标小于50的组)
二、groupby函数
grouped_single = df. groupby( 'School' )
grouped_single. get_group( 'S_1' ) . head( )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
grouped_mul = df. groupby( [ 'School' , 'Class' ] )
grouped_mul. get_group( ( 'S_2' , 'C_4' ) ) . head( )
School Class Gender Address Height Weight Math Physics ID 2401 S_2 C_4 F street_2 192 62 45.3 A 2402 S_2 C_4 M street_7 166 82 48.7 B 2403 S_2 C_4 F street_6 158 60 59.7 B+ 2404 S_2 C_4 F street_2 160 84 67.7 B 2405 S_2 C_4 F street_6 193 54 47.6 B
grouped_single. size( )
School
S_1 15
S_2 20
dtype: int64
grouped_mul. size( )
School Class
S_1 C_1 5
C_2 5
C_3 5
S_2 C_1 5
C_2 5
C_3 5
C_4 5
dtype: int64
grouped_single. ngroups
2
grouped_mul. ngroups
7
for name, group in grouped_single:
print ( name)
display( group. head( ) )
S_1
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
S_2
School Class Gender Address Height Weight Math Physics ID 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B- 2104 S_2 C_1 F street_5 159 97 72.2 B+ 2105 S_2 C_1 M street_4 170 81 34.2 A
df. set_index( [ 'Gender' , 'School' ] ) . groupby( level = 1 , axis = 0 ) . get_group( 'S_1' ) . head( )
Class Address Height Weight Math Physics Gender School M S_1 C_1 street_1 173 63 34.0 A+ F S_1 C_1 street_2 192 73 32.5 B+ M S_1 C_1 street_2 186 82 87.2 B+ F S_1 C_1 street_2 167 81 80.4 B- S_1 C_1 street_4 159 64 84.8 B+
grouped_single. head( 3 )
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 2101 S_2 C_1 M street_7 174 84 83.3 C 2102 S_2 C_1 F street_6 161 61 50.6 B+ 2103 S_2 C_1 M street_4 157 61 52.5 B-
grouped_single. first( )
Class Gender Address Height Weight Math Physics School S_1 C_1 M street_1 173 63 34.0 A+ S_2 C_1 M street_7 174 84 83.3 C
df. groupby( np. random. choice( [ 'a' , 'b' , 'c' ] , df. shape[ 0 ] ) ) . get_group( 'a' ) . head( )
School Class Gender Address Height Weight Math Physics ID 1105 S_1 C_1 F street_4 159 64 84.8 B+ 1201 S_1 C_2 M street_5 188 68 97.0 A- 1205 S_1 C_2 F street_6 167 63 68.4 B- 1304 S_1 C_3 M street_2 195 70 85.2 A 2102 S_2 C_1 F street_6 161 61 50.6 B+
df[ : 5 ] . groupby( lambda x: print ( x) ) . head( )
1101
1102
1103
1104
1105
School Class Gender Address Height Weight Math Physics ID 1101 S_1 C_1 M street_1 173 63 34.0 A+ 1102 S_1 C_1 F street_2 192 73 32.5 B+ 1103 S_1 C_1 M street_2 186 82 87.2 B+ 1104 S_1 C_1 F street_2 167 81 80.4 B- 1105 S_1 C_1 F street_4 159 64 84.8 B+
df. groupby( lambda x : "奇数行" if not df. index. get_loc( x) % 2 == 1 else "偶数行" ) . groups
{'偶数行': Int64Index([1102, 1104, 1201, 1203, 1205, 1302, 1304, 2101, 2103, 2105, 2202,
2204, 2301, 2303, 2305, 2402, 2404],
dtype='int64', name='ID'),
'奇数行': Int64Index([1101, 1103, 1105, 1202, 1204, 1301, 1303, 1305, 2102, 2104, 2201,
2203, 2205, 2302, 2304, 2401, 2403, 2405],
dtype='int64', name='ID')}
math_score = df. set_index( [ 'Gender' , 'School' ] ) [ 'Math' ] . sort_index( )
grouped_score = df. set_index( [ 'Gender' , 'School' ] ) . sort_index( ) . \
groupby( lambda x: ( x, '均分及格' if math_score[ x] . mean( ) >= 60 else '均分不及格' ) )
for name, _ in grouped_score: print ( name)
(('F', 'S_1'), '均分及格')
(('F', 'S_2'), '均分及格')
(('M', 'S_1'), '均分及格')
(('M', 'S_2'), '均分不及格')
df. groupby( [ 'Gender' , 'School' ] ) [ 'Math' ] . mean( ) >= 60
Gender School
F S_1 True
S_2 True
M S_1 True
S_2 False
Name: Math, dtype: bool
df. groupby( [ 'Gender' , 'School' ] ) [ [ 'Math' , 'Height' ] ] . mean( )
Math Height Gender School F S_1 64.100000 173.125000 S_2 66.427273 173.727273 M S_1 63.342857 178.714286 S_2 51.155556 172.000000
bins = [ 0 , 40 , 60 , 80 , 90 , 100 ]
cuts = pd. cut( df[ 'Math' ] , bins = bins)
df. groupby( cuts) [ 'Math' ] . count( )
Math
(0, 40] 7
(40, 60] 10
(60, 80] 9
(80, 90] 7
(90, 100] 2
Name: Math, dtype: int64
三、聚合、过滤和变换
group_m = grouped_single[ 'Math' ]
group_m. std( ) . values/ np. sqrt( group_m. count( ) . values) == group_m. sem( ) . values
array([ True, True])
group_m. agg( [ 'sum' , 'mean' , 'std' ] )
sum mean std School S_1 956.2 63.746667 23.077474 S_2 1191.1 59.555000 17.589305
group_m. agg( [ ( 'sum_1' , 'sum' ) , ( 'mean_1' , 'mean' ) ] )
sum_1 mean_1 School S_1 956.2 63.746667 S_2 1191.1 59.555000
grouped_mul. agg( { 'Math' : [ 'mean' , 'std' ] , 'Height' : 'max' } )
Math Height mean std max School Class S_1 C_1 63.78 27.981458 192 C_2 64.30 22.623218 188 C_3 63.16 23.841309 195 S_2 C_1 58.56 19.310697 174 C_2 62.80 19.165725 194 C_3 63.06 23.811930 190 C_4 53.80 9.548822 193
grouped_single[ 'Math' ] . agg( lambda x: print ( x. head( ) , '间隔' ) )
1101 34.0
1102 32.5
1103 87.2
1104 80.4
1105 84.8
Name: Math, dtype: float64 间隔
2101 83.3
2102 50.6
2103 52.5
2104 72.2
2105 34.2
Name: Math, dtype: float64 间隔
School
S_1 None
S_2 None
Name: Math, dtype: object
grouped_single[ 'Math' ] . agg( lambda x: x. max ( ) - x. min ( ) )
School
S_1 65.5
S_2 62.8
Name: Math, dtype: float64
def R1 ( x) :
return x. max ( ) - x. min ( )
def R2 ( x) :
return x. max ( ) - x. median( )
grouped_single[ 'Math' ] . agg( min_score1= pd. NamedAgg( column= 'col1' , aggfunc= R1) ,
max_score1= pd. NamedAgg( column= 'col2' , aggfunc= 'max' ) ,
range_score2= pd. NamedAgg( column= 'col3' , aggfunc= R2) ) . head( )
min_score1 max_score1 range_score2 School S_1 65.5 97.0 33.5 S_2 62.8 95.5 39.4
def f ( s, low , high) :
return s. between( low, high) . max ( )
grouped_single[ 'Math' ] . agg( f, 50 , 53 )
School
S_1 False
S_2 True
Name: Math, dtype: bool
grouped_single[ [ 'Math' , 'Physics' ] ] . filter ( lambda x: ( x[ 'Math' ] > 32 ) . all ( ) ) . head( )
Math Physics ID 2101 83.3 C 2102 50.6 B+ 2103 52.5 B- 2104 72.2 B+ 2105 34.2 A
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x : x- x. min ( ) ) . head( )
Math Height ID 1101 2.5 14 1102 1.0 33 1103 55.7 27 1104 48.9 8 1105 53.3 0
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x: x. mean( ) ) . head( )
Math Height ID 1101 63.746667 175.733333 1102 63.746667 175.733333 1103 63.746667 175.733333 1104 63.746667 175.733333 1105 63.746667 175.733333
grouped_single[ [ 'Math' , 'Height' ] ] . transform( lambda x : ( x- x. mean( ) ) / x. std( ) ) . head( )
Math Height ID 1101 -1.288991 -0.214991 1102 -1.353990 1.279460 1103 1.016287 0.807528 1104 0.721627 -0.686923 1105 0.912289 -1.316166
df_nan = df[ [ 'Math' , 'School' ] ] . copy( ) . reset_index( )
df_nan. loc[ np. random. randint( 0 , df. shape[ 0 ] , 25 ) , [ 'Math' ] ] = np. nan
df_nan. head( )
ID Math School 0 1101 34.0 S_1 1 1102 32.5 S_1 2 1103 87.2 S_1 3 1104 80.4 S_1 4 1105 NaN S_1
df_nan. groupby( 'School' ) . transform( lambda x : x. fillna( x. mean( ) ) ) . head( )
ID Math 0 1101 34.000000 1 1102 32.500000 2 1103 87.200000 3 1104 80.400000 4 1105 65.466667