CH03 - Pandas 【分组】

import numpy as np 
import pandas as pd
df = pd.read_csv('C:/Users/admin/Desktop/joyful-pandas-master/joyful-pandas-master/data/table.csv',index_col = 'ID')
df.head()
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1101S_1C_1Mstreet_11736334.0A+
1102S_1C_1Fstreet_21927332.5B+
1103S_1C_1Mstreet_21868287.2B+
1104S_1C_1Fstreet_21678180.4B-
1105S_1C_1Fstreet_41596484.8B+

一、SAC过程

1.1、内涵

SAC指的是分组操作中的split-apply-combine过程

  • split指基于某一些规则,将数据拆成若干组,
  • apply是指对每一组独立地使用函数,
  • combine指将每一组的结果组合成某一类数据结构

1.2、apply过程

  • 整合(Aggregation)——即分组计算统计量(如求均值、求每组元素个数)
  • 变换(Transformation)——即分组对每个单元的数据进行操作(如元素标准化)
  • 过滤(Filtration)——即按照某些规则筛选出一些组(如选出组内某一指标小于50的组)

二、groupby函数

grouped_single = df.groupby('School')
grouped_single.get_group('S_1').head()
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1101S_1C_1Mstreet_11736334.0A+
1102S_1C_1Fstreet_21927332.5B+
1103S_1C_1Mstreet_21868287.2B+
1104S_1C_1Fstreet_21678180.4B-
1105S_1C_1Fstreet_41596484.8B+
grouped_mul = df.groupby(['School','Class'])
grouped_mul.get_group(('S_2','C_4')).head()
SchoolClassGenderAddressHeightWeightMathPhysics
ID
2401S_2C_4Fstreet_21926245.3A
2402S_2C_4Mstreet_71668248.7B
2403S_2C_4Fstreet_61586059.7B+
2404S_2C_4Fstreet_21608467.7B
2405S_2C_4Fstreet_61935447.6B
grouped_single.size()
School
S_1    15
S_2    20
dtype: int64
grouped_mul.size()
School  Class
S_1     C_1      5
        C_2      5
        C_3      5
S_2     C_1      5
        C_2      5
        C_3      5
        C_4      5
dtype: int64
grouped_single.ngroups
2
grouped_mul.ngroups
7
for name,group in grouped_single:
    print(name)
    display(group.head())
S_1
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1101S_1C_1Mstreet_11736334.0A+
1102S_1C_1Fstreet_21927332.5B+
1103S_1C_1Mstreet_21868287.2B+
1104S_1C_1Fstreet_21678180.4B-
1105S_1C_1Fstreet_41596484.8B+
S_2
SchoolClassGenderAddressHeightWeightMathPhysics
ID
2101S_2C_1Mstreet_71748483.3C
2102S_2C_1Fstreet_61616150.6B+
2103S_2C_1Mstreet_41576152.5B-
2104S_2C_1Fstreet_51599772.2B+
2105S_2C_1Mstreet_41708134.2A
df.set_index(['Gender','School']).groupby(level = 1,axis = 0).get_group('S_1').head()
ClassAddressHeightWeightMathPhysics
GenderSchool
MS_1C_1street_11736334.0A+
FS_1C_1street_21927332.5B+
MS_1C_1street_21868287.2B+
FS_1C_1street_21678180.4B-
S_1C_1street_41596484.8B+
grouped_single.head(3)
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1101S_1C_1Mstreet_11736334.0A+
1102S_1C_1Fstreet_21927332.5B+
1103S_1C_1Mstreet_21868287.2B+
2101S_2C_1Mstreet_71748483.3C
2102S_2C_1Fstreet_61616150.6B+
2103S_2C_1Mstreet_41576152.5B-
grouped_single.first()
ClassGenderAddressHeightWeightMathPhysics
School
S_1C_1Mstreet_11736334.0A+
S_2C_1Mstreet_71748483.3C
df.groupby(np.random.choice(['a','b','c'],df.shape[0])).get_group('a').head()
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1105S_1C_1Fstreet_41596484.8B+
1201S_1C_2Mstreet_51886897.0A-
1205S_1C_2Fstreet_61676368.4B-
1304S_1C_3Mstreet_21957085.2A
2102S_2C_1Fstreet_61616150.6B+
df[:5].groupby(lambda x: print(x)).head()
1101
1102
1103
1104
1105
SchoolClassGenderAddressHeightWeightMathPhysics
ID
1101S_1C_1Mstreet_11736334.0A+
1102S_1C_1Fstreet_21927332.5B+
1103S_1C_1Mstreet_21868287.2B+
1104S_1C_1Fstreet_21678180.4B-
1105S_1C_1Fstreet_41596484.8B+
df.groupby(lambda x : "奇数行" if not df.index.get_loc(x)%2 == 1 else "偶数行").groups
{'偶数行': Int64Index([1102, 1104, 1201, 1203, 1205, 1302, 1304, 2101, 2103, 2105, 2202,
             2204, 2301, 2303, 2305, 2402, 2404],
            dtype='int64', name='ID'),
 '奇数行': Int64Index([1101, 1103, 1105, 1202, 1204, 1301, 1303, 1305, 2102, 2104, 2201,
             2203, 2205, 2302, 2304, 2401, 2403, 2405],
            dtype='int64', name='ID')}
math_score = df.set_index(['Gender','School'])['Math'].sort_index()
grouped_score = df.set_index(['Gender','School']).sort_index().\
            groupby(lambda x:(x,'均分及格' if math_score[x].mean()>=60 else '均分不及格'))
for name,_ in grouped_score:print(name)
(('F', 'S_1'), '均分及格')
(('F', 'S_2'), '均分及格')
(('M', 'S_1'), '均分及格')
(('M', 'S_2'), '均分不及格')
df.groupby(['Gender','School'])['Math'].mean() >= 60
Gender  School
F       S_1        True
        S_2        True
M       S_1        True
        S_2       False
Name: Math, dtype: bool
df.groupby(['Gender','School'])[['Math','Height']].mean()
MathHeight
GenderSchool
FS_164.100000173.125000
S_266.427273173.727273
MS_163.342857178.714286
S_251.155556172.000000
bins = [0,40,60,80,90,100]
cuts = pd.cut(df['Math'],bins = bins)
df.groupby(cuts)['Math'].count()
Math
(0, 40]       7
(40, 60]     10
(60, 80]      9
(80, 90]      7
(90, 100]     2
Name: Math, dtype: int64

三、聚合、过滤和变换

group_m = grouped_single['Math']
group_m.std().values/np.sqrt(group_m.count().values) ==group_m.sem().values
array([ True,  True])
group_m.agg(['sum','mean','std'])
summeanstd
School
S_1956.263.74666723.077474
S_21191.159.55500017.589305
group_m.agg([('sum_1','sum'),('mean_1','mean')])
sum_1mean_1
School
S_1956.263.746667
S_21191.159.555000
grouped_mul.agg({'Math':['mean','std'],'Height':'max'})
MathHeight
meanstdmax
SchoolClass
S_1C_163.7827.981458192
C_264.3022.623218188
C_363.1623.841309195
S_2C_158.5619.310697174
C_262.8019.165725194
C_363.0623.811930190
C_453.809.548822193
grouped_single['Math'].agg(lambda x:print(x.head(),'间隔'))
1101    34.0
1102    32.5
1103    87.2
1104    80.4
1105    84.8
Name: Math, dtype: float64 间隔
2101    83.3
2102    50.6
2103    52.5
2104    72.2
2105    34.2
Name: Math, dtype: float64 间隔





School
S_1    None
S_2    None
Name: Math, dtype: object
grouped_single['Math'].agg(lambda x:x.max()-x.min())
School
S_1    65.5
S_2    62.8
Name: Math, dtype: float64
def R1(x):
    return x.max()-x.min()
def R2(x):
    return x.max()-x.median()
grouped_single['Math'].agg(min_score1=pd.NamedAgg(column='col1', aggfunc=R1),
                           max_score1=pd.NamedAgg(column='col2', aggfunc='max'),
                           range_score2=pd.NamedAgg(column='col3', aggfunc=R2)).head()
min_score1max_score1range_score2
School
S_165.597.033.5
S_262.895.539.4
def f(s, low ,high):
    return s.between(low,high).max()
grouped_single['Math'].agg(f,50,53)
School
S_1    False
S_2     True
Name: Math, dtype: bool
grouped_single[['Math','Physics']].filter(lambda x:(x['Math']>32).all()).head()
MathPhysics
ID
210183.3C
210250.6B+
210352.5B-
210472.2B+
210534.2A
grouped_single[['Math','Height']].transform(lambda x : x-x.min()).head()
MathHeight
ID
11012.514
11021.033
110355.727
110448.98
110553.30
grouped_single[['Math','Height']].transform(lambda x:x.mean()).head()
MathHeight
ID
110163.746667175.733333
110263.746667175.733333
110363.746667175.733333
110463.746667175.733333
110563.746667175.733333
grouped_single[['Math','Height']].transform(lambda x : (x-x.mean())/x.std()).head()
MathHeight
ID
1101-1.288991-0.214991
1102-1.3539901.279460
11031.0162870.807528
11040.721627-0.686923
11050.912289-1.316166
df_nan = df[['Math','School']].copy().reset_index()
df_nan.loc[np.random.randint(0,df.shape[0],25),['Math']]=np.nan
df_nan.head()
IDMathSchool
0110134.0S_1
1110232.5S_1
2110387.2S_1
3110480.4S_1
41105NaNS_1
df_nan.groupby('School').transform(lambda x :x.fillna(x.mean())).head()
IDMath
0110134.000000
1110232.500000
2110387.200000
3110480.400000
4110565.466667

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值