import numpy as np
import pandas as pd
4.1 分组模式及其对象
4.1.1 分组的一般模式
df.groupby(分组依据)[数据来源].使用操作
df = pd. read_csv( r'joyful-pandas-master\data\learn_pandas.csv' )
df. groupby( 'Gender' ) [ 'Height' ] . median( )
Gender
Female 159.6
Male 173.4
Name: Height, dtype: float64
4.1.2 分组依据的本质
df. groupby( [ 'School' , 'Gender' ] ) [ 'Height' ] . mean( )
School Gender
Fudan University Female 158.776923
Male 174.212500
Peking University Female 158.666667
Male 172.030000
Shanghai Jiao Tong University Female 159.122500
Male 176.760000
Tsinghua University Female 159.753333
Male 171.638889
Name: Height, dtype: float64
condition = df. Weight > df. Weight. mean( )
df. groupby( condition) [ 'Height' ] . mean( )
Weight
False 159.034646
True 172.705357
Name: Height, dtype: float64
items = np. random. choice( list ( 'abc' ) , df. shape[ 0 ] )
items
array(['c', 'c', 'b', 'c', 'c', 'b', 'b', 'b', 'a', 'a', 'a', 'c', 'c',
'c', 'b', 'b', 'c', 'a', 'b', 'a', 'c', 'b', 'c', 'c', 'b', 'a',
'c', 'a', 'c', 'c', 'a', 'c', 'c', 'c', 'a', 'c', 'c', 'a', 'a',
'c', 'b', 'c', 'c', 'b', 'b', 'c', 'a', 'b', 'b', 'b', 'a', 'c',
'b', 'b', 'b', 'a', 'b', 'b', 'c', 'b', 'b', 'b', 'b', 'c', 'a',
'c', 'a', 'a', 'c', 'c', 'c', 'a', 'c', 'b', 'a', 'b', 'b', 'c',
'a', 'b', 'b', 'a', 'b', 'c', 'a', 'a', 'a', 'a', 'a', 'b', 'a',
'b', 'a', 'b', 'a', 'c', 'c', 'b', 'c', 'b', 'a', 'b', 'a', 'b',
'a', 'c', 'c', 'a', 'b', 'b', 'c', 'c', 'b', 'b', 'b', 'a', 'b',
'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'c', 'a', 'c', 'b', 'b',
'a', 'b', 'c', 'a', 'a', 'b', 'b', 'b', 'c', 'a', 'a', 'a', 'a',
'a', 'c', 'a', 'b', 'b', 'a', 'b', 'c', 'a', 'b', 'a', 'a', 'c',
'b', 'c', 'b', 'c', 'c', 'c', 'b', 'c', 'a', 'b', 'b', 'a', 'b',
'c', 'b', 'b', 'b', 'b', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'c',
'b', 'a', 'b', 'a', 'b', 'c', 'a', 'b', 'b', 'c', 'c', 'a', 'b',
'c', 'a', 'c', 'c', 'c'], dtype='<U1')
df. groupby( items) [ 'Height' ] . mean( )
a 163.283333
b 163.984932
c 162.155357
Name: Height, dtype: float64
df. groupby( [ condition, items] ) [ 'Height' ] . mean( )
Weight
False a 159.609524
b 158.879545
c 158.612195
True a 176.141667
b 171.731034
c 171.840000
Name: Height, dtype: float64
df[ [ 'School' , 'Gender' ] ] . drop_duplicates( )
School Gender 0 Shanghai Jiao Tong University Female 1 Peking University Male 2 Shanghai Jiao Tong University Male 3 Fudan University Female 4 Fudan University Male 5 Tsinghua University Female 9 Peking University Female 16 Tsinghua University Male
df. groupby( [ df[ 'School' ] , df[ 'Gender' ] ] ) [ 'Height' ] . mean( )
School Gender
Fudan University Female 158.776923
Male 174.212500
Peking University Female 158.666667
Male 172.030000
Shanghai Jiao Tong University Female 159.122500
Male 176.760000
Tsinghua University Female 159.753333
Male 171.638889
Name: Height, dtype: float64
4.1.3 Groupby 对象
gb = df. groupby( [ 'School' , 'Grade' ] )
gb
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000205BA959108>
gb. ngroups
16
res = gb. groups
res
{('Fudan University', 'Freshman'): [15, 28, 63, 70, 73, 105, 108, 157, 186], ('Fudan University', 'Junior'): [26, 41, 82, 84, 90, 107, 145, 152, 173, 187, 189, 195], ('Fudan University', 'Senior'): [39, 46, 49, 52, 66, 77, 112, 129, 131, 138, 144], ('Fudan University', 'Sophomore'): [3, 4, 37, 48, 68, 98, 135, 170], ('Peking University', 'Freshman'): [1, 32, 35, 36, 38, 45, 54, 57, 88, 96, 99, 140, 185], ('Peking University', 'Junior'): [9, 20, 59, 72, 75, 102, 159, 183], ('Peking University', 'Senior'): [30, 86, 116, 127, 130, 132, 147, 194], ('Peking University', 'Sophomore'): [29, 61, 83, 101, 120], ('Shanghai Jiao Tong University', 'Freshman'): [0, 6, 10, 60, 114, 117, 119, 121, 141, 148, 149, 153, 184], ('Shanghai Jiao Tong University', 'Junior'): [31, 42, 50, 56, 58, 64, 85, 93, 115, 122, 143, 155, 164, 172, 174, 188, 190], ('Shanghai Jiao Tong University', 'Senior'): [2, 12, 19, 21, 22, 23, 79, 87, 89, 103, 104, 109, 123, 134, 156, 161, 165, 166, 171, 192, 197, 198], ('Shanghai Jiao Tong University', 'Sophomore'): [13, 65, 71, 124, 167], ('Tsinghua University', 'Freshman'): [5, 8, 33, 34, 43, 44, 47, 51, 62, 67, 81, 111, 125, 133, 136, 142, 146], ('Tsinghua University', 'Junior'): [7, 11, 16, 17, 27, 69, 94, 95, 113, 118, 128, 137, 150, 154, 158, 160, 162, 163, 169, 176, 177, 191], ('Tsinghua University', 'Senior'): [14, 18, 24, 25, 78, 92, 100, 126, 168, 175, 179, 180, 193, 196], ('Tsinghua University', 'Sophomore'): [40, 53, 55, 74, 76, 80, 91, 97, 106, 110, 139, 151, 178, 181, 182, 199]}
gb. size( )
School Grade
Fudan University Freshman 9
Junior 12
Senior 11
Sophomore 8
Peking University Freshman 13
Junior 8
Senior 8
Sophomore 5
Shanghai Jiao Tong University Freshman 13
Junior 17
Senior 22
Sophomore 5
Tsinghua University Freshman 17
Junior 22
Senior 14
Sophomore 16
dtype: int64
gb. get_group( ( 'Fudan University' , 'Freshman' ) ) . iloc[ : 4 , : 6 ]
School Grade Name Gender Height Weight 15 Fudan University Freshman Changqiang Yang Female 156.0 49.0 28 Fudan University Freshman Gaoqiang Qin Female 170.2 63.0 63 Fudan University Freshman Gaofeng Zhao Female 152.2 43.0 70 Fudan University Freshman Yanquan Wang Female 163.5 55.0
4.1.4 分组的三大操作
聚合 变换 过滤
4.2 聚合函数
4.2.1 内置聚合函数
gb = df. groupby( 'Gender' ) [ 'Height' ]
gb. idxmin( )
Gender
Female 143
Male 199
Name: Height, dtype: int64
gb. quantile( 0.95 )
Gender
Female 166.8
Male 185.9
Name: Height, dtype: float64
gb = df. groupby( 'Gender' ) [ [ 'Height' , 'Weight' ] ]
gb. max ( )
Height Weight Gender Female 170.2 63.0 Male 193.9 89.0
all 是否全为True any 是否有一个为True prod 乘积 skew 偏度 df.mean() # 返回所有列的均值 df.mean(1) # 返回所有行的均值,下同 df.corr() # 返回列与列之间的相关系数 df.count() # 返回每一列中的非空值的个数 df.max() # 返回每一列的最大值 df.min() # 返回每一列的最小值 df.abs() # 绝对值 df.median() # 返回每一列的中位数 df.std() # 返回每一列的标准差, 贝塞尔校正的样本标准偏差 df.var() # 无偏方差 df.sem() # 平均值的标准误差 df.mode() # 众数 df.prod() # 连乘 df.mad() # 平均绝对偏差 df.cumprod() # 累积连乘,累乘 df.cumsum(axis=0) # 累积连加,累加 df.nunique() # 去重数量,不同值的量 df.idxmax() # 每列最大的值的索引名 df.idxmin() # 最小 df.cummax() # 累积最大值 df.cummin() # 累积最小值 df.skew() # 样本偏度 (第三阶) df.kurt() # 样本峰度 (第四阶) df.quantile() # 样本分位数 (不同 % 的值)
4.2.2 age方法
gb. agg( [ 'sum' , 'idxmax' , 'skew' ] )
Height Weight sum idxmax skew sum idxmax skew Gender Female 21014.0 28 -0.219253 6469.0 28 -0.268482 Male 8854.9 193 0.437535 3929.0 2 -0.332393
gb. agg( { 'Height' : [ 'mean' , 'max' ] , 'Weight' : 'count' } )
Height Weight mean max count Gender Female 159.19697 170.2 135 Male 173.62549 193.9 54
练一练
gb. agg( { 'Height' : [ 'sum' , 'idxmax' , 'skew' ] , 'Weight' : [ 'sum' , 'idxmax' , 'skew' ] } )
Height Weight sum idxmax skew sum idxmax skew Gender Female 21014.0 28 -0.219253 6469.0 28 -0.268482 Male 8854.9 193 0.437535 3929.0 2 -0.332393
gb. agg( lambda x: x. max ( ) - x. min ( ) )
Height Weight Gender Female 24.8 29.0 Male 38.2 38.0
def my_func ( s) :
res = 'High'
if s. mean( ) <= df[ s. name] . mean( ) :
res = 'low'
return res
gb. agg( my_func)
Height Weight Gender Female low low Male High High
gb. agg( [ ( 'range' , lambda x: x. max ( ) - x. min ( ) ) , ( 'my_sum' , 'sum' ) ] )
Height Weight range my_sum range my_sum Gender Female 24.8 21014.0 29.0 6469.0 Male 38.2 8854.9 38.0 3929.0
gb. agg( { 'Height' : [ ( 'my_func' , my_func) , 'sum' ] ,
'Weight' : lambda x: x. max ( ) } )
Height Weight my_func sum <lambda> Gender Female low 21014.0 63.0 Male High 8854.9 89.0
4.3变换和过滤
4.3.1 变换函数与transform方法
gb. cummax( ) . head( )
Height Weight 0 158.9 46.0 1 166.5 70.0 2 188.9 89.0 3 NaN 46.0 4 188.9 89.0
gb. transform( lambda x: ( x- x. mean( ) ) / x. std( ) ) . head( )
Height Weight 0 -0.058760 -0.354888 1 -1.010925 -0.355000 2 2.167063 2.089498 3 NaN -1.279789 4 0.053133 0.159631
gb. transform( 'mean' ) . head( )
Height Weight 0 159.19697 47.918519 1 173.62549 72.759259 2 173.62549 72.759259 3 159.19697 47.918519 4 173.62549 72.759259
gb. filter ( lambda x: x. shape[ 0 ] > 100 ) . head( )
Height Weight 0 158.9 46.0 3 NaN 41.0 5 158.0 51.0 6 162.5 52.0 7 161.9 50.0
4.4 跨列分组
4.4.1 apply的引入
4.4.2 apply的使用
def BMI ( x) :
Height = x[ 'Height' ] / 100
Weight = x[ 'Weight' ]
BMI_value = Weight/ Height** 2
return BMI_value. mean( )
gb. apply ( BMI)
Gender
Female 18.860930
Male 24.318654
dtype: float64
gb = df. groupby( [ 'Gender' , 'Test_Number' ] ) [ [ 'Height' , 'Weight' ] ]
gb. apply ( lambda x: 0 )
Gender Test_Number
Female 1 0
2 0
3 0
Male 1 0
2 0
3 0
dtype: int64