unleash its powers 释放它的力量
introduction
# perform the simplest aggregation involving only a single grouping column, a single aggregation column, and a single aggregation function
# Task: find the average arrival delay for each airline
# pandas has different syntaxes to create an aggregation
import pandas as pd
import numpy as np
flights=pd.read_csv('flights.csv')
flights.head()
Out[9]:
MONTH DAY WEEKDAY AIRLINE ... SCHED_ARR ARR_DELAY DIVERTED CANCELLED
0 1 1 4 WN ... 1905 65.0 0 0
1 1 1 4 UA ... 1333 -13.0 0 0
2 1 1 4 MQ ... 1453 35.0 0 0
3 1 1 4 AA ... 1935 -7.0 0 0
4 1 1 4 WN ... 2225 39.0 0 0
[5 rows x 14 columns]
# define the grouping columns('AIRLINE'), aggregating columns('ARR_DELAY'), and aggregating functions(mean)
# place the grouping column in the .groupby method and then call the .agg method with a dictionary pairing the aggregation column with its aggregation function
# if you pass in a dictionary, it returns back a DataFrame
flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'})
Out[15]:
ARR_DELAY
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353
# alternatively, you may place the aggregation column in the operator and then pass the aggregation function as a string to a .agg
flights.groupby('AIRLINE')["ARR_DELAY"].agg('mean')
Out[17]:
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353
Name: ARR_DELAY, dtype: float64
# note that returns a Series
# you can pass any aggregation function directly to the .agg method, such as the Numpy mean function
flights.groupby("AIRLINE")["ARR_DELAY"].agg(np.mean)
Out[20]:
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353
Name: ARR_DELAY, dtype: float64
# skip the agg method altogether in this case and use the code in text method directly
flights.groupby("AIRLINE")["ARR_DELAY"].mean()
Out[23]:
AIRLINE
AA 5.542661
AS -0.833333
B6 8.692593
DL 0.339691
EV 7.034580
F9 13.630651
HA 4.972973
MQ 6.860591
NK 18.436070
OO 7.593463
UA 7.765755
US 1.681105
VX 5.348884
WN 6.397353
Name: ARR_DELAY, dtype: float64
if you do not use an aggregation function with .agg
, pandas raises an exception.
Grouping and aggregating with multiple columns and functions
hierarchical column 分层列
# the First Task: finding the number of cancelled flights for every airline per weekday
# define: the grouping columns (AIRLINE,WEEKDAY), the aggregating columns(CANCELLED), the aggregating function(sum)
flights.groupby(["AIRLINE","WEEKDAY"])["CANCELLED"].sum()
Out[26]:
AIRLINE WEEKDAY
AA 1 41
2 9
3 16
4 20
5 18
..
WN 3 18
4 10
5 7
6 10
7 7
Name: CANCELLED, Length: 98, dtype: int64
# the Second Task : find the number and percentage of cancelled and diverted flights for every airline per weekday
# define the grouping columns(AIRLINE,WEEKDAY), the aggregating columns(CANCELLED,DIVERTED), the aggregating function(SUM,mean)
flights.groupby(["AIRLINE","WEEKDAY"])[["CANCELLED","DIVERTED"]].agg(['sum','mean'])
Out[29]:
CANCELLED DIVERTED
sum mean sum mean
AIRLINE WEEKDAY
AA 1 41 0.032106 6 0.004699
2 9 0.007341 2 0.001631
3 16 0.011949 2 0.001494
4 20 0.015004 5 0.003751
5 18 0.014151 1 0.000786
... ... ... ...
WN 3 18 0.014118 2 0.001569
4 10 0.007911 4 0.003165
5 7 0.005828 0 0.000000
6 10 0.010132 3 0.003040
7 7 0.006066 3 0.002600
[98 rows x 4 columns]
# the third Task: for each origin and destination, find the total number of flights, the number and percentage of cancelled flights
# and the average and variance of the airline
# use a dictionary in the .agg method to map specific aggregating to specific aggregating function
# the size aggregation function returns the total number of rows per group.
# this is different than the 'count' aggregating function which returns the number of non-missing values per group.
flights.groupby(['ORG_AIR','DEST_AIR']).agg({'CANCELLED':['sum','mean','size'],"AIR_TIME":['mean','var']})
Out[33]:
CANCELLED AIR_TIME
sum mean size mean var
ORG_AIR DEST_AIR
ATL ABE 0 0.000000 31 96.387097 45.778495
ABQ 0 0.000000 16 170.500000 87.866667
ABY 0 0.000000 19 28.578947 6.590643
ACY 0 0.000000 6 91.333333 11.466667
AEX 0 0.000000 40 78.725000 47.332692
... ... ... ... ...
SFO SNA 4 0.032787 122 64.059322 11.338331
STL 0 0.000000 20 198.900000 101.042105
SUN 0 0.000000 10 78.000000 25.777778
TUS 0 0.000000 20 100.200000 35.221053
XNA 0 0.000000 2 173.500000 0.500000
[1130 rows x 5 columns]
# use a named aggregation object that can create non-hierarchical columns
# new syntax to create flat columns
flights.groupby(['ORG_AIR','DEST_AIR']).agg(sum_cancelled=pd.NamedAgg(column='CANCELLED',aggfunc='sum'),
mean_cancelled=pd.NamedAgg(column="CANCELLED",aggfunc='mean'),
size_cancelled=pd.NamedAgg(column='CANCELLED',aggfunc='size'),
mean_air_time=pd.NamedAgg(column="AIR_TIME",aggfunc="mean"),
var_air_time=pd.NamedAgg(column="AIR_TIME",aggfunc='var'))
Out[35]:
sum_cancelled mean_cancelled ... mean_air_time var_air_time
ORG_AIR DEST_AIR ...
ATL ABE 0 0.000000 ... 96.387097 45.778495
ABQ 0 0.000000 ... 170.500000 87.866667
ABY 0 0.000000 ... 28.578947 6.590643
ACY 0 0.000000 ... 91.333333 11.466667
AEX 0 0.000000 ... 78.725000 47.332692
... ... ... ... ...
SFO SNA 4 0.032787 ... 64.059322 11.338331
STL 0 0.000000 ... 198.900000 101.042105
SUN 0 0.000000 ... 78.000000 25.777778
TUS 0 0.000000 ... 100.200000 35.221053
XNA 0 0.000000 ... 173.500000 0.500000
[1130 rows x 5 columns]
syntax 句法结构,语法
# use a named aggregation object that can create non-hierarchical columns
flights.groupby(['ORG_AIR','DEST_AIR']).agg(sum_cancelled=pd.NamedAgg(column='CANCELLED',aggfunc='sum'),
mean_cancelled=pd.NamedAgg(column="CANCELLED",aggfunc='mean'),
size_cancelled=pd.NamedAgg(column='CANCELLED',aggfunc='size'),
mean_air_time=pd.NamedAgg(column="AIR_TIME",aggfunc="mean"),
var_air_time=pd.NamedAgg(column="AIR_TIME",aggfunc='var'))
Out[35]:
sum_cancelled mean_cancelled ... mean_air_time var_air_time
ORG_AIR DEST_AIR ...
ATL ABE 0 0.000000 ... 96.387097 45.778495
ABQ 0 0.000000 ... 170.500000 87.866667
ABY 0 0.000000 ... 28.578947 6.590643
ACY 0 0.000000 ... 91.333333 11.466667
AEX 0 0.000000 ... 78.725000 47.332692
... ... ... ... ...
SFO SNA 4 0.032787 ... 64.059322 11.338331
STL 0 0.000000 ... 198.900000 101.042105
SUN 0 0.000000 ... 78.000000 25.777778
TUS 0 0.000000 ... 100.200000 35.221053
XNA 0 0.000000 ... 173.500000 0.500000
[1130 rows x 5 columns]
# to flatten the columns Out[33], use the .to_flat_index method
res=flights.groupby(["ORG_AIR","DEST_AIR"]).agg({"CANCELLED":['sum','mean','size'],"AIR_TIME":['mean','var']})
res.columns
Out[38]:
MultiIndex([('CANCELLED', 'sum'),
('CANCELLED', 'mean'),
('CANCELLED', 'size'),
( 'AIR_TIME', 'mean'),
( 'AIR_TIME', 'var')],
)
res.columns=['_'.join(x) for x in res.columns.to_flat_index()]
res.columns
Out[40]:
Index(['CANCELLED_sum', 'CANCELLED_mean', 'CANCELLED_size', 'AIR_TIME_mean',
'AIR_TIME_var'],
dtype='object')
res
Out[41]:
CANCELLED_sum CANCELLED_mean ... AIR_TIME_mean AIR_TIME_var
ORG_AIR DEST_AIR ...
ATL ABE 0 0.000000 ... 96.387097 45.778495
ABQ 0 0.000000 ... 170.500000 87.866667
ABY 0 0.000000 ... 28.578947 6.590643
ACY 0 0.000000 ... 91.333333 11.466667
AEX 0 0.000000 ... 78.725000 47.332692
... ... ... ... ...
SFO SNA 4 0.032787 ... 64.059322 11.338331
STL 0 0.000000 ... 198.900000 101.042105
SUN 0 0.000000 ... 78.000000 25.777778
TUS 0 0.000000 ... 100.200000 35.221053
XNA 0 0.000000 ... 173.500000 0.500000
[1130 rows x 5 columns]
res.index
Out[42]:
MultiIndex([('ATL', 'ABE'),
('ATL', 'ABQ'),
('ATL', 'ABY'),
('ATL', 'ACY'),
('ATL', 'AEX'),
('ATL', 'AGS'),
('ATL', 'ALB'),
('ATL', 'ANC'),
('ATL', 'ASE'),
('ATL', 'ATW'),
...
('SFO', 'SBP'),
('SFO', 'SEA'),
('SFO', 'SLC'),
('SFO', 'SMF'),
('SFO', 'SMX'),
('SFO', 'SNA'),
('SFO', 'STL'),
('SFO', 'SUN'),
('SFO', 'TUS'),
('SFO', 'XNA')],
names=['ORG_AIR', 'DEST_AIR'], length=1130)
res.index.to_flat_index()
Out[43]:
Index([('ATL', 'ABE'), ('ATL', 'ABQ'), ('ATL', 'ABY'), ('ATL', 'ACY'),
('ATL', 'AEX'), ('ATL', 'AGS'), ('ATL', 'ALB'), ('ATL', 'ANC'),
('ATL', 'ASE'), ('ATL', 'ATW'),
...
('SFO', 'SBP'), ('SFO', 'SEA'), ('SFO', 'SLC'), ('SFO', 'SMF'),
('SFO', 'SMX'), ('SFO', 'SNA'), ('SFO', 'STL'), ('SFO', 'SUN'),
('SFO', 'TUS'), ('SFO', 'XNA')],
dtype='object', length=1130)
for x in res.index.to_flat_index():
print('_'.join(x))
ATL_ABE
ATL_ABQ
ATL_ABY
ATL_ACY
ATL_AEX
ATL_AGS
ATL_ALB
# we will have to leverage the .pipe method
def flatten_cols(df):
    """Collapse a MultiIndex column header into flat 'level0_level1' names.

    Mutates ``df`` in place and returns it, so it can be used with ``.pipe``.
    """
    df.columns = df.columns.to_flat_index().map('_'.join)
    return df
res=flights.groupby(["ORG_AIR","DEST_AIR"]).agg({"CANCELLED":["sum",'mean','size'],'AIR_TIME':['mean','var']}).pipe(flatten_cols)
res
Out[49]:
CANCELLED_sum CANCELLED_mean ... AIR_TIME_mean AIR_TIME_var
ORG_AIR DEST_AIR ...
ATL ABE 0 0.000000 ... 96.387097 45.778495
ABQ 0 0.000000 ... 170.500000 87.866667
ABY 0 0.000000 ... 28.578947 6.590643
ACY 0 0.000000 ... 91.333333 11.466667
AEX 0 0.000000 ... 78.725000 47.332692
... ... ... ... ...
SFO SNA 4 0.032787 ... 64.059322 11.338331
STL 0 0.000000 ... 198.900000 101.042105
SUN 0 0.000000 ... 78.000000 25.777778
TUS 0 0.000000 ... 100.200000 35.221053
XNA 0 0.000000 ... 173.500000 0.500000
[1130 rows x 5 columns]
# when grouping with multiple columns, pandas creates a hierarchical index, or multi-index
# if one of the columns that we group by is categorical(and has a category type,not an object type), then pandas will create a Cartesian product of all combinations for each level
res=flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
res=(flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
.groupby(["ORG_AIR","DEST_AIR"])
.agg({"CANCELLED":['sum','mean','size'],
"AIR_TIME":['mean','var']}))
res
Out[54]:
CANCELLED AIR_TIME
sum mean size mean var
ORG_AIR DEST_AIR
ATL ABE 0 0.0 31 96.387097 45.778495
ABI 0 NaN 0 NaN NaN
ABQ 0 0.0 16 170.500000 87.866667
ABR 0 NaN 0 NaN NaN
ABY 0 0.0 19 28.578947 6.590643
... ... ... ... ...
SFO TYS 0 NaN 0 NaN NaN
VLD 0 NaN 0 NaN NaN
VPS 0 NaN 0 NaN NaN
XNA 0 0.0 2 173.500000 0.500000
YUM 0 NaN 0 NaN NaN
[2710 rows x 5 columns]
# note that is a 2710*5 DataFrame
# if you have categorical columns with higher cardinality,
# you can get many more values
# to remedy the combinatoric explosion, use the observed=True parameter
# this makes the categorical groupby work like grouping with string types
# and only shows the observed values and not the Cartesian product
res=(
flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
.groupby(["ORG_AIR","DEST_AIR"])
.agg({"CANCELLED":['sum',"mean",'size'],"AIR_TIME":['mean','var']}))
res
Out[63]:
CANCELLED AIR_TIME
sum mean size mean var
ORG_AIR DEST_AIR
ATL ABE 0 0.0 31 96.387097 45.778495
ABI 0 NaN 0 NaN NaN
ABQ 0 0.0 16 170.500000 87.866667
ABR 0 NaN 0 NaN NaN
ABY 0 0.0 19 28.578947 6.590643
... ... ... ... ...
SFO TYS 0 NaN 0 NaN NaN
VLD 0 NaN 0 NaN NaN
VPS 0 NaN 0 NaN NaN
XNA 0 0.0 2 173.500000 0.500000
YUM 0 NaN 0 NaN NaN
[2710 rows x 5 columns]
# above: the result WITHOUT the observed parameter
# below: the result WITH the observed parameter
res=(flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
.groupby(["ORG_AIR","DEST_AIR"],observed=True)
.agg({"CANCELLED":['sum',"mean",'size'],"AIR_TIME":['mean','var']}))
res
Out[66]:
CANCELLED AIR_TIME
sum mean size mean var
ORG_AIR DEST_AIR
LAX ABQ 1 0.018182 55 89.259259 29.403215
ANC 0 0.000000 7 307.428571 78.952381
ASE 1 0.038462 26 102.920000 102.243333
ATL 0 0.000000 174 224.201149 127.155837
AUS 0 0.000000 80 150.537500 57.897310
... ... ... ... ...
MSP TTN 1 0.125000 8 124.428571 57.952381
TUL 0 0.000000 18 91.611111 63.075163
TUS 0 0.000000 2 176.000000 32.000000
TVC 0 0.000000 5 56.600000 10.300000
XNA 0 0.000000 14 90.642857 115.939560
[1130 rows x 5 columns]
# note this DataFrame is 1130*5, correct!
Removing the MultiIndex after grouping
retrieve 取回,索回,检索
inevitably, when using groupby, you will create a MultiIndex.
MultiIndex can happen in both the index and columns.
DataFrames with MultiIndexes are more difficult to navigate and occasionally have confusing column names as well.
Grouping with a Custom aggregation function
at some point, you may need to write your own custom user-defined function that does not exist in pandas or Numpy.
# Task: use the college dataset to calculate the mean and standard deviation of the undergraduate student population per state
# and then find the maximum number of standard deviations from the mean that any single population value is per state
import pandas as pd
import numpy as np
college=pd.read_csv('college.csv')
(college
.groupby('STABBR')
['UGDS']
.agg(['mean','std']).round(0))
Out[7]:
mean std
STABBR
AK 2493.0 4052.0
AL 2790.0 4658.0
AR 1644.0 3143.0
AS 1276.0 NaN
AZ 4130.0 14894.0
CA 3518.0 6709.0
CO 2325.0 4670.0
CT 1874.0 2871.0
DC 2645.0 3225.0
DE 2491.0 4503.0
FL 2493.0 7033.0
FM 2344.0 NaN
GA 2643.0 4422.0
GU 1894.0 1774.0
HI 2361.0 2999.0
IA 2294.0 5815.0
ID 2096.0 4865.0
IL 2189.0 4080.0
IN 2654.0 8278.0
KS 1861.0 3673.0
KY 1991.0 3783.0
LA 1660.0 3485.0
MA 2023.0 3267.0
MD 3003.0 5837.0
ME 1437.0 1808.0
MH 1078.0 NaN
MI 2643.0 5356.0
MN 2086.0 3580.0
MO 1557.0 3569.0
MP 1120.0 NaN
MS 2499.0 3848.0
MT 1471.0 3007.0
NC 2447.0 4212.0
ND 1460.0 2919.0
NE 1932.0 3579.0
NH 2060.0 4877.0
NJ 2353.0 4480.0
NM 2254.0 4566.0
NV 2274.0 6144.0
NY 2349.0 4421.0
OH 1577.0 4101.0
OK 1351.0 3280.0
OR 2183.0 4952.0
PA 1699.0 3793.0
PR 1599.0 2913.0
PW 602.0 NaN
RI 3306.0 4609.0
SC 2074.0 3778.0
SD 1358.0 2069.0
TN 1570.0 3279.0
TX 2999.0 7363.0
UT 3035.0 8056.0
VA 2695.0 6707.0
VI 1971.0 NaN
VT 1513.0 2194.0
WA 2271.0 4124.0
WI 2655.0 4615.0
WV 1758.0 5957.0
WY 2244.0 2745.0
# find the one that is farthest from the mean
def max_deviation(s):
    """Return the largest absolute z-score of the values in ``s``.

    Uses the sample standard deviation (ddof=1), so a single-element
    group produces NaN.
    """
    z_scores = (s - s.mean()) / s.std()
    return z_scores.abs().max()
# pass it directly to the .agg method to complete the aggregation
# pandas implicitly passes the UGDS column as a Series to `max_deviation`
# the max_deviation function is called once for each group
# the .agg method requires that we return a scalar from the function, or else an exception will be raised
(college.groupby('STABBR')
['UGDS']
.agg(max_deviation).round(1))
Out[11]:
STABBR
AK 2.6
AL 5.8
AR 6.3
AS NaN
AZ 9.9
CA 6.1
CO 5.0
CT 5.6
DC 2.4
DE 3.5
FL 8.4
FM NaN
GA 5.4
GU 1.0
HI 3.8
IA 6.5
ID 4.5
IL 7.3
IN 9.1
KS 4.9
KY 5.2
LA 6.5
MA 6.1
MD 5.3
ME 4.0
MH NaN
MI 6.7
MN 7.8
MO 7.2
MP NaN
MS 4.0
MT 3.9
NC 4.9
ND 3.5
NE 5.0
NH 5.3
NJ 7.1
NM 4.5
NV 4.7
NY 8.2
Name: UGDS, dtype: float64
# pandas defaults to using the sample standard deviation, which is undefined for any group with just a single value
def max_deviation(s):
    """Maximum absolute standardized score (z-score) within ``s``."""
    z = s.sub(s.mean()).div(s.std())
    return z.abs().max()
(college.groupby(['STABBR','RELAFFIL'])[['UGDS','SATVRMID','SATMTMID']]
.agg([max_deviation,'mean','std']).round(1))
Out[11]:
UGDS ... SATMTMID
max_deviation mean std ... max_deviation mean std
STABBR RELAFFIL ...
AK 0 2.1 3508.9 4539.5 ... NaN NaN NaN
1 1.1 123.3 132.9 ... NaN 503.0 NaN
AL 0 5.2 3248.8 5102.4 ... 1.7 515.8 56.7
1 2.4 979.7 870.8 ... 1.4 485.6 61.4
AR 0 5.8 1793.7 3401.6 ... 2.0 503.6 39.0
... ... ... ... ... ... ...
WI 0 5.3 2879.1 5031.5 ... 1.3 591.2 85.7
1 3.4 1716.2 1934.6 ... 1.8 526.6 42.5
WV 0 6.9 1873.9 6271.7 ... 1.8 480.0 27.7
1 1.3 716.4 503.6 ... 1.7 484.8 17.7
WY 0 2.8 2244.4 2744.7 ... NaN 540.0 NaN
[112 rows x 9 columns]
# pandas use the name of function as the name of returned column
# you can change the column name directly with the .rename method
# or you can modify the function attribute .__name__
max_deviation.__name__
Out[15]: 'max_deviation'
max_deviation.__name__='Max Deviation'
(college.groupby(['STABBR','RELAFFIL'])[['UGDS','SATVRMID','SATMTMID']]
.agg([max_deviation,'mean','std']).round(1))
Out[17]:
UGDS ... SATMTMID
Max Deviation mean std ... Max Deviation mean std
STABBR RELAFFIL ...
AK 0 2.1 3508.9 4539.5 ... NaN NaN NaN
1 1.1 123.3 132.9 ... NaN 503.0 NaN
AL 0 5.2 3248.8 5102.4 ... 1.7 515.8 56.7
1 2.4 979.7 870.8 ... 1.4 485.6 61.4
AR 0 5.8 1793.7 3401.6 ... 2.0 503.6 39.0
... ... ... ... ... ... ...
WI 0 5.3 2879.1 5031.5 ... 1.3 591.2 85.7
1 3.4 1716.2 1934.6 ... 1.8 526.6 42.5
WV 0 6.9 1873.9 6271.7 ... 1.8 480.0 27.7
1 1.3 716.4 503.6 ... 1.7 484.8 17.7
WY 0 2.8 2244.4 2744.7 ... NaN 540.0 NaN
[112 rows x 9 columns]
# define a function that returns the percentage of schools with an undergraduate population of between 1000 and 3000
def pct_between_1_3k(s):
    """Percentage of values in ``s`` lying in the closed interval [1000, 3000]."""
    in_range = s.between(1000, 3000)
    return in_range.mean() * 100
# Task: build a customized function for the college dataset that finds the percentage of school by states and religious affiliation that have an undergraduate population between two values
# calculate this percentage grouping by states and religious affiliation
(college
.groupby(["STABBR",'RELAFFIL'])
["UGDS"]
.agg(pct_between_1_3k)
.round(1))
Out[22]:
STABBR RELAFFIL
AK 0 14.3
1 0.0
AL 0 23.6
1 33.3
AR 0 27.9
...
WI 0 13.8
1 36.0
WV 0 24.6
1 37.5
WY 0 54.5
Name: UGDS, Length: 112, dtype: float64
# give the user more flexibility to choose the lower and upper bound
# create a new function that allows users to parameterize those bounds
def pct_between(s, low, high):
    """Percentage of values in ``s`` within the closed interval [low, high]."""
    return 100 * s.between(low, high).mean()
college.groupby(["STABBR",'RELAFFIL'])["UGDS"].agg(pct_between,1000,3000).round(1)
Out[26]:
STABBR RELAFFIL
AK 0 14.3
1 0.0
AL 0 23.6
1 33.3
AR 0 27.9
...
WI 0 13.8
1 36.0
WV 0 24.6
1 37.5
WY 0 54.5
Name: UGDS, Length: 112, dtype: float64
# the alternative method
college.groupby(["STABBR",'RELAFFIL'])["UGDS"].agg(pct_between,low=1000,high=3000).round(1)
Out[28]:
STABBR RELAFFIL
AK 0 14.3
1 0.0
AL 0 23.6
1 33.3
AR 0 27.9
...
WI 0 13.8
1 36.0
WV 0 24.6
1 37.5
WY 0 54.5
Name: UGDS, Length: 112, dtype: float64
Examining the groupby object
# the immediate result from using the .groupby method on a DataFrame is a 'groupby' object
# usually, we chain operations on this object to do aggregation or transformations without ever storing the intermediate values in variables
# now, examine the 'groupby' object to examine individual groups
grouped=college.groupby(["STABBR","RELAFFIL"])
type(grouped)
Out[34]: pandas.core.groupby.generic.DataFrameGroupBy
# use the dir function to discover the attributes of a 'groupby' object
print([attr for attr in dir(grouped) if not attr.startswith('_')])
['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'ewm', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sample', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']
# find the number of groups
grouped.ngroups
Out[39]: 112
# to find the uniquely identifying labels for each group, look in the .groups attribute
# ,which contains a dictionary of each unique group mapped to all the corresponding index labels of that group
# because we grouped by two columns, each of the keys has a tuple,
# one value for the 'STABBR' column and another for the 'RELAFFIL' column
grouped.groups
Out[44]: {('AK', 0): [60, 62, 63, 65, 66, 67, 5171], ('AK', 1): [61, 64, 5417], ('AL', 0): [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 25, 26, 27, 28, 29, 31, 32, 33, 34, 37, 38, 39, 40, 42, 44, 45, 48, 49, 50, 52, 55, 56, 57, 58, 59, 4307, 4415, 4483, 4484, 4851, 4872, 4986, 4987, 5202, ...], ...}
groups=list(grouped.groups)
groups
Out[46]:
[('AK', 0),
('AK', 1),
('AL', 0),
('AL', 1),
('AR', 0),
....
('AR', 1),
('AS', 0)]
groups[:6]
Out[47]: [('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]
# retrieve a single group with the '.get_group' method by passing it a tuple of an exact group label
# a tuple containing a unique combination of the values in the grouping columns
# the result is a DataFrame
grouped.get_group(('FL',1))
Out[50]:
INSTNM ... GRAD_DEBT_MDN_SUPP
712 The Baptist College of Florida ... 20052
713 Barry University ... 28250
714 Gooding Institute of Nurse Anesthesia ... PrivacySuppressed
715 Bethune-Cookman University ... 36250
724 Johnson University Florida ... 20199
... ... ...
7486 Strayer University-Coral Springs Campus ... 36173.5
7487 Strayer University-Fort Lauderdale Campus ... 36173.5
7488 Strayer University-Miramar Campus ... 36173.5
7489 Strayer University-Doral ... 36173.5
7490 Strayer University-Brickell ... 36173.5
[89 rows x 27 columns]
# take a peek at each individual group
# 'groupby' object are iterable
# when iterating through a 'groupby' object
# you are given a tuple containing the group name and the DataFrame
# with the grouping column moved into the index
for a in grouped:
print(a[0])
('AK', 0)
('AK', 1)
('AL', 0)
('AL', 1)
('AR', 0)
('AR', 1)
('AS', 0)
('AZ', 0)
a[1]
Out[61]:
INSTNM ... GRAD_DEBT_MDN_SUPP
4128 Casper College ... 10764
4129 Central Wyoming College ... 8757
4130 Eastern Wyoming College ... 10000
4131 Laramie County Community College ... 9750
4132 Northwest College ... 11100
4133 Sheridan College ... 7500
4134 Western Wyoming Community College ... 8000
4135 Cheeks International Academy of Beauty Culture... ... 8206
4136 Wyotech-Laramie ... 11599
4137 University of Wyoming ... 18750
5983 CollegeAmerica-Cheyenne ... 27235.5
[11 rows x 27 columns]
# this is the last DataFrame
# means this is the dataframe of the last group
# use .head() method on your 'groupby' object to get the first rows of each group together in a DataFrame
grouped.head(1)
Out[65]:
INSTNM ... GRAD_DEBT_MDN_SUPP
0 Alabama A & M University ... 33888
2 Amridge University ... 23370
43 Prince Institute-Southeast ... 20992
60 University of Alaska Anchorage ... 19449.5
61 Alaska Bible College ... PrivacySuppressed
... ... ...
4561 College of the Marshall Islands ... PrivacySuppressed
5289 Pacific Islands University ... PrivacySuppressed
6439 Touro University Nevada ... PrivacySuppressed
7404 University of the Virgin Islands-Albert A. Sheen ... 15150
7419 Computer Career Center-Las Cruces ... 14250
[112 rows x 27 columns]
# .nth method select those specific rows from each group when provided with a list of integer
# e.g. the following operation selects the first and the last rows from each group
grouped.nth([1,-1])
Out[68]:
INSTNM ... GRAD_DEBT_MDN_SUPP
STABBR RELAFFIL ...
AK 0 University of Alaska Fairbanks ... 19355
0 Ilisagvik College ... PrivacySuppressed
1 Alaska Pacific University ... 23250
1 Alaska Christian College ... PrivacySuppressed
AL 0 University of Alabama at Birmingham ... 21941.5
... ... ...
WV 0 BridgeValley Community & Technical College ... 9429.5
1 Appalachian Bible College ... 9300
1 West Virginia Business College-Nutter Fort ... 19258
WY 0 Central Wyoming College ... 8757
0 CollegeAmerica-Cheyenne ... 27235.5
[213 rows x 25 columns]
Filtering for states with a minority majority
minority majority 少数种族多数
gatekeeping 把关,守关,看门
# previously, use boolean array to filter rows
# in a similar fashion, when using the .groupby method , we can filter out groups
# .filter method of the group object accepts a function that must return either True or False to indicate
# whether a group is kept
# the .filter method applied after a call to .groupby method
# when .filter method is applied, the result does not use the grouping columns as index ,
# but keeps the original index
# the DataFrame .filter method filters columns, not values
# Task: find all the states that have more non-white undergraduates students than white
# keep all the rows from the states,as a whole, that have a minority majority
grouped=college.groupby('STABBR')
grouped.ngroups
Out[79]: 59
college['STABBR'].nunique()
Out[80]: 59
# verifying the same number
# the 'grouped' variable has a .filter method, which accepts a custom function that determines whether a group is kept
# the custom function accepts a DataFrame of the current group and returns a boolean
# define function that calculates the total percentage of minority students and returns True if this percentage is greater than a user-defined threshold
def check_minority(df, threshold):
    """Return True when the enrollment-weighted minority share exceeds ``threshold``.

    ``df`` is one group's DataFrame; UGDS_WHITE is the white fraction per
    school and UGDS the undergraduate headcount.
    """
    white_frac = df['UGDS_WHITE']
    enrollment = df['UGDS']
    minority_students = ((1 - white_frac) * enrollment).sum()
    return minority_students / enrollment.sum() > threshold
# use the .filter method passed with the 'check_minority' function and a threshold of 50%
# to find all states that have a minority majority
college_filtered=grouped.filter(check_minority,threshold=0.5)
college_filtered
Out[89]:
INSTNM ... GRAD_DEBT_MDN_SUPP
68 Everest College-Phoenix ... 9500
69 Collins College ... 47000
70 Empire Beauty School-Paradise Valley ... 9588
71 Empire Beauty School-Tucson ... 9833
72 Thunderbird School of Global Management ... PrivacySuppressed
... ... ...
7528 WestMed College - Merced ... 15623.5
7529 Vantage College ... 9500
7530 SAE Institute of Technology San Francisco ... 9500
7533 Bay Area Medical Academy - San Jose Satellite ... ... PrivacySuppressed
7534 Excel Learning Center-San Antonio South ... 12125
[3028 rows x 27 columns]
college.shape
Out[90]: (7535, 27)
college_filtered.shape
Out[92]: (3028, 27)
college_filtered["STABBR"].nunique()
Out[93]: 20
Transforming through a weight loss bet
backgroud:
tile 并排显示;
emulate v.仿真,努力赶上;
tally v. 合计,计算,吻合
import pandas as pd
import numpy as np
weight_loss=pd.read_csv('weight_loss.csv')
weight_loss.query("Month=='Jan' ")
Out[5]:
Name Month Week Weight
0 Bob Jan Week 1 291
1 Amy Jan Week 1 197
2 Bob Jan Week 2 288
3 Amy Jan Week 2 189
4 Bob Jan Week 3 283
5 Amy Jan Week 3 189
6 Bob Jan Week 4 283
7 Amy Jan Week 4 190
# calculate weight loss from the current week to the first week of each month
# create a function that is capable of providing weekly updates
# it will take a Series and returns a Series of the same size
def percent_loss(s):
    """Percent change of each element of ``s`` relative to its first element.

    Returns a Series of the same length and index, so it is safe to use
    with ``.transform``.
    """
    baseline = s.iloc[0]
    return (s - baseline) / baseline * 100
# test out this function for Bob during the month January
(weight_loss.query("Name=='Bob' and Month=='Jan'")
["Weight"].pipe(percent_loss))
Out[11]:
0 0.000000
2 -1.030928
4 -2.749141
6 -2.749141
Name: Weight, dtype: float64
# we can apply this function to every single combination of person and month
# to get the weight per week in relation to the first week of the month
# use .transform method to apply this custom function
# the function we pass to .transform method needs to maintain the index of the group that is passed into it
# so,we can use 'percent_loss' here
weight_loss.groupby(["Name","Month"])['Weight'].transform(percent_loss)
Out[18]:
0 0.000000
1 0.000000
2 -1.030928
3 -4.060914
4 -2.749141
5 -4.060914
6 -2.749141
7 -3.553299
8 0.000000
9 0.000000
10 -2.826855
11 -3.157895
12 -5.300353
13 -6.842105
14 -5.300353
15 -8.947368
16 0.000000
17 0.000000
18 1.119403
19 0.000000
20 -1.119403
21 -1.734104
22 -2.611940
23 -1.734104
24 0.000000
25 0.000000
26 -1.149425
27 -3.529412
28 -3.065134
29 -3.529412
30 -4.214559
31 -5.294118
Name: Weight, dtype: float64
# because it has the same index, we can insert it as a column
# the .transform method is useful for summarizing information from groups
# and then adding it back to the original DataFrame
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss)))
Out[22]:
Name Month Week Weight percent_loss
0 Bob Jan Week 1 291 0.000000
1 Amy Jan Week 1 197 0.000000
2 Bob Jan Week 2 288 -1.030928
3 Amy Jan Week 2 189 -4.060914
4 Bob Jan Week 3 283 -2.749141
5 Amy Jan Week 3 189 -4.060914
6 Bob Jan Week 4 283 -2.749141
7 Amy Jan Week 4 190 -3.553299
8 Bob Feb Week 1 283 0.000000
9 Amy Feb Week 1 190 0.000000
10 Bob Feb Week 2 275 -2.826855
11 Amy Feb Week 2 184 -3.157895
12 Bob Feb Week 3 268 -5.300353
13 Amy Feb Week 3 177 -6.842105
14 Bob Feb Week 4 268 -5.300353
15 Amy Feb Week 4 173 -8.947368
16 Bob Mar Week 1 268 0.000000
17 Amy Mar Week 1 173 0.000000
18 Bob Mar Week 2 271 1.119403
19 Amy Mar Week 2 173 0.000000
20 Bob Mar Week 3 265 -1.119403
21 Amy Mar Week 3 170 -1.734104
22 Bob Mar Week 4 261 -2.611940
23 Amy Mar Week 4 170 -1.734104
24 Bob Apr Week 1 261 0.000000
25 Amy Apr Week 1 170 0.000000
26 Bob Apr Week 2 258 -1.149425
27 Amy Apr Week 2 164 -3.529412
28 Bob Apr Week 3 253 -3.065134
29 Amy Apr Week 3 164 -3.529412
30 Bob Apr Week 4 250 -4.214559
31 Amy Apr Week 4 161 -5.294118
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss))
.query("Name=='Bob' and Month in ['Jan','Feb']")
)
Out[23]:
Name Month Week Weight percent_loss
0 Bob Jan Week 1 291 0.000000
2 Bob Jan Week 2 288 -1.030928
4 Bob Jan Week 3 283 -2.749141
6 Bob Jan Week 4 283 -2.749141
8 Bob Feb Week 1 283 0.000000
10 Bob Feb Week 2 275 -2.826855
12 Bob Feb Week 3 268 -5.300353
14 Bob Feb Week 4 268 -5.300353
# find the last week of every month to judge who is winner
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss).round(1))
.query("Week == 'Week 4' "))
Out[25]:
Name Month Week Weight percent_loss
6 Bob Jan Week 4 283 -2.7
7 Amy Jan Week 4 190 -3.6
14 Bob Feb Week 4 268 -5.3
15 Amy Feb Week 4 173 -8.9
22 Bob Mar Week 4 261 -2.6
23 Amy Mar Week 4 170 -1.7
30 Bob Apr Week 4 250 -4.2
31 Amy Apr Week 4 161 -5.3
# reshape this data with the .pivot method
# so that Amy's and Bob's percent weight loss is side by side for each month
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss).round(1))
.query("Week == 'Week 4' ")
.pivot(index='Month',columns='Name',values='percent_loss')
)
Out[28]:
Name Amy Bob
Month
Apr -5.3 -4.2
Feb -8.9 -5.3
Jan -3.6 -2.7
Mar -1.7 -2.6
# create a column, winner, with the name of winner
# Numpy has a vectorized 'if then else' function called 'where',
# which can map a Series or array of Boolean to other values
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss).round(1))
.query("Week == 'Week 4' ")
.pivot(index='Month',columns='Name',values='percent_loss')
.assign(winner=lambda df_:np.where(df_.Amy<df_.Bob,'Amy','Bob'))
)
Out[32]:
Name Amy Bob winner
Month
Apr -5.3 -4.2 Amy
Feb -8.9 -5.3 Amy
Jan -3.6 -2.7 Amy
Mar -1.7 -2.6 Bob
# use .value_counts method to return the final score as the number of months won
(weight_loss.assign(percent_loss=
weight_loss.groupby(['Name','Month'])
["Weight"].transform(percent_loss).round(1))
.query("Week == 'Week 4' ")
.pivot(index='Month',columns='Name',values='percent_loss')
.assign(winner=lambda df_:np.where(df_.Amy<df_.Bob,'Amy','Bob'))
.winner.value_counts())
Out[34]:
Amy 3
Bob 1
Name: winner, dtype: int64
# after pivoting, we utilize the NumPy 'where' function,
# whose first parameter is a condition that produces a Series of Boolean
# True get mapped to Amy