## groupby
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1' : ['a' , 'a' , 'b' , 'b' , 'a' ],
'key2' : ['one' , 'two' , 'one' , 'two' , 'one' ],
'data1' : np.random.randn(5 ),
'data2' : np.random.randn(5 )})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 key2 0 0.176816 0.713241 a one 1 1.080056 0.110213 a two 2 0.464790 -1.689117 b one 3 2.650545 -0.022297 b two 4 1.570758 0.371698 a one
grouped = df.data1.groupby(df.key1)
grouped
### mean
grouped.mean()
key1 a 0.942543 b 1.557667 Name: data1, dtype: float64 ### 多个key
means = df.data1.groupby([df.key1,df.key2]).mean()
means
key1 key2 a one 0.873787 two 1.080056 b one 0.464790 two 2.650545 Name: data1, dtype: float64 ### 行to列
means.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key2 one two key1 a 0.873787 1.080056 b 0.464790 2.650545
### 其他key
states = np.array(['Ohio' ,'California' ,'California' ,'Ohio' ,'Ohio' ])
years = np.array([2005 ,2005 ,2006 ,2005 ,2006 ])
df.data1.groupby([states,years]).mean()
California 2005 1.080056 2006 0.464790 Ohio 2005 1.413680 2006 1.570758 Name: data1, dtype: float64 ### key类型不限制
df.groupby('key1' ).mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 a 0.942543 0.398384 b 1.557667 -0.855707
df.groupby(['key1' ,'key2' ]).mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 key2 a one 0.873787 0.542469 two 1.080056 0.110213 b one 0.464790 -1.689117 two 2.650545 -0.022297
### size
df.groupby(['key1' ,'key2' ]).size()
key1 key2 a one 2 two 1 b one 1 two 1 dtype: int64 ### 迭代
for name,group in df.groupby('key1' ):
print(name)
print(group)
a data1 data2 key1 key2 0 0.176816 0.713241 a one 1 1.080056 0.110213 a two 4 1.570758 0.371698 a one b data1 data2 key1 key2 2 0.464790 -1.689117 b one 3 2.650545 -0.022297 b two ### 多个key
for (k1,k2),group in df.groupby(['key1' ,'key2' ]):
print((k1,k2))
print(group)
(‘a’, ‘one’) data1 data2 key1 key2 0 0.176816 0.713241 a one 4 1.570758 0.371698 a one (‘a’, ‘two’) data1 data2 key1 key2 1 1.080056 0.110213 a two (‘b’, ‘one’) data1 data2 key1 key2 2 0.46479 -1.689117 b one (‘b’, ‘two’) data1 data2 key1 key2 3 2.650545 -0.022297 b two ### dict
pieces = dict(list(df.groupby('key1' )))
pieces['b' ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 key2 2 0.464790 -1.689117 b one 3 2.650545 -0.022297 b two
df.dtypes
data1 float64 data2 float64 key1 object key2 object dtype: object ### 轴
grouped = df.groupby(df.dtypes,axis=1 )
for dtype,group in grouped:
print(dtype)
print(group)
float64 data1 data2 0 0.176816 0.713241 1 1.080056 0.110213 2 0.464790 -1.689117 3 2.650545 -0.022297 4 1.570758 0.371698 object key1 key2 0 a one 1 a two 2 b one 3 b two 4 a one ### 列
df.groupby('key1' )['data1' ]
df.groupby('key1' )[['data2' ]]
df['data1' ].groupby(df['key1' ])
df[['data2' ]].groupby(df['key1' ])
df.groupby(['key1' ,'key2' ])[['data2' ]].mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data2 key1 key2 a one 0.542469 two 0.110213 b one -1.689117 two -0.022297
s_grouped = df.groupby(['key1' ,'key2' ])['data2' ]
s_grouped
s_grouped.mean()
key1 key2 a one 0.542469 two 0.110213 b one -1.689117 two -0.022297 Name: data2, dtype: float64 ### dict和series
people = pd.DataFrame(np.random.randn(5 , 5 ), columns=['a' , 'b' , 'c' , 'd' , 'e' ],index=['Joe' , 'Steve' , 'Wes' , 'Jim' , 'Travis' ])
people.iloc[2 :3 ,[1 ,2 ]] = np.nan
people
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e Joe 0.014565 0.122691 -1.900661 0.934866 0.929380 Steve 0.229198 -0.815473 1.129672 -0.405952 -0.194390 Wes 0.221778 NaN NaN 0.139841 0.913400 Jim -0.536079 0.153632 0.593958 0.545882 -0.272977 Travis 0.468627 1.192111 -0.117644 -0.976098 -0.059373
#### map
mapping = {'a' : 'red' , 'b' : 'red' , 'c' : 'blue' , 'd' : 'blue' , 'e' : 'red' , 'f' : 'orange' }
by_column = people.groupby(mapping,axis=1 )
by_column.sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
blue red Joe -0.965795 1.066635 Steve 0.723719 -0.780666 Wes 0.139841 1.135178 Jim 1.139840 -0.655424 Travis -1.093742 1.601365
#### series
mao_series = pd.Series(mapping)
mao_series
a red b red c blue d blue e red f orange dtype: object
people.groupby(mao_series,axis=1 ).count()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
blue red Joe 2 3 Steve 2 3 Wes 1 2 Jim 2 3 Travis 2 3
### 函数
people.groupby(len).sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 3 -0.299736 0.276323 -1.306703 1.620590 1.569803 5 0.229198 -0.815473 1.129672 -0.405952 -0.194390 6 0.468627 1.192111 -0.117644 -0.976098 -0.059373
key_list = ['one' ,'one' ,'one' ,'two' ,'two' ]
people.groupby([len,key_list]).min()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
a b c d e 3 one 0.014565 0.122691 -1.900661 0.139841 0.913400 two -0.536079 0.153632 0.593958 0.545882 -0.272977 5 one 0.229198 -0.815473 1.129672 -0.405952 -0.194390 6 two 0.468627 1.192111 -0.117644 -0.976098 -0.059373
### 行索引
columns = pd.MultiIndex.from_arrays([['US' , 'US' , 'US' , 'JP' , 'JP' ], [1 , 3 , 5 , 1 , 3 ]],names=['cty' , 'tenor' ])
hier_df = pd.DataFrame(np.random.randn(4 , 5 ), columns=columns)
hier_df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
cty US JP tenor 1 3 5 1 3 0 -1.560363 0.194168 0.657529 1.388475 1.330651 1 0.178243 0.710925 -0.092163 0.523347 1.669727 2 -0.835299 2.223159 -0.992638 -1.314376 -0.385737 3 -0.523334 0.610515 1.354068 -1.395062 0.580230
hier_df.groupby(level = 'cty' ,axis=1 ).count()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
## Aggregation groupby methods: Function name Description count Number of non-NA values in the group sum Sum of non-NA values mean Mean of non-NA values median Arithmetic median of non-NA values std, var Unbiased (n – 1 denominator) standard deviation and variance min, max Minimum and maximum of non-NA values prod Product of non-NA values first, last First and last non-NA values
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 key2 0 0.176816 0.713241 a one 1 1.080056 0.110213 a two 2 0.464790 -1.689117 b one 3 2.650545 -0.022297 b two 4 1.570758 0.371698 a one
grouped = df.groupby('key1' )
grouped.data1.quantile(0.9 )
key1 a 1.472618 b 2.431969 Name: data1, dtype: float64
def peak_to_peak (arr) :
return arr.max() - arr.min()
grouped.agg(peak_to_peak)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1 data2 key1 a 1.393942 0.603028 b 2.185755 1.666820
grouped.describe()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
data1 data2 count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max key1 a 3.0 0.942543 0.707072 0.176816 0.628436 1.080056 1.325407 1.570758 3.0 0.398384 0.302398 0.110213 0.240955 0.371698 0.542469 0.713241 b 2.0 1.557667 1.545562 0.464790 1.011228 1.557667 2.104106 2.650545 2.0 -0.855707 1.178620 -1.689117 -1.272412 -0.855707 -0.439002 -0.022297
### 复杂函数应用
tips = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/tips.csv' )
tips['tip_pct' ] = tips.tip / tips.total_bill
tips[:6 ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct 0 16.99 1.01 No Sun Dinner 2 0.059447 1 10.34 1.66 No Sun Dinner 3 0.160542 2 21.01 3.50 No Sun Dinner 3 0.166587 3 23.68 3.31 No Sun Dinner 2 0.139780 4 24.59 3.61 No Sun Dinner 4 0.146808 5 25.29 4.71 No Sun Dinner 4 0.186240
grouped = tips.groupby(['day' ,'smoker' ])
grouped_pct = grouped['tip_pct' ]
grouped_pct.agg('mean' )
day smoker Fri No 0.151650 Yes 0.174783 Sat No 0.158048 Yes 0.147906 Sun No 0.160113 Yes 0.187250 Thur No 0.160298 Yes 0.163863 Name: tip_pct, dtype: float64
grouped_pct.agg(['mean' ,'std' ,peak_to_peak])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
mean std peak_to_peak day smoker Fri No 0.151650 0.028123 0.067349 Yes 0.174783 0.051293 0.159925 Sat No 0.158048 0.039767 0.235193 Yes 0.147906 0.061375 0.290095 Sun No 0.160113 0.042347 0.193226 Yes 0.187250 0.154134 0.644685 Thur No 0.160298 0.038774 0.193350 Yes 0.163863 0.039389 0.151240
grouped_pct.agg([('foo' ,'mean' ),('bar' ,np.std)])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
foo bar day smoker Fri No 0.151650 0.028123 Yes 0.174783 0.051293 Sat No 0.158048 0.039767 Yes 0.147906 0.061375 Sun No 0.160113 0.042347 Yes 0.187250 0.154134 Thur No 0.160298 0.038774 Yes 0.163863 0.039389
functions = ['count' ,'mean' ,'max' ]
result = grouped['tip_pct' ,'total_bill' ].agg(functions)
result
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
tip_pct total_bill count mean max count mean max day smoker Fri No 4 0.151650 0.187735 4 18.420000 22.75 Yes 15 0.174783 0.263480 15 16.813333 40.17 Sat No 45 0.158048 0.291990 45 19.661778 48.33 Yes 42 0.147906 0.325733 42 21.276667 50.81 Sun No 57 0.160113 0.252672 57 20.506667 48.17 Yes 19 0.187250 0.710345 19 24.120000 45.35 Thur No 45 0.160298 0.266312 45 17.113111 41.19 Yes 17 0.163863 0.241255 17 19.190588 43.11
result['tip_pct' ]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
count mean max day smoker Fri No 4 0.151650 0.187735 Yes 15 0.174783 0.263480 Sat No 45 0.158048 0.291990 Yes 42 0.147906 0.325733 Sun No 57 0.160113 0.252672 Yes 19 0.187250 0.710345 Thur No 45 0.160298 0.266312 Yes 17 0.163863 0.241255
ftuples = [('Durchschnitt' ,'mean' ,),('Abweichung' ,np.var)]
grouped.agg({'tip' :np.max,'size' :'sum' })
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
tip size day smoker Fri No 3.50 9 Yes 4.73 31 Sat No 9.00 115 Yes 10.00 104 Sun No 6.00 167 Yes 6.50 49 Thur No 6.70 112 Yes 5.00 40
grouped.agg({'tip_pct' :['min' ,'max' ,'mean' ,'std' ],'size' :'sum' })
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
tip_pct size min max mean std sum day smoker Fri No 0.120385 0.187735 0.151650 0.028123 9 Yes 0.103555 0.263480 0.174783 0.051293 31 Sat No 0.056797 0.291990 0.158048 0.039767 115 Yes 0.035638 0.325733 0.147906 0.061375 104 Sun No 0.059447 0.252672 0.160113 0.042347 167 Yes 0.065660 0.710345 0.187250 0.154134 49 Thur No 0.072961 0.266312 0.160298 0.038774 112 Yes 0.090014 0.241255 0.163863 0.039389 40
### 原始索引
tips.groupby(['day' ,'smoker' ],as_index=False ).mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
day smoker total_bill tip size tip_pct 0 Fri No 18.420000 2.812500 2.250000 0.151650 1 Fri Yes 16.813333 2.714000 2.066667 0.174783 2 Sat No 19.661778 3.102889 2.555556 0.158048 3 Sat Yes 21.276667 2.875476 2.476190 0.147906 4 Sun No 20.506667 3.167895 2.929825 0.160113 5 Sun Yes 24.120000 3.516842 2.578947 0.187250 6 Thur No 17.113111 2.673778 2.488889 0.160298 7 Thur Yes 19.190588 3.030000 2.352941 0.163863
## split-apply-combine
def top (df, n=5 , column='tip_pct' ) :
....: return df.sort_values(by=column)[-n:]
top(tips,n=6 )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 232 11.61 3.39 No Sat Dinner 2 0.291990 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips.groupby('smoker' ).apply(top)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct smoker No 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct 0 16.99 1.01 No Sun Dinner 2 0.059447 1 10.34 1.66 No Sun Dinner 3 0.160542 2 21.01 3.50 No Sun Dinner 3 0.166587 3 23.68 3.31 No Sun Dinner 2 0.139780 4 24.59 3.61 No Sun Dinner 4 0.146808 5 25.29 4.71 No Sun Dinner 4 0.186240 6 8.77 2.00 No Sun Dinner 2 0.228050 7 26.88 3.12 No Sun Dinner 4 0.116071 8 15.04 1.96 No Sun Dinner 2 0.130319 9 14.78 3.23 No Sun Dinner 2 0.218539 10 10.27 1.71 No Sun Dinner 2 0.166504 11 35.26 5.00 No Sun Dinner 4 0.141804 12 15.42 1.57 No Sun Dinner 2 0.101816 13 18.43 3.00 No Sun Dinner 4 0.162778 14 14.83 3.02 No Sun Dinner 2 0.203641 15 21.58 3.92 No Sun Dinner 2 0.181650 16 10.33 1.67 No Sun Dinner 3 0.161665 17 16.29 3.71 No Sun Dinner 3 0.227747 18 16.97 3.50 No Sun Dinner 3 0.206246 19 20.65 3.35 No Sat Dinner 3 0.162228 20 17.92 4.08 No Sat Dinner 2 0.227679 21 20.29 2.75 No Sat Dinner 2 0.135535 22 15.77 2.23 No Sat Dinner 2 0.141408 23 39.42 7.58 No Sat Dinner 4 0.192288 24 19.82 3.18 No Sat Dinner 2 0.160444 25 17.81 2.34 No Sat Dinner 4 0.131387 26 13.37 2.00 No Sat Dinner 2 0.149589 27 12.69 2.00 No Sat Dinner 2 0.157604 28 21.70 4.30 No Sat Dinner 2 0.198157 29 19.65 3.00 No Sat Dinner 2 0.152672 … … … … … … … … 214 28.17 6.50 Yes Sat Dinner 3 0.230742 215 12.90 1.10 Yes Sat Dinner 2 0.085271 216 28.15 3.00 Yes Sat Dinner 5 0.106572 217 11.59 1.50 Yes Sat Dinner 2 0.129422 218 7.74 1.44 Yes Sat Dinner 2 0.186047 219 30.14 3.09 Yes Sat Dinner 4 0.102522 220 12.16 2.20 Yes Fri Lunch 2 0.180921 221 13.42 3.48 Yes Fri Lunch 2 0.259314 222 8.58 1.92 Yes Fri Lunch 1 0.223776 223 15.98 3.00 No Fri Lunch 3 0.187735 224 13.42 1.58 Yes Fri Lunch 2 0.117735 225 16.27 2.50 Yes Fri Lunch 2 0.153657 226 10.09 2.00 Yes Fri Lunch 2 0.198216 227 20.45 3.00 No Sat Dinner 4 0.146699 228 13.28 2.72 No Sat Dinner 2 0.204819 229 22.12 2.88 Yes Sat Dinner 2 0.130199 230 24.01 2.00 Yes Sat Dinner 4 0.083299 231 15.69 3.00 Yes Sat Dinner 3 0.191205 232 11.61 3.39 No Sat Dinner 2 0.291990 233 10.77 1.47 No Sat Dinner 2 0.136490 234 15.53 3.00 Yes Sat Dinner 2 0.193175 235 10.07 1.25 No Sat Dinner 2 0.124131 236 12.60 1.00 Yes Sat Dinner 2 0.079365 237 32.83 1.17 Yes Sat Dinner 2 0.035638 238 35.83 4.67 No Sat Dinner 3 0.130338 239 29.03 5.92 No Sat Dinner 3 0.203927 240 27.18 2.00 Yes Sat Dinner 2 0.073584 241 22.67 2.00 Yes Sat Dinner 2 0.088222 242 17.82 1.75 No Sat Dinner 2 0.098204 243 18.78 3.00 No Thur Dinner 2 0.159744
244 rows × 7 columns
tips.groupby(['smoker' ,'day' ]).apply(top,n=1 ,column = 'total_bill' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct smoker day No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857 Sat 212 48.33 9.00 No Sat Dinner 4 0.186220 Sun 156 48.17 5.00 No Sun Dinner 6 0.103799 Thur 142 41.19 5.00 No Thur Lunch 5 0.121389 Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750 Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812 Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178 Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
result = tips.groupby('smoker' )['tip_pct' ].describe()
result
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
count mean std min 25% 50% 75% max smoker No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014 0.291990 Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 0.710345
result.unstack('smoker' )
smoker count No 151.000000 Yes 93.000000 mean No 0.159328 Yes 0.163196 std No 0.039910 Yes 0.085119 min No 0.056797 Yes 0.035638 25% No 0.136906 Yes 0.106771 50% No 0.155625 Yes 0.153846 75% No 0.185014 Yes 0.195059 max No 0.291990 Yes 0.710345 dtype: float64 #### lambda
f = lambda x:x.describe()
grouped.apply(f)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip size tip_pct day smoker Fri No count 4.000000 4.000000 4.000000 4.000000 mean 18.420000 2.812500 2.250000 0.151650 std 5.059282 0.898494 0.500000 0.028123 min 12.460000 1.500000 2.000000 0.120385 25% 15.100000 2.625000 2.000000 0.137239 50% 19.235000 3.125000 2.000000 0.149241 75% 22.555000 3.312500 2.250000 0.163652 max 22.750000 3.500000 3.000000 0.187735 Yes count 15.000000 15.000000 15.000000 15.000000 mean 16.813333 2.714000 2.066667 0.174783 std 9.086388 1.077668 0.593617 0.051293 min 5.750000 1.000000 1.000000 0.103555 25% 11.690000 1.960000 2.000000 0.133739 50% 13.420000 2.500000 2.000000 0.173913 75% 18.665000 3.240000 2.000000 0.209240 max 40.170000 4.730000 4.000000 0.263480 Sat No count 45.000000 45.000000 45.000000 45.000000 mean 19.661778 3.102889 2.555556 0.158048 std 8.939181 1.642088 0.784960 0.039767 min 7.250000 1.000000 1.000000 0.056797 25% 14.730000 2.010000 2.000000 0.136240 50% 17.820000 2.750000 2.000000 0.150152 75% 20.650000 3.390000 3.000000 0.183915 max 48.330000 9.000000 4.000000 0.291990 Yes count 42.000000 42.000000 42.000000 42.000000 mean 21.276667 2.875476 2.476190 0.147906 std 10.069138 1.630580 0.862161 0.061375 min 3.070000 1.000000 1.000000 0.035638 25% 13.405000 2.000000 2.000000 0.091797 50% 20.390000 2.690000 2.000000 0.153624 … … … … … … … Sun No std 8.130189 1.224785 1.032674 0.042347 min 8.770000 1.010000 2.000000 0.059447 25% 14.780000 2.000000 2.000000 0.139780 50% 18.430000 3.020000 3.000000 0.161665 75% 25.000000 3.920000 4.000000 0.185185 max 48.170000 6.000000 6.000000 0.252672 Yes count 19.000000 19.000000 19.000000 19.000000 mean 24.120000 3.516842 2.578947 0.187250 std 10.442511 1.261151 0.901591 0.154134 min 7.250000 1.500000 2.000000 0.065660 25% 17.165000 3.000000 2.000000 0.097723 50% 23.100000 3.500000 2.000000 0.138122 75% 32.375000 4.000000 3.000000 0.215325 max 45.350000 6.500000 5.000000 0.710345 Thur No count 45.000000 45.000000 45.000000 45.000000 mean 17.113111 2.673778 2.488889 0.160298 std 7.721728 1.282964 1.179796 0.038774 min 7.510000 1.250000 1.000000 0.072961 25% 11.690000 1.800000 2.000000 0.137741 50% 15.950000 2.180000 2.000000 0.153492 75% 20.270000 3.000000 2.000000 0.184843 max 41.190000 6.700000 6.000000 0.266312 Yes count 17.000000 17.000000 17.000000 17.000000 mean 19.190588 3.030000 2.352941 0.163863 std 8.355149 1.113491 0.701888 0.039389 min 10.340000 2.000000 2.000000 0.090014 25% 13.510000 2.000000 2.000000 0.148038 50% 16.470000 2.560000 2.000000 0.153846 75% 19.810000 4.000000 2.000000 0.194837 max 43.110000 5.000000 4.000000 0.241255
64 rows × 4 columns
### 抑制关键字
tips.groupby('smoker' ,group_keys = False ).apply(top)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_bill tip smoker day time size tip_pct 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
### 填充缺失值
s = pd.Series(np.random.randn(6 ))
s[::2 ] = np.nan
s
0 NaN 1 -0.975734 2 NaN 3 -0.359448 4 NaN 5 1.577221 dtype: float64
s.fillna(s.mean())
0 0.080680 1 -0.975734 2 0.080680 3 -0.359448 4 0.080680 5 1.577221 dtype: float64
states = ['Ohio' , 'New York' , 'Vermont' , 'Florida' ,
'Oregon' , 'Nevada' , 'California' , 'Idaho' ]
group_key = ['East' ] * 4 + ['West' ] * 4
data = pd.Series(np.random.randn(8 ),index = states)
data
Ohio -0.017642 New York -0.655295 Vermont 0.469594 Florida -0.161041 Oregon 0.795937 Nevada 1.144693 California -1.096555 Idaho 0.166461 dtype: float64
data[['Vermont' ,'Nevada' ,'Idaho' ]] = np.nan
data
Ohio -0.017642 New York -0.655295 Vermont NaN Florida -0.161041 Oregon 0.795937 Nevada NaN California -1.096555 Idaho NaN dtype: float64
data.groupby(group_key).mean()
East -0.277993 West -0.150309 dtype: float64
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio -0.017642 New York -0.655295 Vermont -0.277993 Florida -0.161041 Oregon 0.795937 Nevada -0.150309 California -1.096555 Idaho -0.150309 dtype: float64 ## 透视表
tips.pivot_table(index = ['day' ,'smoker' ])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
size tip tip_pct total_bill day smoker Fri No 2.250000 2.812500 0.151650 18.420000 Yes 2.066667 2.714000 0.174783 16.813333 Sat No 2.555556 3.102889 0.158048 19.661778 Yes 2.476190 2.875476 0.147906 21.276667 Sun No 2.929825 3.167895 0.160113 20.506667 Yes 2.578947 3.516842 0.187250 24.120000 Thur No 2.488889 2.673778 0.160298 17.113111 Yes 2.352941 3.030000 0.163863 19.190588
tips.pivot_table(['tip_pct' ,'size' ],index=['time' ,'day' ],columns = 'smoker' )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
size tip_pct smoker No Yes No Yes time day Dinner Fri 2.000000 2.222222 0.139622 0.165347 Sat 2.555556 2.476190 0.158048 0.147906 Sun 2.929825 2.578947 0.160113 0.187250 Thur 2.000000 NaN 0.159744 NaN Lunch Fri 3.000000 1.833333 0.187735 0.188937 Thur 2.500000 2.352941 0.160311 0.163863
tips.pivot_table(['tip_pct' ,'size' ],index=['time' ,'day' ],columns='smoker' ,margins=True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
size tip_pct smoker No Yes All No Yes All time day Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916 Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152 Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897 Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744 Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765 Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301 All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
tips.pivot_table('tip_pct' ,index=['time' ,'smoker' ],columns='day' ,aggfunc=len,margins=True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
day Fri Sat Sun Thur All time smoker Dinner No 3.0 45.0 57.0 1.0 106.0 Yes 9.0 42.0 19.0 NaN 70.0 Lunch No 1.0 NaN NaN 44.0 45.0 Yes 6.0 NaN NaN 17.0 23.0 All 19.0 87.0 76.0 62.0 244.0
tips.pivot_table('tip_pct' ,index=['time' ,'smoker' ],columns='day' ,aggfunc='mean' ,fill_value=0 ,margins=True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
day Fri Sat Sun Thur All time smoker Dinner No 0.139622 0.158048 0.160113 0.159744 0.158653 Yes 0.165347 0.147906 0.187250 0.000000 0.160828 Lunch No 0.187735 0.000000 0.000000 0.160311 0.160920 Yes 0.188937 0.000000 0.000000 0.163863 0.170404 All 0.169913 0.153152 0.166897 0.161276 0.160803
method: Function name Description values Column name or names to aggregate; by default aggregates all numeric columns index Column names or other group keys to group on the rows of the resulting pivot table columns Column names or other group keys to group on the columns of the resulting pivot table aggfunc Aggregation function or list of functions (‘mean’ by default); can be any function valid in a groupby context fill_value Replace missing values in result table dropna If True, do not include columns whose entries are all NA margins Add row/column subtotals and grand total (False by default)
#### crosstab
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True )
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
smoker No Yes All time day Dinner Fri 3 9 12 Sat 45 42 87 Sun 57 19 76 Thur 1 0 1 Lunch Fri 1 6 7 Thur 44 17 61 All 151 93 244