利用Python进行数据分析第二版之group basic

## groupby
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
 'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2key1key2
00.1768160.713241aone
11.0800560.110213atwo
20.464790-1.689117bone
32.650545-0.022297btwo
41.5707580.371698aone
grouped = df.data1.groupby(df.key1)
grouped
### mean
grouped.mean()
key1 a 0.942543 b 1.557667 Name: data1, dtype: float64 ### 多个key
means = df.data1.groupby([df.key1,df.key2]).mean()
means
key1 key2 a one 0.873787 two 1.080056 b one 0.464790 two 2.650545 Name: data1, dtype: float64 ### 行to列
means.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key2onetwo
key1
a0.8737871.080056
b0.4647902.650545
### 其他key
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df.data1.groupby([states,years]).mean()
California 2005 1.080056 2006 0.464790 Ohio 2005 1.413680 2006 1.570758 Name: data1, dtype: float64 ### key类型不限制
df.groupby('key1').mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2
key1
a0.9425430.398384
b1.557667-0.855707
df.groupby(['key1','key2']).mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2
key1key2
aone0.8737870.542469
two1.0800560.110213
bone0.464790-1.689117
two2.650545-0.022297
### size
df.groupby(['key1','key2']).size()
key1 key2 a one 2 two 1 b one 1 two 1 dtype: int64 ### 迭代
for name,group in df.groupby('key1'):
    print(name)
    print(group)
a data1 data2 key1 key2 0 0.176816 0.713241 a one 1 1.080056 0.110213 a two 4 1.570758 0.371698 a one b data1 data2 key1 key2 2 0.464790 -1.689117 b one 3 2.650545 -0.022297 b two ### 多个key
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)
(‘a’, ‘one’) data1 data2 key1 key2 0 0.176816 0.713241 a one 4 1.570758 0.371698 a one (‘a’, ‘two’) data1 data2 key1 key2 1 1.080056 0.110213 a two (‘b’, ‘one’) data1 data2 key1 key2 2 0.46479 -1.689117 b one (‘b’, ‘two’) data1 data2 key1 key2 3 2.650545 -0.022297 b two ### dict
pieces = dict(list(df.groupby('key1')))
pieces['b']
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2key1key2
20.464790-1.689117bone
32.650545-0.022297btwo
df.dtypes
data1 float64 data2 float64 key1 object key2 object dtype: object ### 轴
grouped = df.groupby(df.dtypes,axis=1)
for dtype,group in grouped:
    print(dtype)
    print(group)
float64 data1 data2 0 0.176816 0.713241 1 1.080056 0.110213 2 0.464790 -1.689117 3 2.650545 -0.022297 4 1.570758 0.371698 object key1 key2 0 a one 1 a two 2 b one 3 b two 4 a one ### 列
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
df.groupby(['key1','key2'])[['data2']].mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data2
key1key2
aone0.542469
two0.110213
bone-1.689117
two-0.022297
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
s_grouped.mean()
key1 key2 a one 0.542469 two 0.110213 b one -1.689117 two -0.022297 Name: data2, dtype: float64 ### dict和series
 people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3,[1,2]] = np.nan
people
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcde
Joe0.0145650.122691-1.9006610.9348660.929380
Steve0.229198-0.8154731.129672-0.405952-0.194390
Wes0.221778NaNNaN0.1398410.913400
Jim-0.5360790.1536320.5939580.545882-0.272977
Travis0.4686271.192111-0.117644-0.976098-0.059373
#### map
 mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bluered
Joe-0.9657951.066635
Steve0.723719-0.780666
Wes0.1398411.135178
Jim1.139840-0.655424
Travis-1.0937421.601365
#### series
mao_series = pd.Series(mapping)
mao_series
a red b red c blue d blue e red f orange dtype: object
people.groupby(mao_series,axis=1).count()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bluered
Joe23
Steve23
Wes12
Jim23
Travis23
### 函数
people.groupby(len).sum()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcde
3-0.2997360.276323-1.3067031.6205901.569803
50.229198-0.8154731.129672-0.405952-0.194390
60.4686271.192111-0.117644-0.976098-0.059373
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcde
3one0.0145650.122691-1.9006610.1398410.913400
two-0.5360790.1536320.5939580.545882-0.272977
5one0.229198-0.8154731.129672-0.405952-0.194390
6two0.4686271.192111-0.117644-0.976098-0.059373
### 行索引
 columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],names=['cty', 'tenor'])
 hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
ctyUSJP
tenor13513
0-1.5603630.1941680.6575291.3884751.330651
10.1782430.710925-0.0921630.5233471.669727
2-0.8352992.223159-0.992638-1.314376-0.385737
3-0.5233340.6105151.354068-1.3950620.580230
hier_df.groupby(level = 'cty',axis=1).count()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
ctyJPUS
023
123
223
323
## Aggregation groupby methods: Function name Description count Number of non-NA values in the group sum Sum of non-NA values mean Mean of non-NA values median Arithmetic median of non-NA values std, var Unbiased (n – 1 denominator) standard deviation and variance min, max Minimum and maximum of non-NA values prod Product of non-NA values first, last First and last non-NA values
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2key1key2
00.1768160.713241aone
11.0800560.110213atwo
20.464790-1.689117bone
32.650545-0.022297btwo
41.5707580.371698aone
grouped = df.groupby('key1')
grouped.data1.quantile(0.9)
key1 a 1.472618 b 2.431969 Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1data2
key1
a1.3939420.603028
b2.1857551.666820
grouped.describe()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
data1data2
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
key1
a3.00.9425430.7070720.1768160.6284361.0800561.3254071.5707583.00.3983840.3023980.1102130.2409550.3716980.5424690.713241
b2.01.5576671.5455620.4647901.0112281.5576672.1041062.6505452.0-0.8557071.178620-1.689117-1.272412-0.855707-0.439002-0.022297
### 复杂函数应用
tips = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/tips.csv')
tips['tip_pct'] = tips.tip / tips.total_bill
tips[:6]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
016.991.01NoSunDinner20.059447
110.341.66NoSunDinner30.160542
221.013.50NoSunDinner30.166587
323.683.31NoSunDinner20.139780
424.593.61NoSunDinner40.146808
525.294.71NoSunDinner40.186240
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker Fri No 0.151650 Yes 0.174783 Sat No 0.158048 Yes 0.147906 Sun No 0.160113 Yes 0.187250 Thur No 0.160298 Yes 0.163863 Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
meanstdpeak_to_peak
daysmoker
FriNo0.1516500.0281230.067349
Yes0.1747830.0512930.159925
SatNo0.1580480.0397670.235193
Yes0.1479060.0613750.290095
SunNo0.1601130.0423470.193226
Yes0.1872500.1541340.644685
ThurNo0.1602980.0387740.193350
Yes0.1638630.0393890.151240
grouped_pct.agg([('foo','mean'),('bar',np.std)])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
foobar
daysmoker
FriNo0.1516500.028123
Yes0.1747830.051293
SatNo0.1580480.039767
Yes0.1479060.061375
SunNo0.1601130.042347
Yes0.1872500.154134
ThurNo0.1602980.038774
Yes0.1638630.039389
functions = ['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
tip_pcttotal_bill
countmeanmaxcountmeanmax
daysmoker
FriNo40.1516500.187735418.42000022.75
Yes150.1747830.2634801516.81333340.17
SatNo450.1580480.2919904519.66177848.33
Yes420.1479060.3257334221.27666750.81
SunNo570.1601130.2526725720.50666748.17
Yes190.1872500.7103451924.12000045.35
ThurNo450.1602980.2663124517.11311141.19
Yes170.1638630.2412551719.19058843.11
result['tip_pct']
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
countmeanmax
daysmoker
FriNo40.1516500.187735
Yes150.1747830.263480
SatNo450.1580480.291990
Yes420.1479060.325733
SunNo570.1601130.252672
Yes190.1872500.710345
ThurNo450.1602980.266312
Yes170.1638630.241255
ftuples = [('Durchschnitt','mean',),('Abweichung',np.var)]
grouped.agg({'tip':np.max,'size':'sum'})
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
tipsize
daysmoker
FriNo3.509
Yes4.7331
SatNo9.00115
Yes10.00104
SunNo6.00167
Yes6.5049
ThurNo6.70112
Yes5.0040
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
tip_pctsize
minmaxmeanstdsum
daysmoker
FriNo0.1203850.1877350.1516500.0281239
Yes0.1035550.2634800.1747830.05129331
SatNo0.0567970.2919900.1580480.039767115
Yes0.0356380.3257330.1479060.061375104
SunNo0.0594470.2526720.1601130.042347167
Yes0.0656600.7103450.1872500.15413449
ThurNo0.0729610.2663120.1602980.038774112
Yes0.0900140.2412550.1638630.03938940
### 原始索引
tips.groupby(['day','smoker'],as_index=False).mean()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
daysmokertotal_billtipsizetip_pct
0FriNo18.4200002.8125002.2500000.151650
1FriYes16.8133332.7140002.0666670.174783
2SatNo19.6617783.1028892.5555560.158048
3SatYes21.2766672.8754762.4761900.147906
4SunNo20.5066673.1678952.9298250.160113
5SunYes24.1200003.5168422.5789470.187250
6ThurNo17.1131112.6737782.4888890.160298
7ThurYes19.1905883.0300002.3529410.163863
## split-apply-combine
def top(df, n=5, column='tip_pct'):
   ....:     return df.sort_values(by=column)[-n:]
top(tips,n=6)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
23211.613.39NoSatDinner20.291990
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips.groupby('smoker').apply(top)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
smoker
No8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
Yes10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
016.991.01NoSunDinner20.059447
110.341.66NoSunDinner30.160542
221.013.50NoSunDinner30.166587
323.683.31NoSunDinner20.139780
424.593.61NoSunDinner40.146808
525.294.71NoSunDinner40.186240
68.772.00NoSunDinner20.228050
726.883.12NoSunDinner40.116071
815.041.96NoSunDinner20.130319
914.783.23NoSunDinner20.218539
1010.271.71NoSunDinner20.166504
1135.265.00NoSunDinner40.141804
1215.421.57NoSunDinner20.101816
1318.433.00NoSunDinner40.162778
1414.833.02NoSunDinner20.203641
1521.583.92NoSunDinner20.181650
1610.331.67NoSunDinner30.161665
1716.293.71NoSunDinner30.227747
1816.973.50NoSunDinner30.206246
1920.653.35NoSatDinner30.162228
2017.924.08NoSatDinner20.227679
2120.292.75NoSatDinner20.135535
2215.772.23NoSatDinner20.141408
2339.427.58NoSatDinner40.192288
2419.823.18NoSatDinner20.160444
2517.812.34NoSatDinner40.131387
2613.372.00NoSatDinner20.149589
2712.692.00NoSatDinner20.157604
2821.704.30NoSatDinner20.198157
2919.653.00NoSatDinner20.152672
21428.176.50YesSatDinner30.230742
21512.901.10YesSatDinner20.085271
21628.153.00YesSatDinner50.106572
21711.591.50YesSatDinner20.129422
2187.741.44YesSatDinner20.186047
21930.143.09YesSatDinner40.102522
22012.162.20YesFriLunch20.180921
22113.423.48YesFriLunch20.259314
2228.581.92YesFriLunch10.223776
22315.983.00NoFriLunch30.187735
22413.421.58YesFriLunch20.117735
22516.272.50YesFriLunch20.153657
22610.092.00YesFriLunch20.198216
22720.453.00NoSatDinner40.146699
22813.282.72NoSatDinner20.204819
22922.122.88YesSatDinner20.130199
23024.012.00YesSatDinner40.083299
23115.693.00YesSatDinner30.191205
23211.613.39NoSatDinner20.291990
23310.771.47NoSatDinner20.136490
23415.533.00YesSatDinner20.193175
23510.071.25NoSatDinner20.124131
23612.601.00YesSatDinner20.079365
23732.831.17YesSatDinner20.035638
23835.834.67NoSatDinner30.130338
23929.035.92NoSatDinner30.203927
24027.182.00YesSatDinner20.073584
24122.672.00YesSatDinner20.088222
24217.821.75NoSatDinner20.098204
24318.783.00NoThurDinner20.159744

244 rows × 7 columns

tips.groupby(['smoker','day']).apply(top,n=1,column = 'total_bill')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
smokerday
NoFri9422.753.25NoFriDinner20.142857
Sat21248.339.00NoSatDinner40.186220
Sun15648.175.00NoSunDinner60.103799
Thur14241.195.00NoThurLunch50.121389
YesFri9540.174.73YesFriDinner40.117750
Sat17050.8110.00YesSatDinner30.196812
Sun18245.353.50YesSunDinner30.077178
Thur19743.115.00YesThurLunch40.115982
result = tips.groupby('smoker')['tip_pct'].describe()
result
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
countmeanstdmin25%50%75%max
smoker
No151.00.1593280.0399100.0567970.1369060.1556250.1850140.291990
Yes93.00.1631960.0851190.0356380.1067710.1538460.1950590.710345
result.unstack('smoker')
smoker count No 151.000000 Yes 93.000000 mean No 0.159328 Yes 0.163196 std No 0.039910 Yes 0.085119 min No 0.056797 Yes 0.035638 25% No 0.136906 Yes 0.106771 50% No 0.155625 Yes 0.153846 75% No 0.185014 Yes 0.195059 max No 0.291990 Yes 0.710345 dtype: float64 #### lambda
f = lambda x:x.describe()
grouped.apply(f)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsizetip_pct
daysmoker
FriNocount4.0000004.0000004.0000004.000000
mean18.4200002.8125002.2500000.151650
std5.0592820.8984940.5000000.028123
min12.4600001.5000002.0000000.120385
25%15.1000002.6250002.0000000.137239
50%19.2350003.1250002.0000000.149241
75%22.5550003.3125002.2500000.163652
max22.7500003.5000003.0000000.187735
Yescount15.00000015.00000015.00000015.000000
mean16.8133332.7140002.0666670.174783
std9.0863881.0776680.5936170.051293
min5.7500001.0000001.0000000.103555
25%11.6900001.9600002.0000000.133739
50%13.4200002.5000002.0000000.173913
75%18.6650003.2400002.0000000.209240
max40.1700004.7300004.0000000.263480
SatNocount45.00000045.00000045.00000045.000000
mean19.6617783.1028892.5555560.158048
std8.9391811.6420880.7849600.039767
min7.2500001.0000001.0000000.056797
25%14.7300002.0100002.0000000.136240
50%17.8200002.7500002.0000000.150152
75%20.6500003.3900003.0000000.183915
max48.3300009.0000004.0000000.291990
Yescount42.00000042.00000042.00000042.000000
mean21.2766672.8754762.4761900.147906
std10.0691381.6305800.8621610.061375
min3.0700001.0000001.0000000.035638
25%13.4050002.0000002.0000000.091797
50%20.3900002.6900002.0000000.153624
SunNostd8.1301891.2247851.0326740.042347
min8.7700001.0100002.0000000.059447
25%14.7800002.0000002.0000000.139780
50%18.4300003.0200003.0000000.161665
75%25.0000003.9200004.0000000.185185
max48.1700006.0000006.0000000.252672
Yescount19.00000019.00000019.00000019.000000
mean24.1200003.5168422.5789470.187250
std10.4425111.2611510.9015910.154134
min7.2500001.5000002.0000000.065660
25%17.1650003.0000002.0000000.097723
50%23.1000003.5000002.0000000.138122
75%32.3750004.0000003.0000000.215325
max45.3500006.5000005.0000000.710345
ThurNocount45.00000045.00000045.00000045.000000
mean17.1131112.6737782.4888890.160298
std7.7217281.2829641.1797960.038774
min7.5100001.2500001.0000000.072961
25%11.6900001.8000002.0000000.137741
50%15.9500002.1800002.0000000.153492
75%20.2700003.0000002.0000000.184843
max41.1900006.7000006.0000000.266312
Yescount17.00000017.00000017.00000017.000000
mean19.1905883.0300002.3529410.163863
std8.3551491.1134910.7018880.039389
min10.3400002.0000002.0000000.090014
25%13.5100002.0000002.0000000.148038
50%16.4700002.5600002.0000000.153846
75%19.8100004.0000002.0000000.194837
max43.1100005.0000004.0000000.241255

64 rows × 4 columns

### 抑制关键字
tips.groupby('smoker',group_keys = False).apply(top)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
total_billtipsmokerdaytimesizetip_pct
8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
### 填充缺失值
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s
0 NaN 1 -0.975734 2 NaN 3 -0.359448 4 NaN 5 1.577221 dtype: float64
s.fillna(s.mean())
0 0.080680 1 -0.975734 2 0.080680 3 -0.359448 4 0.080680 5 1.577221 dtype: float64
states = ['Ohio', 'New York', 'Vermont', 'Florida',

          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8),index = states)
data
Ohio -0.017642 New York -0.655295 Vermont 0.469594 Florida -0.161041 Oregon 0.795937 Nevada 1.144693 California -1.096555 Idaho 0.166461 dtype: float64
data[['Vermont','Nevada','Idaho']] = np.nan
data
Ohio -0.017642 New York -0.655295 Vermont NaN Florida -0.161041 Oregon 0.795937 Nevada NaN California -1.096555 Idaho NaN dtype: float64
data.groupby(group_key).mean()
East -0.277993 West -0.150309 dtype: float64
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio -0.017642 New York -0.655295 Vermont -0.277993 Florida -0.161041 Oregon 0.795937 Nevada -0.150309 California -1.096555 Idaho -0.150309 dtype: float64 ## 透视表
tips.pivot_table(index = ['day','smoker'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
sizetiptip_pcttotal_bill
daysmoker
FriNo2.2500002.8125000.15165018.420000
Yes2.0666672.7140000.17478316.813333
SatNo2.5555563.1028890.15804819.661778
Yes2.4761902.8754760.14790621.276667
SunNo2.9298253.1678950.16011320.506667
Yes2.5789473.5168420.18725024.120000
ThurNo2.4888892.6737780.16029817.113111
Yes2.3529413.0300000.16386319.190588
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns = 'smoker')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
sizetip_pct
smokerNoYesNoYes
timeday
DinnerFri2.0000002.2222220.1396220.165347
Sat2.5555562.4761900.1580480.147906
Sun2.9298252.5789470.1601130.187250
Thur2.000000NaN0.159744NaN
LunchFri3.0000001.8333330.1877350.188937
Thur2.5000002.3529410.1603110.163863
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker',margins=True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
sizetip_pct
smokerNoYesAllNoYesAll
timeday
DinnerFri2.0000002.2222222.1666670.1396220.1653470.158916
Sat2.5555562.4761902.5172410.1580480.1479060.153152
Sun2.9298252.5789472.8421050.1601130.1872500.166897
Thur2.000000NaN2.0000000.159744NaN0.159744
LunchFri3.0000001.8333332.0000000.1877350.1889370.188765
Thur2.5000002.3529412.4590160.1603110.1638630.161301
All2.6688742.4086022.5696720.1593280.1631960.160803
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',aggfunc=len,margins=True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
dayFriSatSunThurAll
timesmoker
DinnerNo3.045.057.01.0106.0
Yes9.042.019.0NaN70.0
LunchNo1.0NaNNaN44.045.0
Yes6.0NaNNaN17.023.0
All19.087.076.062.0244.0
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',aggfunc='mean',fill_value=0,margins=True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
dayFriSatSunThurAll
timesmoker
DinnerNo0.1396220.1580480.1601130.1597440.158653
Yes0.1653470.1479060.1872500.0000000.160828
LunchNo0.1877350.0000000.0000000.1603110.160920
Yes0.1889370.0000000.0000000.1638630.170404
All0.1699130.1531520.1668970.1612760.160803

method:
Function name Description
values Column name or names to aggregate; by default aggregates all numeric columns
index Column names or other group keys to group on the rows of the resulting pivot table
columns Column names or other group keys to group on the columns of the resulting pivot table
aggfunc Aggregation function or list of functions (‘mean’ by default); can be any function valid in a groupby context
fill_value Replace missing values in result table
dropna If True, do not include columns whose entries are all NA
margins Add row/column subtotals and grand total (False by default)

#### crosstab
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
smokerNoYesAll
timeday
DinnerFri3912
Sat454287
Sun571976
Thur101
LunchFri167
Thur441761
All15193244
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值