1.应用
'''
每个分组内选出 total_bill 最高的 5 行
'''
# Show all columns when printing DataFrames (no truncation)
pd.set_option('display.max_columns', None)
# Show all rows when printing DataFrames (no truncation)
pd.set_option('display.max_rows', None)
def top(df, n, column):
    """Return the n rows of `df` with the largest values in `column`.

    Rows come back sorted ascending by `column` (largest last), matching
    the original slice-based behavior.

    Parameters
    ----------
    df : DataFrame to select from.
    n : int, number of rows to keep (n >= 0).
    column : column label to rank by.

    Fix: the original `[-n:]` slice returned ALL rows when n == 0,
    because `[-0:]` is the same as `[0:]`; `tail(n)` correctly returns
    an empty frame in that case.
    """
    return df.sort_values(by=column).tail(n)
# Load the tips dataset (expects tips.csv next to this script)
tips = pd.read_csv('tips.csv')
# For each (day, smoker) group, keep the 5 rows with the highest total_bill
grouped = tips.groupby(['day','smoker']).apply(top,5,'total_bill')
print(grouped)
'''
total_bill tip sex smoker day time size
day smoker
Fri No 99 12.46 1.50 Male No Fri Dinner 2
223 15.98 3.00 Female No Fri Lunch 3
91 22.49 3.50 Male No Fri Dinner 2
94 22.75 3.25 Female No Fri Dinner 2
Yes 93 16.32 4.30 Female Yes Fri Dinner 2
98 21.01 3.00 Male Yes Fri Dinner 2
96 27.28 4.00 Male Yes Fri Dinner 2
90 28.97 3.00 Male Yes Fri Dinner 2
95 40.17 4.73 Male Yes Fri Dinner 4
Sat No 39 31.27 5.00 Male No Sat Dinner 3
238 35.83 4.67 Female No Sat Dinner 3
23 39.42 7.58 Male No Sat Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
212 48.33 9.00 Male No Sat Dinner 4
Yes 237 32.83 1.17 Male Yes Sat Dinner 2
56 38.01 3.00 Male Yes Sat Dinner 4
207 38.73 3.00 Male Yes Sat Dinner 4
102 44.30 2.50 Female Yes Sat Dinner 3
170 50.81 10.00 Male Yes Sat Dinner 3
'''
2.压缩分组键
# Show all columns when printing DataFrames (no truncation)
pd.set_option('display.max_columns', None)
# Show all rows when printing DataFrames (no truncation)
pd.set_option('display.max_rows', None)
def top(df, n, column):
    """Return the n rows of `df` with the largest values in `column`.

    Rows come back sorted ascending by `column` (largest last), matching
    the original slice-based behavior.

    Fix: the original `[-n:]` slice returned ALL rows when n == 0,
    because `[-0:]` is the same as `[0:]`; `tail(n)` correctly returns
    an empty frame in that case.
    """
    return df.sort_values(by=column).tail(n)
tips = pd.read_csv('tips.csv')
'''
group_keys=False不用分组键作为索引
'''
grouped = tips.groupby(['day','smoker'],group_keys=False).apply(top,5,'total_bill')
print(grouped)
'''
total_bill tip sex smoker day time size
99 12.46 1.50 Male No Fri Dinner 2
223 15.98 3.00 Female No Fri Lunch 3
91 22.49 3.50 Male No Fri Dinner 2
94 22.75 3.25 Female No Fri Dinner 2
93 16.32 4.30 Female Yes Fri Dinner 2
98 21.01 3.00 Male Yes Fri Dinner 2
96 27.28 4.00 Male Yes Fri Dinner 2
'''
3.分位数与桶分析
# Quantile / bucket analysis: summarize data1 over 4 equal-width buckets.
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.rand(1000),
})

# labels=False makes pd.cut return integer bucket ids (0..3) instead of
# Categorical interval labels, which keeps the group index compact.
bucket_ids = pd.cut(frame['data1'], 4, labels=False)


def get(group):
    """Summary statistics (min / max / count) for one bucket of values."""
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
    }


# One row per bucket, one column per statistic.
bucket_stats = frame['data1'].groupby(bucket_ids).apply(get).unstack()
print(bucket_stats)
'''
count max min
data1
0 101.0 -1.163979 -2.651598
1 508.0 0.323614 -1.147695
2 352.0 1.805476 0.326078
3 39.0 3.301849 1.824265
'''
4.用指定分组的均值填充缺失值
# Fill missing values with the mean of each group ('a' rows vs 'b' rows).
index = ['a0', 'a1', 'a2', 'a3', 'b0', 'b1', 'b2', 'b3']
group_key = ['a'] * 4 + ['b'] * 4
frame = pd.DataFrame(np.arange(8), index=index, columns=['num'])

# Knock out one value in each group so there is something to fill.
frame.loc[['a1', 'b2']] = np.nan


def _fill_group_mean(group):
    """Replace NaNs in one group with that group's mean."""
    return group.fillna(group.mean())


# group_keys=False keeps the original row labels as the result index.
frame_1 = frame.groupby(group_key, group_keys=False).apply(_fill_group_mean)
print(frame_1)
'''
num
a0 0.000000
a1 1.666667
a2 2.000000
a3 3.000000
b0 4.000000
b1 5.000000
b2 5.333333
b3 7.000000
'''
5.分组加权平均与相关性
# Group-wise weighted average: average 'data' weighted by 'weights',
# separately for each category.
df = pd.DataFrame({
    'category': ['a'] * 4 + ['b'] * 4,
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})


def get_avg(x):
    """Weighted mean of x['data'] using x['weights'] as weights."""
    return np.average(x['data'], weights=x['weights'])


grouped = df.groupby('category')
result = grouped.apply(get_avg)
print(result)
'''
category
a -0.012118
b 0.157650
'''
# Stock correlation example (expects stock_px.csv with a date first column)
close_px=pd.read_csv('stock_px.csv',index_col=0,parse_dates=True)
# Percent change from the previous row, then drop the all-NaN first row
rets=close_px.pct_change().dropna()
# Correlation of every column with the 'SPX' column
spx_corr = lambda x : x.corrwith(x['SPX'])
# Group rows by the year of the DatetimeIndex
by_year = rets.groupby(lambda x:x.year)
# (alternative: group by month instead)
#by_year = rets.groupby(lambda x:x.month)
# Per-year correlation of each stock with SPX (SPX vs itself is always 1.0)
result=by_year.apply(spx_corr)
print(result)
'''
AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0
'''
# Correlation between any two individual stocks (here AAPL vs MSFT), per year
result2=by_year.apply( lambda g : g['AAPL'].corr(g['MSFT']))
print(result2)
'''
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
'''