close_px=pd.read_csv('e:/stock_px.csv',parse_dates=True,index_col=0)
close_px[-4:]
#计算日收益率与SPX年度相关系数组成的DataFrame
rets=close_px.pct_change().dropna()
spx_corr=lambda x:x.corrwith(x['SPX'])
by_year=rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0
#计算列与列之间的相关系数
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
import statsmodels.api as sm
def regress(data,yvar,xvars):
Y=data[yvar]
X=data[xvars]
X['intercept']=1.
result=sm.OLS(Y,X).fit()
return result.params
#按年计算AAPL对SPX收益率的线性回归
by_year.apply(regress,'AAPL',['SPX'])
Out[161]:
SPX intercept
2003 1.195406 0.000710
2004 1.363463 0.004201
2005 1.766415 0.003246
2006 1.645496 0.000080
2007 1.198761 0.003438
2008 0.968016 -0.001110
2009 0.879103 0.002954
2010 1.052608 0.001261
2011 0.806605 0.001514
透视表和交叉表
tips.pivot_table(index=['sex','smoker'])
Out[163]:
size tip tip_pct total_bill
sex smoker
Female No 2.592593 2.773519 0.156921 18.105185
Yes 2.242424 2.931515 0.182150 17.977879
Male No 2.711340 3.113402 0.160669 19.791237
Yes 2.500000 3.051167 0.152771 22.284500
#只想聚合tip_pct和size,根据day分组,将smoker放在列上,day放在行上
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
tip_pct size
smoker No Yes No Yes
sex day
Female Fri 0.165296 0.209129 2.500000 2.000000
Sat 0.147993 0.163817 2.307692 2.200000
Sun 0.165710 0.237075 3.071429 2.500000
Thur 0.155971 0.163073 2.480000 2.428571
Male Fri 0.138005 0.144730 2.000000 2.125000
Sat 0.162132 0.139067 2.656250 2.629630
Sun 0.158291 0.173964 2.883721 2.600000
Thur 0.165706 0.164417 2.500000 2.300000
#加边界
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)
Out[166]:
tip_pct size
smoker No Yes All No Yes All
sex day
Female Fri 0.165296 0.209129 0.199388 2.500000 2.000000 2.111111
Sat 0.147993 0.163817 0.156470 2.307692 2.200000 2.250000
Sun 0.165710 0.237075 0.181569 3.071429 2.500000 2.944444
Thur 0.155971 0.163073 0.157525 2.480000 2.428571 2.468750
Male Fri 0.138005 0.144730 0.143385 2.000000 2.125000 2.100000
Sat 0.162132 0.139067 0.151577 2.656250 2.629630 2.644068
Sun 0.158291 0.173964 0.162344 2.883721 2.600000 2.810345
Thur 0.165706 0.164417 0.165276 2.500000 2.300000 2.433333
All 0.159328 0.163196 0.160803 2.668874 2.408602 2.569672
要使用其他的聚合函数,将其传给aggfunc即可,使用count或len可以得到有关分组大小的交叉表:
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc=len,margins=True)
Out[167]:
day Fri Sat Sun Thur All
sex smoker
Female No 2.0 13.0 14.0 25.0 54.0
Yes 7.0 15.0 4.0 7.0 33.0
Male No 2.0 32.0 43.0 20.0 97.0
Yes 8.0 27.0 15.0 10.0 60.0
All 19.0 87.0 76.0 62.0 244.0
如果存在空的组合,设置一个fill_value
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc='sum',fill_value=0)
Out[168]:
day Fri Sat Sun Thur
time sex smoker
Dinner Female No 2 30 43 2
Yes 8 33 10 0
Male No 4 85 124 0
Yes 12 71 39 0
Lunch Female No 3 0 0 60
Yes 6 0 0 17
Male No 0 0 0 50
Yes 5 0 0 23
交叉表
data=DataFrame({'Sample':range(1,11),'Gender':['Female','Male','Female','Male','Male','Male',
'Female','Female','Male','Female'],'Handedness':['Right-handed','Left-handed',
'Right-handed','Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed']})
data
data1=data[['Sample','Gender','Handedness']]
data1
Sample Gender Handedness
0 1 Female Right-handed
1 2 Male Left-handed
2 3 Female Right-handed
3 4 Male Right-handed
4 5 Male Left-handed
5 6 Male Right-handed
6 7 Female Right-handed
7 8 Female Left-handed
8 9 Male Right-handed
9 10 Female Right-handed
pd.crosstab(data1.Gender,data1.Handedness,margins=True)
Handedness Left-handed Right-handed All
Gender
Female 1 4 5
Male 2 3 5
All 3 7 10
pd.crosstab([data1.Gender,data1.Handedness],data1.Sample,margins=True)
Out[185]:
Sample 1 2 3 4 5 6 7 8 9 10 All
Gender Handedness
Female Left-handed 0 0 0 0 0 0 0 1 0 0 1
Right-handed 1 0 1 0 0 0 1 0 0 1 4
Male Left-handed 0 1 0 0 1 0 0 0 0 0 2
Right-handed 0 0 0 1 0 1 0 0 1 0 3
All 1 1 1 1 1 1 1 1 1 1 10