Study notes for Python for Data Analysis: Chapter 11

Financial and Economic Data Applications

Data munging topics

Time series and cross-section alignment

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
stock_px = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\stock_px.csv",parse_dates=True,index_col=0)
prices = stock_px[['AAPL','JNJ','SPX','XOM']]['2011-09-06':'2011-09-14']
prices
              AAPL    JNJ      SPX    XOM
2011-09-06  379.74  64.64  1165.24  71.15
2011-09-07  383.93  65.43  1198.62  73.65
2011-09-08  384.14  64.95  1185.90  72.82
2011-09-09  377.48  63.64  1154.23  71.01
2011-09-12  379.94  63.59  1162.27  71.84
2011-09-13  384.62  63.61  1172.87  71.65
2011-09-14  389.30  63.73  1188.68  72.64
stock_pxx = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\volume.csv",parse_dates=True,index_col=0)
volume = stock_pxx[['AAPL','JNJ','XOM']]['2011-09-06':'2011-09-12']
volume
                  AAPL         JNJ         XOM
2011-09-06  18173500.0  15848300.0  25416300.0
2011-09-07  12492000.0  10759700.0  23108400.0
2011-09-08  14839800.0  15551500.0  22434800.0
2011-09-09  20171900.0  17008200.0  27969100.0
2011-09-12  16697300.0  13448200.0  26205800.0
prices*volume
                    AAPL           JNJ  SPX           XOM
2011-09-06  6.901205e+09  1.024434e+09  NaN  1.808370e+09
2011-09-07  4.796054e+09  7.040072e+08  NaN  1.701934e+09
2011-09-08  5.700561e+09  1.010070e+09  NaN  1.633702e+09
2011-09-09  7.614489e+09  1.082402e+09  NaN  1.986086e+09
2011-09-12  6.343972e+09  8.551710e+08  NaN  1.882625e+09
2011-09-13           NaN           NaN  NaN           NaN
2011-09-14           NaN           NaN  NaN           NaN
vwap = (prices*volume).sum()/volume.sum()
vwap
AAPL    380.655181
JNJ      64.394769
SPX            NaN
XOM      72.024288
dtype: float64
vwap.dropna()
AAPL    380.655181
JNJ      64.394769
XOM      72.024288
dtype: float64
prices.align(volume,join='inner')
(              AAPL    JNJ    XOM
 2011-09-06  379.74  64.64  71.15
 2011-09-07  383.93  65.43  73.65
 2011-09-08  384.14  64.95  72.82
 2011-09-09  377.48  63.64  71.01
 2011-09-12  379.94  63.59  71.84,
                   AAPL         JNJ         XOM
 2011-09-06  18173500.0  15848300.0  25416300.0
 2011-09-07  12492000.0  10759700.0  23108400.0
 2011-09-08  14839800.0  15551500.0  22434800.0
 2011-09-09  20171900.0  17008200.0  27969100.0
 2011-09-12  16697300.0  13448200.0  26205800.0)
s1 = Series(range(3),index=['a','b','c'])
s2 = Series(range(4),index=['d','b','c','e'])
s3 = Series(range(3),index=['f','a','c'])
DataFrame({'one':s1,'two':s2,'three':s3})
   one  two  three
a  0.0  NaN    1.0
b  1.0  1.0    NaN
c  2.0  2.0    2.0
d  NaN  0.0    NaN
e  NaN  3.0    NaN
f  NaN  NaN    0.0
DataFrame({'one':s1,'two':s2,'three':s3},index=list('face'))
   one  two  three
f  NaN  NaN    0.0
a  0.0  NaN    1.0
c  2.0  2.0    2.0
e  NaN  3.0    NaN

Operations between time series of different frequencies

ts1 = Series(np.random.randn(3),index=pd.date_range('2012-6-13',periods=3,freq='W-WED'))
ts1
2012-06-13    1.581292
2012-06-20    0.891047
2012-06-27   -0.209271
Freq: W-WED, dtype: float64
ts1.resample('B')
<pandas.core.resample.DatetimeIndexResampler object at 0x000001F9A5F93610>
ts1.resample('B').ffill()
2012-06-13    1.581292
2012-06-14    1.581292
2012-06-15    1.581292
2012-06-18    1.581292
2012-06-19    1.581292
2012-06-20    0.891047
2012-06-21    0.891047
2012-06-22    0.891047
2012-06-25    0.891047
2012-06-26    0.891047
2012-06-27   -0.209271
Freq: B, dtype: float64
dates = pd.DatetimeIndex(['2012-6-12','2012-6-17','2012-6-18','2012-6-21','2012-6-22','2012-6-29'])
ts2 = Series(np.random.randn(6),index=dates)
ts2
2012-06-12   -0.183824
2012-06-17    0.716578
2012-06-18   -0.165628
2012-06-21    0.259746
2012-06-22   -0.303279
2012-06-29   -0.699029
dtype: float64
ts1.reindex(ts2.index,method='ffill')
2012-06-12         NaN
2012-06-17    1.581292
2012-06-18    1.581292
2012-06-21    0.891047
2012-06-22    0.891047
2012-06-29   -0.209271
dtype: float64
ts2 + ts1.reindex(ts2.index,method='ffill')
2012-06-12         NaN
2012-06-17    2.297870
2012-06-18    1.415663
2012-06-21    1.150793
2012-06-22    0.587769
2012-06-29   -0.908301
dtype: float64

Using Periods

gdp = Series([1.78,1.94,2.08,2.01,2.15,2.31,2.46],
            index=pd.period_range('1984Q2',periods=7,freq='Q-SEP'))
infl = Series([0.025,0.045,0.037,0.04],
             index=pd.period_range('1982',periods=4,freq='A-DEC'))
gdp
1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64
infl
1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64
infl_q = infl.asfreq('Q-SEP',how='end')
infl_q
1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64
infl_q.reindex(gdp.index,method='ffill')
1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

Time of day and "as of" data selection

#Generate a minute-level date range/index covering one trading day
rng = pd.date_range('2012-06-01 09:30','2012-06-01 15:59',freq='T')
#Extend to 4 trading days of timestamps (values between 9:30 and 15:59 each day)
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1,4)])
ts = Series(np.arange(len(rng),dtype=float),index=rng)
ts
2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
                        ...  
2012-06-06 15:55:00    1555.0
2012-06-06 15:56:00    1556.0
2012-06-06 15:57:00    1557.0
2012-06-06 15:58:00    1558.0
2012-06-06 15:59:00    1559.0
Length: 1560, dtype: float64
from datetime import time
ts[time(10,0)]
2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64
ts.at_time(time(10,0))
2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64
ts.between_time(time(10,0),time(10,1))
2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64
#Randomly set most of the values in the series to NA
indexer = np.sort(np.random.permutation(len(ts))[700:])
irr_ts = ts.copy()
irr_ts[indexer] = np.nan
irr_ts['2012-06-01 09:50':'2012-06-01 10:00']
2012-06-01 09:50:00    20.0
2012-06-01 09:51:00     NaN
2012-06-01 09:52:00     NaN
2012-06-01 09:53:00    23.0
2012-06-01 09:54:00     NaN
2012-06-01 09:55:00    25.0
2012-06-01 09:56:00     NaN
2012-06-01 09:57:00     NaN
2012-06-01 09:58:00     NaN
2012-06-01 09:59:00     NaN
2012-06-01 10:00:00     NaN
dtype: float64
selection = pd.date_range('2012-06-01 10:00',periods=4,freq='B')
irr_ts.asof(selection)
2012-06-01 10:00:00      25.0
2012-06-04 10:00:00     417.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1196.0
Freq: B, dtype: float64

Splicing together multiple data sources

data1 = DataFrame(np.ones((6,3),dtype=float),
                 columns=['a','b','c'],
                 index=pd.date_range('6/12/2012',periods=6))
data2 = DataFrame(np.ones((6,3),dtype=float)*2,
                 columns=['a','b','c'],
                 index=pd.date_range('6/13/2012',periods=6))
spliced = pd.concat([data1.loc[:'2012-06-14'],data2.loc['2012-06-15':]])
spliced
              a    b    c
2012-06-12  1.0  1.0  1.0
2012-06-13  1.0  1.0  1.0
2012-06-14  1.0  1.0  1.0
2012-06-15  2.0  2.0  2.0
2012-06-16  2.0  2.0  2.0
2012-06-17  2.0  2.0  2.0
2012-06-18  2.0  2.0  2.0
data2 = DataFrame(np.ones((6,4),dtype=float)*2,
                 columns=['a','b','c','d'],
                 index=pd.date_range('6/13/2012',periods=6))
spliced = pd.concat([data1.loc[:'2012-06-14'],data2.loc['2012-06-15':]])
spliced
              a    b    c    d
2012-06-12  1.0  1.0  1.0  NaN
2012-06-13  1.0  1.0  1.0  NaN
2012-06-14  1.0  1.0  1.0  NaN
2012-06-15  2.0  2.0  2.0  2.0
2012-06-16  2.0  2.0  2.0  2.0
2012-06-17  2.0  2.0  2.0  2.0
2012-06-18  2.0  2.0  2.0  2.0
spliced_filled = spliced.combine_first(data2)
spliced_filled
              a    b    c    d
2012-06-12  1.0  1.0  1.0  NaN
2012-06-13  1.0  1.0  1.0  2.0
2012-06-14  1.0  1.0  1.0  2.0
2012-06-15  2.0  2.0  2.0  2.0
2012-06-16  2.0  2.0  2.0  2.0
2012-06-17  2.0  2.0  2.0  2.0
2012-06-18  2.0  2.0  2.0  2.0
spliced.update(data2,overwrite=False)
spliced
              a    b    c    d
2012-06-12  1.0  1.0  1.0  NaN
2012-06-13  1.0  1.0  1.0  2.0
2012-06-14  1.0  1.0  1.0  2.0
2012-06-15  2.0  2.0  2.0  2.0
2012-06-16  2.0  2.0  2.0  2.0
2012-06-17  2.0  2.0  2.0  2.0
2012-06-18  2.0  2.0  2.0  2.0
cp_spliced = spliced.copy()
cp_spliced[['a','c']] = data1[['a','c']]
cp_spliced
              a    b    c    d
2012-06-12  1.0  1.0  1.0  NaN
2012-06-13  1.0  1.0  1.0  2.0
2012-06-14  1.0  1.0  1.0  2.0
2012-06-15  1.0  2.0  1.0  2.0
2012-06-16  1.0  2.0  1.0  2.0
2012-06-17  1.0  2.0  1.0  2.0
2012-06-18  NaN  2.0  NaN  2.0

Return indexes and cumulative returns

#import pandas.io.data as web
price = prices['AAPL']
price
2011-09-06    379.74
2011-09-07    383.93
2011-09-08    384.14
2011-09-09    377.48
2011-09-12    379.94
2011-09-13    384.62
2011-09-14    389.30
Name: AAPL, dtype: float64
price['2011-09-07']/price['2011-09-14']-1
-0.013793989211405067
returns = price.pct_change()
ret_index = (1+returns).cumprod()
ret_index.iloc[0] = 1  #set the first value to 1
ret_index
2011-09-06    1.000000
2011-09-07    1.011034
2011-09-08    1.011587
2011-09-09    0.994049
2011-09-12    1.000527
2011-09-13    1.012851
2011-09-14    1.025175
Name: AAPL, dtype: float64
m_returns = ret_index.resample('BM').last().pct_change()
m_returns['2011']
2011-09-30   NaN
Freq: BM, Name: AAPL, dtype: float64
m_rets = (1+returns).resample('M',kind='period').prod()-1
m_rets['2011']
2011-09    0.025175
Freq: M, Name: AAPL, dtype: float64

Group transforms and analysis

#Randomly generate 1,000 ticker symbols
import random;random.seed(0)
import string
N = 1000
def rands(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices) for _ in range(n)])
tickers = np.array([rands(5) for _ in range(N)])
M = 500
df = DataFrame({'Momentum':np.random.randn(M) / 200+0.03,
               'Value':np.random.randn(M)/200+0.08,
               'ShortInterest':np.random.randn(M)/200-0.02},
              index=tickers[:M])
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0,len(ind_names),N)
industries = Series(ind_names[sampler],index=tickers,
                   name='industry')
by_industry = df.groupby(industries)
by_industry.mean()
           Momentum     Value  ShortInterest
industry
FINANCIAL  0.029885  0.080125      -0.020667
TECH       0.029410  0.080572      -0.019598
by_industry.describe()
          Momentum                                                                      Value            ...                    ShortInterest
             count      mean       std       min       25%       50%       75%       max  count      mean  ...       75%       max        count      mean       std       min       25%       50%       75%       max
industry                                                                                                   ...
FINANCIAL    237.0  0.029885  0.005269  0.017254  0.026563  0.030006  0.033459  0.045238  237.0  0.080125  ...  0.083856  0.094185        237.0 -0.020667  0.004603 -0.036456 -0.023736 -0.020716 -0.017739 -0.006765
TECH         263.0  0.029410  0.004548  0.017233  0.026389  0.029338  0.032203  0.040698  263.0  0.080572  ...  0.083651  0.097748        263.0 -0.019598  0.005081 -0.033080 -0.022999 -0.019652 -0.016281 -0.005551

[2 rows x 24 columns]

#Standardize (z-score) within each industry
def zscore(group):
    return (group-group.mean())/group.std()
df_stand = by_industry.apply(zscore)
df_stand.groupby(industries).agg(['mean','std'])
               Momentum               Value      ShortInterest
                   mean  std           mean  std          mean  std
industry
FINANCIAL  2.389088e-16  1.0  -7.045466e-16  1.0 -5.801735e-16  1.0
TECH       1.848965e-16  1.0   1.228844e-15  1.0 -7.273438e-16  1.0
#Rank in descending order within each industry
ind_rank = by_industry.rank(ascending=False)
ind_rank.groupby(industries).agg(['min','max'])
          Momentum        Value        ShortInterest
               min    max    min    max           min    max
industry
FINANCIAL      1.0  237.0    1.0  237.0           1.0  237.0
TECH           1.0  263.0    1.0  263.0           1.0  263.0
#Rank and then standardize within each industry
by_industry.apply(lambda x:zscore(x.rank()))
       Momentum     Value  ShortInterest
MYNBI  1.117454  0.617886      -0.657326
QPMZJ -0.289223 -0.972842      -0.092026
PLSGQ -1.152272 -0.933486       0.262543
EJEYD  0.233372 -1.181444       1.079344
TZIRW  0.499568  0.644179       0.433835
...         ...       ...            ...
JPHKQ -1.630168 -0.105172       0.499568
VACPK -1.196333 -0.709912      -1.012282
MHNBS -1.356472  0.700115       0.656358
YBNCI  1.093929  0.845972       0.933486
GXKFD -1.327301 -1.400229      -0.306300

[500 rows x 3 columns]

Group factor exposures

from numpy.random import rand
fac1,fac2,fac3 = np.random.rand(3,1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
#Weighted sum of the factors plus noise
port = Series(0.7*fac1 - 1.2*fac2 + 0.3*fac3 + rand(1000),
             index=ticker_subset)
factors = DataFrame({'f1':fac1,'f2':fac2,'f3':fac3},
                   index=ticker_subset)
factors.corrwith(port)
f1    0.435330
f2   -0.677363
f3    0.186326
dtype: float64
import statsmodels.api as sm
result = sm.OLS(endog=port, exog=factors).fit()
result.summary()
                                 OLS Regression Results
=======================================================================================
Dep. Variable:                      y   R-squared (uncentered):                   0.761
Model:                            OLS   Adj. R-squared (uncentered):              0.760
Method:                 Least Squares   F-statistic:                              1057.
Date:                Fri, 04 Mar 2022   Prob (F-statistic):                   4.67e-309
Time:                        14:54:06   Log-Likelihood:                         -277.77
No. Observations:                1000   AIC:                                      561.5
Df Residuals:                     997   BIC:                                      576.3
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
f1             0.9950      0.029     34.785      0.000       0.939       1.051
f2            -0.8626      0.029    -30.150      0.000      -0.919      -0.806
f3             0.5914      0.030     19.917      0.000       0.533       0.650
==============================================================================
Omnibus:                       33.929   Durbin-Watson:                   1.974
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               16.242
Skew:                          -0.054   Prob(JB):                     0.000297
Kurtosis:                       2.385   Cond. No.                         3.20
==============================================================================


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
def beta_exposure(chunk,factors=None):
    return sm.OLS(endog=chunk,exog=factors).fit()
by_ind = port.groupby(industries)
exposures = by_ind.apply(beta_exposure,factors=factors)
exposures.unstack()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
...
ValueError: The indices for endog and exog are not aligned
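
The ValueError happens because the full factors frame is passed as exog while each chunk only holds one industry's tickers, so the two indices do not line up. A minimal sketch of one possible fix (assuming, as in the book, that only the fitted coefficients are wanted) subsets factors to the chunk's index and returns .params so the grouped result can be unstacked:

#Sketch of a possible fix: align exog with the group's tickers and return only
#the fitted coefficients, so groupby().apply() yields something unstack() can reshape
def beta_exposure(chunk, factors=None):
    return sm.OLS(endog=chunk, exog=factors.loc[chunk.index]).fit().params

exposures = port.groupby(industries).apply(beta_exposure, factors=factors)
exposures.unstack()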

Decile and quartile analysis

The data files could not be found, so this section was not completed.

'''
price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
volume = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl')
#data = price.loc['2006-01-01']
price
'''
import pandas_datareader.data as web
all_data = {ticker:web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})
#Functions that turn returns into a trend-following signal
px = price['AAPL']      #'price' built from the downloaded data above
returns = px.pct_change()
def to_index(rets):
    index = (1 + rets).cumprod()
    first_loc = max(index.notnull().argmax() - 1, 0)
    index.values[first_loc] = 1
    return index
def trend_signal(rets, lookback, lag):
    signal = rets.rolling(window=lookback, min_periods=lookback - 5).sum()
    return signal.shift(lag)
signal = trend_signal(returns, 100, 3)
#Resample to weekly (Friday) signals, then back to business days with forward fill
trade_friday = signal.resample('W-FRI').last().resample('B').ffill()
trade_rets = trade_friday.shift(1) * returns
to_index(trade_rets).plot()
vol = returns.rolling(window=250, min_periods=200).std() * np.sqrt(250)
def sharpe(rets, ann=250):
    return rets.mean() / rets.std() * np.sqrt(ann)
#Sharpe ratio of the strategy within trailing-volatility quartiles
trade_rets.groupby(pd.qcut(vol, 4)).agg(sharpe)
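
Since the Yahoo data could not be fetched, the pipeline above can still be exercised end to end on a purely synthetic random-walk price series. Everything below (the dates and the price path) is made up for illustration only.

#Illustrative only: drive the trend-signal pipeline with a synthetic price series
rng = pd.date_range('2010-01-01', periods=1000, freq='B')
px = Series(100 * np.exp(0.01 * np.random.randn(len(rng)).cumsum()), index=rng)
returns = px.pct_change()
signal = trend_signal(returns, 100, 3)
trade_rets = signal.resample('W-FRI').last().resample('B').ffill().shift(1) * returns
to_index(trade_rets).plot()
vol = returns.rolling(window=250, min_periods=200).std() * np.sqrt(250)
trade_rets.groupby(pd.qcut(vol, 4)).agg(sharpe)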

More example applications

Signal frontier analysis

names = ['AAPL','GOOG','MSFT','DELL','GS','MS','BAC','C']
pxx = pd.read_pickle("E:\\python_study_files\\python\\pydata-notebook-master\\examples\\yahoo_price.pkl")
pxx
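
Assuming pxx loads correctly, a minimal sketch of the book's signal-frontier idea is shown below: score stocks by trailing momentum, hold for a fixed period, and measure the annualized Sharpe ratio of the resulting portfolio. This is illustrative only; the lookback and holding-period values (70 and 30) are arbitrary, and only tickers actually present in pxx are used.

#Illustrative sketch of a momentum "signal frontier" calculation (not the original notes' code)
px_subset = pxx[[n for n in names if n in pxx.columns]].dropna()

def calc_mom(price, lookback, lag):
    #cross-sectionally standardized momentum ranks
    mom_ret = price.shift(lag).pct_change(lookback)
    ranks = mom_ret.rank(axis=1, ascending=False)
    demeaned = ranks.sub(ranks.mean(axis=1), axis=0)
    return demeaned.div(demeaned.std(axis=1), axis=0)

compound = lambda x: (1 + x).prod() - 1
daily_sr = lambda x: x.mean() / x.std()

def strat_sr(prices, lb, hold):
    #annualized Sharpe ratio for one (lookback, holding-period) combination
    freq = '%dB' % hold
    port = calc_mom(prices, lb, lag=1).shift(1).resample(freq).first()
    returns = prices.pct_change().resample(freq).apply(compound)
    port_rets = (port * returns).sum(axis=1)
    return daily_sr(port_rets) * np.sqrt(252 / hold)

strat_sr(px_subset, 70, 30)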

Rolling over futures contracts
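
The original notes leave this section empty. As a placeholder, here is a small, hypothetical sketch of the weighting scheme the book uses when rolling from an expiring contract to the next one; the expiry date, the roll window, and the 'near'/'next' column names are all invented for illustration.

#Hypothetical sketch of a futures roll: shift exposure linearly from the expiring
#contract ('near') to the following one ('next') over the last few days before expiry
expiry = pd.Timestamp('2012-06-15')                     #made-up expiry date
dates = pd.date_range('2012-06-01', '2012-06-22', freq='B')
roll_window = 5                                         #business days used for the roll
roll_start = expiry - pd.offsets.BDay(roll_window)

roll_weights = DataFrame(0.0, index=dates, columns=['near', 'next'])
roll_weights.loc[dates < roll_start, 'near'] = 1.0      #fully in the near contract
roll_weights.loc[dates >= expiry, 'next'] = 1.0         #fully rolled into the next one
transition = dates[(dates >= roll_start) & (dates < expiry)]
decay = np.linspace(0, 1, len(transition) + 2)[1:-1]    #interior transition weights
roll_weights.loc[transition, 'near'] = 1 - decay
roll_weights.loc[transition, 'next'] = decay
roll_weights
#A rolled return series would then be (contract_returns * roll_weights).sum(axis=1),
#given a DataFrame 'contract_returns' with matching 'near'/'next' columns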
