金融和经济数据应用
数据规整化方面的话题
时间序列以及截面对齐
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
stock_px = pd. read_csv( "E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv" , parse_dates= True , index_col= 0 )
prices = stock_px[ [ 'AAPL' , 'JNJ' , 'SPX' , 'XOM' ] ] [ '2011-09-06' : '2011-09-14' ]
prices
AAPL JNJ SPX XOM 2011-09-06 379.74 64.64 1165.24 71.15 2011-09-07 383.93 65.43 1198.62 73.65 2011-09-08 384.14 64.95 1185.90 72.82 2011-09-09 377.48 63.64 1154.23 71.01 2011-09-12 379.94 63.59 1162.27 71.84 2011-09-13 384.62 63.61 1172.87 71.65 2011-09-14 389.30 63.73 1188.68 72.64
stock_pxx = pd. read_csv( "E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\volume.csv" , parse_dates= True , index_col= 0 )
volume = stock_pxx[ [ 'AAPL' , 'JNJ' , 'XOM' ] ] [ '2011-09-06' : '2011-09-12' ]
volume
AAPL JNJ XOM 2011-09-06 18173500.0 15848300.0 25416300.0 2011-09-07 12492000.0 10759700.0 23108400.0 2011-09-08 14839800.0 15551500.0 22434800.0 2011-09-09 20171900.0 17008200.0 27969100.0 2011-09-12 16697300.0 13448200.0 26205800.0
prices* volume
AAPL JNJ SPX XOM 2011-09-06 6.901205e+09 1.024434e+09 NaN 1.808370e+09 2011-09-07 4.796054e+09 7.040072e+08 NaN 1.701934e+09 2011-09-08 5.700561e+09 1.010070e+09 NaN 1.633702e+09 2011-09-09 7.614489e+09 1.082402e+09 NaN 1.986086e+09 2011-09-12 6.343972e+09 8.551710e+08 NaN 1.882625e+09 2011-09-13 NaN NaN NaN NaN 2011-09-14 NaN NaN NaN NaN
vwap = ( prices* volume) . sum ( ) / volume. sum ( )
vwap
AAPL 380.655181
JNJ 64.394769
SPX NaN
XOM 72.024288
dtype: float64
vwap. dropna( )
AAPL 380.655181
JNJ 64.394769
XOM 72.024288
dtype: float64
prices. align( volume, join= 'inner' )
( AAPL JNJ XOM
2011-09-06 379.74 64.64 71.15
2011-09-07 383.93 65.43 73.65
2011-09-08 384.14 64.95 72.82
2011-09-09 377.48 63.64 71.01
2011-09-12 379.94 63.59 71.84,
AAPL JNJ XOM
2011-09-06 18173500.0 15848300.0 25416300.0
2011-09-07 12492000.0 10759700.0 23108400.0
2011-09-08 14839800.0 15551500.0 22434800.0
2011-09-09 20171900.0 17008200.0 27969100.0
2011-09-12 16697300.0 13448200.0 26205800.0)
s1 = Series( range ( 3 ) , index= [ 'a' , 'b' , 'c' ] )
s2 = Series( range ( 4 ) , index= [ 'd' , 'b' , 'c' , 'e' ] )
s3 = Series( range ( 3 ) , index= [ 'f' , 'a' , 'c' ] )
DataFrame( { 'one' : s1, 'two' : s2, 'three' : s3} )
one two three a 0.0 NaN 1.0 b 1.0 1.0 NaN c 2.0 2.0 2.0 d NaN 0.0 NaN e NaN 3.0 NaN f NaN NaN 0.0
DataFrame( { 'one' : s1, 'two' : s2, 'three' : s3} , index= list ( 'face' ) )
one two three f NaN NaN 0.0 a 0.0 NaN 1.0 c 2.0 2.0 2.0 e NaN 3.0 NaN
频率不同的时间序列的运算
ts1 = Series( np. random. randn( 3 ) , index= pd. date_range( '2012-6-13' , periods= 3 , freq= 'W-WED' ) )
ts1
2012-06-13 1.581292
2012-06-20 0.891047
2012-06-27 -0.209271
Freq: W-WED, dtype: float64
ts1. resample( 'B' )
<pandas.core.resample.DatetimeIndexResampler object at 0x000001F9A5F93610>
ts1. resample( 'B' ) . ffill( )
2012-06-13 1.581292
2012-06-14 1.581292
2012-06-15 1.581292
2012-06-18 1.581292
2012-06-19 1.581292
2012-06-20 0.891047
2012-06-21 0.891047
2012-06-22 0.891047
2012-06-25 0.891047
2012-06-26 0.891047
2012-06-27 -0.209271
Freq: B, dtype: float64
dates = pd. DatetimeIndex( [ '2012-6-12' , '2012-6-17' , '2012-6-18' , '2012-6-21' , '2012-6-22' , '2012-6-29' ] )
ts2 = Series( np. random. randn( 6 ) , index= dates)
ts2
2012-06-12 -0.183824
2012-06-17 0.716578
2012-06-18 -0.165628
2012-06-21 0.259746
2012-06-22 -0.303279
2012-06-29 -0.699029
dtype: float64
ts1. reindex( ts2. index, method= 'ffill' )
2012-06-12 NaN
2012-06-17 1.581292
2012-06-18 1.581292
2012-06-21 0.891047
2012-06-22 0.891047
2012-06-29 -0.209271
dtype: float64
ts2 + ts1. reindex( ts2. index, method= 'ffill' )
2012-06-12 NaN
2012-06-17 2.297870
2012-06-18 1.415663
2012-06-21 1.150793
2012-06-22 0.587769
2012-06-29 -0.908301
dtype: float64
使用Period
gdp = Series( [ 1.78 , 1.94 , 2.08 , 2.01 , 2.15 , 2.31 , 2.46 ] ,
index= pd. period_range( '1984Q2' , periods= 7 , freq= 'Q-SEP' ) )
infl = Series( [ 0.025 , 0.045 , 0.037 , 0.04 ] ,
index= pd. period_range( '1982' , periods= 4 , freq= 'A-DEC' ) )
gdp
1984Q2 1.78
1984Q3 1.94
1984Q4 2.08
1985Q1 2.01
1985Q2 2.15
1985Q3 2.31
1985Q4 2.46
Freq: Q-SEP, dtype: float64
infl
1982 0.025
1983 0.045
1984 0.037
1985 0.040
Freq: A-DEC, dtype: float64
infl_q = infl. asfreq( 'Q-SEP' , how= 'end' )
infl_q
1983Q1 0.025
1984Q1 0.045
1985Q1 0.037
1986Q1 0.040
Freq: Q-SEP, dtype: float64
infl_q. reindex( gdp. index, method= 'ffill' )
1984Q2 0.045
1984Q3 0.045
1984Q4 0.045
1985Q1 0.037
1985Q2 0.037
1985Q3 0.037
1985Q4 0.037
Freq: Q-SEP, dtype: float64
时间和“最当前”数据选取
rng = pd. date_range( '2012-06-01 09:30' , '2012-06-01 15:59' , freq= 'T' )
rng = rng. append( [ rng + pd. offsets. BDay( i) for i in range ( 1 , 4 ) ] )
ts = Series( np. arange( len ( rng) , dtype= float ) , index= rng)
ts
2012-06-01 09:30:00 0.0
2012-06-01 09:31:00 1.0
2012-06-01 09:32:00 2.0
2012-06-01 09:33:00 3.0
2012-06-01 09:34:00 4.0
...
2012-06-06 15:55:00 1555.0
2012-06-06 15:56:00 1556.0
2012-06-06 15:57:00 1557.0
2012-06-06 15:58:00 1558.0
2012-06-06 15:59:00 1559.0
Length: 1560, dtype: float64
from datetime import time
ts[ time( 10 , 0 ) ]
2012-06-01 10:00:00 30.0
2012-06-04 10:00:00 420.0
2012-06-05 10:00:00 810.0
2012-06-06 10:00:00 1200.0
dtype: float64
ts. at_time( time( 10 , 0 ) )
2012-06-01 10:00:00 30.0
2012-06-04 10:00:00 420.0
2012-06-05 10:00:00 810.0
2012-06-06 10:00:00 1200.0
dtype: float64
ts. between_time( time( 10 , 0 ) , time( 10 , 1 ) )
2012-06-01 10:00:00 30.0
2012-06-01 10:01:00 31.0
2012-06-04 10:00:00 420.0
2012-06-04 10:01:00 421.0
2012-06-05 10:00:00 810.0
2012-06-05 10:01:00 811.0
2012-06-06 10:00:00 1200.0
2012-06-06 10:01:00 1201.0
dtype: float64
indexer = np. sort( np. random. permutation( len ( ts) ) [ 700 : ] )
irr_ts = ts. copy( )
irr_ts[ indexer] = np. nan
irr_ts[ '2012-06-01 09:50' : '2012-06-01 10:00' ]
2012-06-01 09:50:00 20.0
2012-06-01 09:51:00 NaN
2012-06-01 09:52:00 NaN
2012-06-01 09:53:00 23.0
2012-06-01 09:54:00 NaN
2012-06-01 09:55:00 25.0
2012-06-01 09:56:00 NaN
2012-06-01 09:57:00 NaN
2012-06-01 09:58:00 NaN
2012-06-01 09:59:00 NaN
2012-06-01 10:00:00 NaN
dtype: float64
selection = pd. date_range( '2012-06-01 10:00' , periods= 4 , freq= 'B' )
irr_ts. asof( selection)
2012-06-01 10:00:00 25.0
2012-06-04 10:00:00 417.0
2012-06-05 10:00:00 810.0
2012-06-06 10:00:00 1196.0
Freq: B, dtype: float64
拼接多个数据源
data1 = DataFrame( np. ones( ( 6 , 3 ) , dtype= float ) ,
columns= [ 'a' , 'b' , 'c' ] ,
index= pd. date_range( '6/12/2012' , periods= 6 ) )
data2 = DataFrame( np. ones( ( 6 , 3 ) , dtype= float ) * 2 ,
columns= [ 'a' , 'b' , 'c' ] ,
index= pd. date_range( '6/13/2012' , periods= 6 ) )
spliced = pd. concat( [ data1. loc[ : '2012-06-14' ] , data2. loc[ '2012-06-15' : ] ] )
spliced
a b c 2012-06-12 1.0 1.0 1.0 2012-06-13 1.0 1.0 1.0 2012-06-14 1.0 1.0 1.0 2012-06-15 2.0 2.0 2.0 2012-06-16 2.0 2.0 2.0 2012-06-17 2.0 2.0 2.0 2012-06-18 2.0 2.0 2.0
data2 = DataFrame( np. ones( ( 6 , 4 ) , dtype= float ) * 2 ,
columns= [ 'a' , 'b' , 'c' , 'd' ] ,
index= pd. date_range( '6/13/2012' , periods= 6 ) )
spliced = pd. concat( [ data1. loc[ : '2012-06-14' ] , data2. loc[ '2012-06-15' : ] ] )
spliced
a b c d 2012-06-12 1.0 1.0 1.0 NaN 2012-06-13 1.0 1.0 1.0 NaN 2012-06-14 1.0 1.0 1.0 NaN 2012-06-15 2.0 2.0 2.0 2.0 2012-06-16 2.0 2.0 2.0 2.0 2012-06-17 2.0 2.0 2.0 2.0 2012-06-18 2.0 2.0 2.0 2.0
spliced_filled = spliced. combine_first( data2)
spliced_filled
a b c d 2012-06-12 1.0 1.0 1.0 NaN 2012-06-13 1.0 1.0 1.0 2.0 2012-06-14 1.0 1.0 1.0 2.0 2012-06-15 2.0 2.0 2.0 2.0 2012-06-16 2.0 2.0 2.0 2.0 2012-06-17 2.0 2.0 2.0 2.0 2012-06-18 2.0 2.0 2.0 2.0
spliced. update( data2, overwrite= False )
spliced
a b c d 2012-06-12 1.0 1.0 1.0 NaN 2012-06-13 1.0 1.0 1.0 2.0 2012-06-14 1.0 1.0 1.0 2.0 2012-06-15 2.0 2.0 2.0 2.0 2012-06-16 2.0 2.0 2.0 2.0 2012-06-17 2.0 2.0 2.0 2.0 2012-06-18 2.0 2.0 2.0 2.0
cp_spliced = spliced. copy( )
cp_spliced[ [ 'a' , 'c' ] ] = data1[ [ 'a' , 'c' ] ]
cp_spliced
a b c d 2012-06-12 1.0 1.0 1.0 NaN 2012-06-13 1.0 1.0 1.0 2.0 2012-06-14 1.0 1.0 1.0 2.0 2012-06-15 1.0 2.0 1.0 2.0 2012-06-16 1.0 2.0 1.0 2.0 2012-06-17 1.0 2.0 1.0 2.0 2012-06-18 NaN 2.0 NaN 2.0
收益指数和累计收益
price = prices[ 'AAPL' ]
price
2011-09-06 379.74
2011-09-07 383.93
2011-09-08 384.14
2011-09-09 377.48
2011-09-12 379.94
2011-09-13 384.62
2011-09-14 389.30
Name: AAPL, dtype: float64
price[ '2011-09-07' ] / price[ '2011-09-14' ] - 1
-0.013793989211405067
returns = price. pct_change( )
ret_index = ( 1 + returns) . cumprod( )
ret_index[ 0 ] = 1
ret_index
2011-09-06 1.000000
2011-09-07 1.011034
2011-09-08 1.011587
2011-09-09 0.994049
2011-09-12 1.000527
2011-09-13 1.012851
2011-09-14 1.025175
Name: AAPL, dtype: float64
m_returns = ret_index. resample( 'BM' ) . last( ) . pct_change( )
m_returns[ '2011' ]
2011-09-30 NaN
Freq: BM, Name: AAPL, dtype: float64
m_rets = ( 1 + returns) . resample( 'M' , kind= 'period' ) . prod( ) - 1
m_rets[ '2011' ]
2011-09 0.025175
Freq: M, Name: AAPL, dtype: float64
分组变换和分析
import random; random. seed( 0 )
import string
N = 1000
def rands ( n) :
choices = string. ascii_uppercase
return '' . join( [ random. choice( choices) for _ in range ( n) ] )
tickers = np. array( [ rands( 5 ) for _ in range ( N) ] )
M = 500
df = DataFrame( { 'Momentum' : np. random. randn( M) / 200 + 0.03 ,
'Value' : np. random. randn( M) / 200 + 0.08 ,
'ShortInterest' : np. random. randn( M) / 200 - 0.02 } ,
index= tickers[ : M] )
ind_names = np. array( [ 'FINANCIAL' , 'TECH' ] )
sampler = np. random. randint( 0 , len ( ind_names) , N)
industries = Series( ind_names[ sampler] , index= tickers,
name= 'industry' )
by_industry = df. groupby( industries)
by_industry. mean( )
Momentum Value ShortInterest industry FINANCIAL 0.029885 0.080125 -0.020667 TECH 0.029410 0.080572 -0.019598
by_industry. describe( )
Momentum Value ShortInterest count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max industry FINANCIAL 237.0 0.029885 0.005269 0.017254 0.026563 0.030006 0.033459 0.045238 237.0 0.080125 ... 0.083856 0.094185 237.0 -0.020667 0.004603 -0.036456 -0.023736 -0.020716 -0.017739 -0.006765 TECH 263.0 0.029410 0.004548 0.017233 0.026389 0.029338 0.032203 0.040698 263.0 0.080572 ... 0.083651 0.097748 263.0 -0.019598 0.005081 -0.033080 -0.022999 -0.019652 -0.016281 -0.005551
2 rows × 24 columns
def zscore ( group) :
return ( group- group. mean( ) ) / group. std( )
df_stand = by_industry. apply ( zscore)
df_stand. groupby( industries) . agg( [ 'mean' , 'std' ] )
Momentum Value ShortInterest mean std mean std mean std industry FINANCIAL 2.389088e-16 1.0 -7.045466e-16 1.0 -5.801735e-16 1.0 TECH 1.848965e-16 1.0 1.228844e-15 1.0 -7.273438e-16 1.0
ind_rank = by_industry. rank( ascending= False )
ind_rank. groupby( industries) . agg( [ 'min' , 'max' ] )
Momentum Value ShortInterest min max min max min max industry FINANCIAL 1.0 237.0 1.0 237.0 1.0 237.0 TECH 1.0 263.0 1.0 263.0 1.0 263.0
by_industry. apply ( lambda x: zscore( x. rank( ) ) )
Momentum Value ShortInterest MYNBI 1.117454 0.617886 -0.657326 QPMZJ -0.289223 -0.972842 -0.092026 PLSGQ -1.152272 -0.933486 0.262543 EJEYD 0.233372 -1.181444 1.079344 TZIRW 0.499568 0.644179 0.433835 ... ... ... ... JPHKQ -1.630168 -0.105172 0.499568 VACPK -1.196333 -0.709912 -1.012282 MHNBS -1.356472 0.700115 0.656358 YBNCI 1.093929 0.845972 0.933486 GXKFD -1.327301 -1.400229 -0.306300
500 rows × 3 columns
分组因子暴露
from numpy. random import rand
fac1, fac2, fac3 = np. random. rand( 3 , 1000 )
ticker_subset = tickers. take( np. random. permutation( N) [ : 1000 ] )
port = Series( 0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand( 1000 ) ,
index= ticker_subset)
factors = DataFrame( { 'f1' : fac1, 'f2' : fac2, 'f3' : fac3} ,
index= ticker_subset)
factors. corrwith( port)
f1 0.435330
f2 -0.677363
f3 0.186326
dtype: float64
import statsmodels. api as sm
result = sm. OLS( endog= port, exog= factors) . fit( )
result. summary( )
OLS Regression Results
Dep. Variable: y R-squared (uncentered): 0.761 Model: OLS Adj. R-squared (uncentered): 0.760 Method: Least Squares F-statistic: 1057. Date: Fri, 04 Mar 2022 Prob (F-statistic): 4.67e-309 Time: 14:54:06 Log-Likelihood: -277.77 No. Observations: 1000 AIC: 561.5 Df Residuals: 997 BIC: 576.3 Df Model: 3 Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975] f1 0.9950 0.029 34.785 0.000 0.939 1.051 f2 -0.8626 0.029 -30.150 0.000 -0.919 -0.806 f3 0.5914 0.030 19.917 0.000 0.533 0.650
Omnibus: 33.929 Durbin-Watson: 1.974 Prob(Omnibus): 0.000 Jarque-Bera (JB): 16.242 Skew: -0.054 Prob(JB): 0.000297 Kurtosis: 2.385 Cond. No. 3.20
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
def beta_exposure ( chunk, factors= None ) :
return sm. OLS( endog= chunk, exog= factors) . fit( )
by_ind = port. groupby( industries)
exposures = by_ind. apply ( beta_exposure, factors= factors)
exposures. unstack( )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
C:\windows in <module>
ValueError: The indices for endog and exog are not aligned
十分位和四分位分析
找不到数据,没完成。
'''
price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
volume = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl')
#data = price.loc['2006-01-01']
price
'''
import pandas_datareader. data as web
all_data = { ticker: web. get_data_yahoo( ticker) for ticker in [ 'AAPL' , 'IBM' , 'MSFT' , 'GOOG' ] }
for ticker, data in all_data. items( ) } )
for ticker, data in all_data. items( ) } )
all_data
File "C:\windows.py", line 11
for ticker, data in all_data.items()})
^
IndentationError: unexpected indent
px = data[ 'AAPL' ]
returns = px. pct_change( )
def to_index ( rets) :
index = ( 1 + rets) . cumprod( )
first_loc = max ( index. notnull( ) . argmax( ) - 1 , 0 )
index. values[ first_loc] = 1
return index
def trend_signal ( rets, lookback, lag) :
signal = rets. rolling( window= lookback, min_periods= lookback- 5 ) . sum ( )
return signal. shift( lag)
signal = trend_signal( returns, 100 , 3 )
trade_friday = signal. resample( 'W-FRI' ) . resample( 'B' ) . ffill( )
trade_rets = trade_friday. shift( 1 ) * returns
to_index( trade_rets) . plot( )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\windows in <module>
TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'RangeIndex'
vol = returns. rolling( window= 250 , min_periods= 200 ) . std( ) * np. sqrt( 250 )
def sharpe ( rets, ann= 250 ) :
return rets. mean( ) / rets. std( ) * np. sqrt( ann)
trade_rets. groupby( pd. qcut( vol, 4 ) ) . agg( sharpe)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
C:\windows in <module>
NameError: name 'trade_rets' is not defined
更多示例应用
信号前沿分析
names = [ 'AAPL' , 'GOOG' , 'MSFT' , 'DELL' , 'GS' , 'MS' , 'BAC' , 'C' ]
pxx = load_pkl( "E:\python_study_files\python\pydata-notebook-master\examples\yahoo_price.pkl" )
pxx
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
C:\windows\ in <module>
AttributeError: module 'pandas' has no attribute 'load_pkl'
期货合约转仓