借助tushare,计算各个因子,然后根据OLS回归,计算各股票因子分值,排序进行股票购买
导包:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
import tushare as ts
token = token
pro = ts.pro_api(token)
import statsmodels.api as sm
import talib as ta
获取数据
def get_HS300(start, end, listed_before=20190601):
    """Return CSI 300 constituent codes that pass the listing-age and ST filters.

    Constituents are read from ``pro.index_weight`` for index ``399300.SZ``
    between ``start`` and ``end``. A code is kept only when it also appears in
    the currently listed universe from ``pro.stock_basic`` after removing
    stocks whose ``list_date`` is not earlier than ``listed_before`` and stocks
    whose name starts with ``ST`` or ``*ST``.

    Args:
        start: Value forwarded to ``index_weight`` as ``start_date``.
        end: Value forwarded to ``index_weight`` as ``end_date``.
        listed_before: Integer ``YYYYMMDD`` cutoff; only stocks listed strictly
            before this date are kept. Defaults to the original hard-coded
            value ``20190601``.

    Returns:
        list: Unique ``ts_code`` strings in first-seen order.
    """
    weights = pro.index_weight(index_code='399300.SZ', start_date=start, end_date=end)

    # Drop recently listed stocks and ST stocks.
    basics = pro.stock_basic(exchange='', list_status='L')
    seasoned = basics['list_date'].astype(int) < listed_before
    is_st = basics['name'].apply(lambda name: name.startswith(('ST', '*ST')))
    # `~` is the explicit boolean negation for a mask.
    eligible = set(basics.loc[seasoned & ~is_st, 'ts_code'])

    # The weight table can list the same constituent once per snapshot date in
    # the requested range; dict.fromkeys removes repeats while keeping order so
    # each stock is downloaded and regressed only once downstream.
    return [code for code in dict.fromkeys(weights['con_code']) if code in eligible]
def get_index(start, end):
    """Return the daily ``pct_chg`` series of index ``399300.SZ``.

    Args:
        start: Value forwarded to ``pro.index_daily`` as ``start_date``.
        end: Value forwarded to ``pro.index_daily`` as ``end_date``.

    Returns:
        The ``pct_chg`` column, indexed by the parsed ``trade_date`` and
        sorted from oldest to newest.
    """
    daily = pro.index_daily(ts_code='399300.SZ', start_date=start, end_date=end)
    # Use the parsed trade date as the index so it can be joined with stocks.
    daily.index = pd.to_datetime(daily['trade_date'])
    daily = daily.sort_index(ascending=True)
    return daily['pct_chg']
计算因子值
def get_change(lists, start, end):
    """Build, for every stock code, a frame of daily return and raw factors.

    For each code the daily quotes (``pro.daily``) are combined with
    ``ps_ttm`` from ``pro.daily_basic`` and three factor inputs are derived:

    * ``EMAC12``: 12-period EMA of the close divided by the close.
    * ``ARBR``: ``AR - BR``, where ``AR`` is the 26-period sum of
      ``high - open`` over the 26-period sum of ``open - low`` (times 100) and
      ``BR`` is the 26-period sum of ``high - previous close`` over the
      26-period sum of ``previous close - low`` (times 100).
    * ``ps_ttm``: taken as returned by ``daily_basic``.

    Args:
        lists: Iterable of ``ts_code`` strings.
        start: Start date; converted to ``str`` before being sent to the API.
        end: End date; converted to ``str`` before being sent to the API.

    Returns:
        list: One DataFrame per input code, in input order, indexed by trade
        date (ascending) with columns ``pct_chg``, ``EMAC12``, ``ARBR`` and
        ``ps_ttm``. Rows containing NaN or infinite values are dropped.
    """
    # Accept integer dates such as 20200401 as well as 'YYYYMMDD' strings.
    start = None if start is None else str(start)
    end = None if end is None else str(end)

    stock_list = []
    for code in lists:
        quotes = pro.daily(ts_code=code, start_date=start, end_date=end)
        basics = pro.daily_basic(ts_code=code, start_date=start, end_date=end)
        # Join on the trade date instead of the row position: if either
        # response lacks a day, positional alignment would silently pair
        # ps_ttm with the wrong date.
        df = pd.merge(quotes, basics[['trade_date', 'ps_ttm']], on='trade_date')
        df.index = pd.to_datetime(df['trade_date'])
        df = df.sort_index(ascending=True)

        prev_close = df.close.shift(1)
        df['EMAC12'] = ta.EMA(df.close, timeperiod=12) / df.close
        df['HO'] = df.high - df.open
        df['OL'] = df.open - df.low
        df['HCY'] = df.high - prev_close
        df['CYL'] = prev_close - df.low
        df['AR'] = ta.SUM(df.HO, timeperiod=26) / ta.SUM(df.OL, timeperiod=26) * 100
        df['BR'] = ta.SUM(df.HCY, timeperiod=26) / ta.SUM(df.CYL, timeperiod=26) * 100
        df['ARBR'] = df['AR'] - df['BR']

        # A zero denominator above yields +/-inf, which dropna() alone keeps.
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
        # Copy so callers can add or drop columns on an independent frame.
        stock_list.append(df[['pct_chg', 'EMAC12', 'ARBR', 'ps_ttm']].copy())
    return stock_list
标准化、去极值(MAD法)与归一化:
def MAD(data, n_mad=3):
    """Standardize, winsorize by median absolute deviation, and min-max scale.

    Every column is processed independently in three steps:

    1. z-score using the column mean and standard deviation;
    2. clip to ``median +/- n_mad * 1.4826 * MAD``, where ``MAD`` is the
       median absolute deviation from the column median;
    3. rescale linearly so the column minimum maps to 0 and the maximum to 1.

    Args:
        data: Anything ``pd.DataFrame`` accepts (Series, DataFrame, array).
        n_mad: Half-width of the clipping band in scaled-MAD units. The
            default ``3`` reproduces the original fixed cutoff.

    Returns:
        pd.DataFrame: Same shape and labels as ``pd.DataFrame(data)``. A column
        with zero spread comes back as NaN because its scaling range is zero.
    """
    frame = pd.DataFrame(data)
    frame = (frame - frame.mean()) / frame.std()

    for pos in range(frame.shape[1]):
        column = frame.iloc[:, pos]
        center = np.median(column)
        spread = np.median(np.abs(column - center))
        half_width = n_mad * 1.4826 * spread
        # Write the clipped values back through the frame itself. Assigning
        # into a temporary column selection (frame.iloc[:, pos][mask] = ...)
        # is chained assignment and leaves the frame untouched when pandas
        # Copy-on-Write is active.
        frame.iloc[:, pos] = column.clip(
            lower=center - half_width, upper=center + half_width
        ).to_numpy()

    return (frame - frame.min()) / (frame.max() - frame.min())
获取成分股数据,并计算标准化后的因子:
# Stock universe: index constituents that pass the filters in get_HS300.
codes = get_HS300('20200601', '20231203')
# tushare date parameters are 'YYYYMMDD' strings, so pass strings, not ints.
x = get_change(codes, '20200401', '20200619')
# Daily risk-free rate derived from 4% per year over 360 days. It is scaled by
# 100 because tushare reports pct_chg in percent, and the two are subtracted.
rf = (1.04 ** (1 / 360) - 1) * 100
for i in x:
    # Excess return of the stock over the risk-free rate.
    i['change'] = i['pct_chg'] - rf
    # Replace each raw factor with its MAD-processed, 0-1 scaled version.
    i['EMAC12_normal'] = MAD(i['EMAC12'])
    i['ARBR_normal'] = MAD(i['ARBR'])
    i['ps_ttm_normal'] = MAD(i['ps_ttm'])
    i.drop(['pct_chg', 'EMAC12', 'ARBR', 'ps_ttm'], axis=1, inplace=True)
x[0].head()
获取基准指数(沪深300)数据,并与各股票的因子数据合并:
# Benchmark excess return: index daily pct_chg minus the risk-free rate.
HS300_index = get_index('20200513', '20231209') - rf
HS300_index.head()

# Column labels applied, in order, to: excess return, the three normalized
# factors, and the benchmark excess return.
column_labels = ['日涨跌', 'EMAC12因子', 'ARBR因子', '滚动市销率因子', '市场风险溢价因子']
stocks = []
for factor_frame in x:
    # Inner join on the date index keeps only days present on both sides.
    stock = pd.merge(factor_frame, HS300_index, left_index=True, right_index=True)
    stock.columns = column_labels
    stocks.append(stock)
stocks[0].head()
ols计算回归
# Regress each stock's excess return on its factors plus the market factor and
# collect the fitted coefficients, one column per stock code.
factor_columns = ['EMAC12因子', 'ARBR因子', '滚动市销率因子', '市场风险溢价因子']
params_by_code = {}
for code, stock in zip(codes, stocks):
    try:
        # Passing the DataFrame (not .values) keeps the factor names as
        # coefficient labels; the intercept is still labeled 'const'.
        exog = sm.add_constant(stock[factor_columns])
        params_by_code[code] = sm.OLS(stock['日涨跌'], exog).fit().params
    except Exception as err:
        # Best effort: skip stocks that cannot be fitted (for example, no
        # overlapping rows), but report them instead of hiding the failure.
        print('OLS skipped for {}: {!r}'.format(code, err))
# Build the frame once rather than inserting hundreds of columns one by one.
results = pd.DataFrame(params_by_code)
results.head()
根据回归常数项(截距)判断离散程度,按常数项升序排序,选择常数项最大的股票(位于排序结果末尾)构建股票池:
# Order the stock columns by the value in the 'const' row, smallest first.
z = results.sort_values(by='const', axis=1, ascending=True)
z.head()