# 导包,收集数据
import numpy as np
import pandas as pd
import talib
import warnings
warnings.filterwarnings('ignore')
import tushare as ts
# Fetch the 'hs300' bars (ktype='D') from tushare, index them by the 'date'
# column and keep only the four OHLC price columns.
data = (
    ts.get_k_data(
        code='hs300',
        start='2005-04-08',
        end='2023-11-08',
        ktype='D',
    )
    .set_index('date')[['open', 'high', 'low', 'close']]
)

# Report the number of rows and preview both ends of the series.
print('样本数目:%d' % len(data))
print(data.head(10))
print('-' * 45)
print(data.tail(10))
# 数据准备,制定因子:
# Raw price arrays handed to TA-Lib; pulled out once so every indicator
# below reads from the same inputs.
close_px = data['close'].values
high_px = data['high'].values
low_px = data['low'].values

# Feature columns used by the classifier further down:
#   ema    - 20-period exponential moving average of the close
#   stddev - 20-period standard deviation of the close (nbdev=1)
#   slope  - 5-period linear-regression slope of the close
#   rsi    - 14-period RSI of the close
#   wr     - 7-period Williams %R from high / low / close
data['ema'] = talib.EMA(close_px, timeperiod=20)
data['stddev'] = talib.STDDEV(close_px, timeperiod=20, nbdev=1)
data['slope'] = talib.LINEARREG_SLOPE(close_px, timeperiod=5)
data['rsi'] = talib.RSI(close_px, timeperiod=14)
data['wr'] = talib.WILLR(high_px, low_px, close_px, timeperiod=7)

# Bare expression: only an interactive/notebook front end displays this value.
data.tail(10)
# 计算涨跌幅:
# 'pct' is the return from this row's close to the next row's close, so the
# last row has no value (NaN) because there is no following close.
next_close = data['close'].shift(-1)
data['pct'] = next_close / data['close'] - 1.0

# Binary label: 1 when the next-row return is strictly positive, otherwise 0
# (a NaN return compares False and therefore maps to 0).
data['rise'] = (data['pct'] > 0).astype('int64')

# Drop every row that still contains a missing value in any column.
data = data.dropna(axis=0, how='any')

# Bare expression: only an interactive/notebook front end displays this value.
data.tail(10)
# 训练模型:
# Split the rows in their existing order: the first 80% become the training
# set and the remaining 20% the test set (no shuffling).
num_train = round(len(data) * 0.8)

# Take explicit copies rather than bare iloc slices. Later code adds columns
# to data_train / data_test and replaces the test index; doing that on a
# slice of `data` is the chained-assignment pattern pandas warns about
# (SettingWithCopyWarning), and the warning is silenced globally in this file.
# Independent copies make those writes well-defined and leave `data` untouched.
data_train = data.iloc[:num_train, :].copy()
data_test = data.iloc[num_train:, :].copy()

# Columns fed to the classifier, listed once so both partitions stay in sync.
feature_columns = ['ema', 'stddev', 'slope', 'rsi', 'wr']

# Training features (NumPy array) and labels (Series).
X_train = data_train[feature_columns].values
y_train = data_train['rise']

# Test features (NumPy array) and labels (Series).
X_test = data_test[feature_columns].values
y_test = data_test['rise']

# Preview the first ten feature rows of each partition.
print(X_train[:10])
print(45*'-')
print(X_test[:10])
# 标准化:
from sklearn.preprocessing import StandardScaler


def _print_train_stats(features):
    """Print the per-column mean and standard deviation of *features*."""
    print('训练集的均值:')
    print(features.mean(axis=0))
    print('训练集的标准差:')
    print(features.std(axis=0))


# Training-set statistics before scaling.
print('---标准化之前---')
_print_train_stats(X_train)

# Fit the scaler on the training features only, then apply that same
# transform to both partitions so the test set is scaled with training
# statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Training-set statistics after scaling.
print('---标准化之后---')
_print_train_stats(X_train)

from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the scaled features.
classifier = SVC(C=1.0, kernel='rbf')
classifier.fit(X_train, y_train)
print(classifier)
# 测试模型:
# Predict labels for both partitions and store them next to the true labels.
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)
data_train['pred'] = y_train_pred
data_test['pred'] = y_test_pred

# Accuracy in percent: rows whose prediction equals the true label, divided
# by the number of rows in that partition.
train_hits = int((data_train['rise'] == data_train['pred']).sum())
test_hits = int((data_test['rise'] == data_test['pred']).sum())
accuracy_train = 100 * train_hits / len(data_train)
accuracy_test = 100 * test_hits / len(data_test)

print('训练集预测准确率:%.2f%%' % accuracy_train)
print('测试集预测准确率:%.2f%%' % accuracy_test)
# 假设指数可以多空交易:如果模型预测为1(上涨),第二天策略的收益率就是指数的涨幅;
# 如果模型预测为0(下跌),第二天策略的收益率就是指数涨幅的相反数。
# 得到每天的日收益率之后,通过DataFrame自带的累乘函数cumprod,
# 即可得到择时策略和沪深300指数的净值曲线。
# 为简便起见,不考虑交易费率,并假设按收盘价成交。
import matplotlib.pyplot as plt

# Daily strategy return: take the index return as-is when the model predicts
# a rise (pred > 0), otherwise take its negative (short position).
predicted_up = data_test['pred'] > 0
data_test['strategy_pct'] = data_test['pct'].where(predicted_up, -data_test['pct'])

# Net-value curves: cumulative product of (1 + daily return) for the strategy
# and for the index itself.
data_test['strategy'] = data_test['strategy_pct'].add(1.0).cumprod()
data_test['hs300'] = data_test['pct'].add(1.0).cumprod()

# Rough annualised return: compound the final net value to a 250-row year.
final_net_value = data_test['strategy'].iloc[-1]
periods_per_sample = 250 / len(data_test)
annual_return = 100 * (final_net_value ** periods_per_sample - 1.0)
print('SVM 沪深300指数择时策略的年化收益率:%.2f%%' % annual_return)

# Convert the string index to datetimes so the x-axis is rendered as dates.
data_test.index = pd.to_datetime(data_test.index)

ax = data_test[['strategy', 'hs300']].plot(
    figsize=(16, 9),
    color=['SteelBlue', 'Red'],
    title='SVM 沪深300指数择时策略净值',
)
plt.show()