使用GRU预测股票数据
Result Picture
通过下图可看出预测结果有初步的可参考性,但有十分明显的问题,个人想法及讨论放在最后一步。
Back Ground
本文提供完整训练代码,数据来源为上一篇文章中保存进pickle文件的已切割数据
Data Prepare
Load data
import pickle

# Load the pre-split train/val dataset saved by the previous article.
with open('./dataset_trainval.pickle', 'rb') as fh:
    dataset_all = pickle.load(fh)
print('check dataset keys', dataset_all.keys())
check dataset keys dict_keys(['X_train', 'X_val', 'y_train', 'y_val'])
Dataset and DataLoader
from torch.utils.data import Dataset, DataLoader
class StockDataset(Dataset):
    """Wrap pre-windowed arrays so a DataLoader can batch (window, label) pairs."""

    def __init__(self, data_train, data_label):
        self.data_train = data_train
        self.data_label = data_label

    def __len__(self):
        # One label per window.
        return len(self.data_label)

    def __getitem__(self, idx):
        # Full (features, window_len) slice for sample idx plus its target value.
        window = self.data_train[idx, :, :]
        target = self.data_label[idx]
        return window, target
# Unpack the four splits and wrap each split in a Dataset/DataLoader pair.
X_train, X_val, y_train, y_val = (
    dataset_all['X_train'],
    dataset_all['X_val'],
    dataset_all['y_train'],
    dataset_all['y_val'],
)
stock_dataset_train = StockDataset(X_train, y_train)
stock_dataloader_train = DataLoader(stock_dataset_train, batch_size=32, shuffle=True)
stock_dataset_val = StockDataset(X_val, y_val)
stock_dataloader_val = DataLoader(stock_dataset_val, batch_size=32, shuffle=True)

# module test
for batch_x, batch_y in stock_dataloader_train:
    print('check input tensor shapes', batch_x.shape, batch_y.shape)
    break
check input tensor shapes torch.Size([32, 2, 30]) torch.Size([32])
Model define
import torch
import torch.nn as nn
import tqdm
import os
FEATURE_SIZE = 2      # input features per day: close price, high-low diff
GRU_I_SIZE = 4        # GRU input size only when the (disabled) pre-GRU Linear is used
GRU_H_SIZE = 8
GRU_NUM_LAYERS = 1
LEARNING_RATE = 0.01


class StockGRU(nn.Module):
    """Single-layer GRU regressor.

    Input:  x of shape (batch, FEATURE_SIZE, seq_len), e.g. (32, 2, 30).
    Output: 1-D tensor of shape (batch,) with the predicted next-day value
            (a 0-dim tensor when batch == 1, as in the original code).
    """

    def __init__(self):
        super(StockGRU, self).__init__()
        """ parameters """
        self.feature_size = FEATURE_SIZE
        self.gru_i_size = GRU_I_SIZE
        self.gru_h_size = GRU_H_SIZE
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        """ layers """
        # self.fc_pregru = nn.Linear(FEATURE_SIZE, GRU_I_SIZE)
        self.dp_pregru = nn.Dropout(0.2)
        # BUGFIX: with fc_pregru disabled the GRU consumes the raw 2 features,
        # so its input size must be FEATURE_SIZE, not GRU_I_SIZE (4) — the
        # original raised a shape error on the (batch, 30, 2) input.
        self.gru = nn.GRU(FEATURE_SIZE, GRU_H_SIZE, num_layers=GRU_NUM_LAYERS, batch_first=True)
        self.gru_bn = nn.BatchNorm1d(GRU_H_SIZE)  # NOTE: defined but unused in forward()
        self.fc_output = nn.Linear(GRU_H_SIZE, 1)

    def forward(self, x):
        # (batch, features, seq) -> (batch, seq, features) for a batch_first GRU.
        x = x.permute(0, 2, 1)
        # Initial hidden state on the *input's* device, so the model still works
        # after being moved between CPU and GPU during training/checkpointing.
        gru_h_init = torch.zeros((GRU_NUM_LAYERS, x.shape[0], GRU_H_SIZE), device=x.device)
        # x_gru_i = self.dp_pregru(self.fc_pregru(x))
        x_gru_i = x
        # BUGFIX: the original passed the undefined name `x_gru_h` here
        # (NameError); the intended initial hidden state is gru_h_init.
        _, x_gru_o = self.gru(x_gru_i, gru_h_init)
        # x_gru_o: (num_layers, batch, hidden) -> fc gives (num_layers, batch, 1).
        x_out = self.fc_output(x_gru_o)
        # Keep layer 0 and drop the trailing dim -> (batch,).
        return x_out[0, :, :].squeeze()
细节说明
-
对模型结构,超参数,做了细微的调整之后,确定了当前的简单架构
尝试了以下几种方案
a. 在GRU layer
前加Linear
将原始的两个简单特征映射到高维特征空间,以期望得到更丰富的特征表示
b. 对GRU的hidden_size \ input_size \ layer_num
等超参进行微调,调整模型的大小以学习到更丰富的信息
c. 加Dropout
抑制过拟合问题。结果：a、b两方案都有非常严重的过拟合、loss曲线剧烈抖动等问题出现,自己推断是原始数据中有效特征实在是太少,贸然增大模型的容量带来的只能是模型学习到大量的噪声。
c方案也无法完全消除过拟合问题。
Train main part
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = StockGRU().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()
loss_train_list, loss_val_list = [], []
model_all_path = './model'
# BUGFIX: create the checkpoint folder up front — torch.save raises if the
# directory does not exist (the write-up says the folder is created, the
# original code never did it).
os.makedirs(model_all_path, exist_ok=True)

for epoch in range(150):
    # ---- training pass ----
    model.train()
    loss_train = 0.0
    # Renamed loop vars: the original shadowed the module-level X_train/y_train.
    for i, (X_batch, y_batch) in enumerate(stock_dataloader_train):
        X_batch = X_batch.to(torch.float32).to(device)
        y_batch = y_batch.to(torch.float32).to(device)
        y_predict = model(X_batch)
        loss = criterion(y_predict, y_batch)
        loss_train += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    loss_train /= len(stock_dataloader_train)
    loss_train_list.append(loss_train)

    # Save a CPU copy of the whole model every epoch so any fixed checkpoint
    # can be picked for testing later; the model is moved back to `device`
    # right below for validation.
    model_path_t = os.path.join(model_all_path, 'ep{}.pkl'.format(epoch+1))
    torch.save(model.cpu(), model_path_t)
    print('train--epoch {:d}, loss {:f}'.format(epoch+1, loss_train))

    # ---- validation pass ----
    model.eval().to(device)
    loss_val = 0.0
    # BUGFIX: validation needs no gradients; no_grad avoids building the
    # autograd graph (memory + speed).
    with torch.no_grad():
        for i, (X_vbatch, y_vbatch) in enumerate(stock_dataloader_val):
            X_vbatch = X_vbatch.to(torch.float32).to(device)
            y_vbatch = y_vbatch.to(torch.float32).to(device)
            y_predict = model(X_vbatch)
            loss = criterion(y_predict, y_vbatch)
            loss_val += loss.item()
    loss_val /= len(stock_dataloader_val)
    loss_val_list.append(loss_val)
    print('val--epoch {:d}, loss {:f}'.format(epoch+1, loss_val))
细节说明:
- 为了方便测试中可随意挑选固定参数的model,创建model文件夹,保存每一个epoch的model参数文件
- 训练过程中发现到150轮次左右,模型不再收敛,loss曲线在后续出现了明显尖峰,最终epoch_num定到了150
Test
测试部分,得到45支白酒股票的2023.1.1~2023.3.25的日k线数据,单步滑窗,对模型结果进行预测
Test data process
在Train
和Val
数据中,为了让训练集内部数据不要出现过多重合,同时又要保证数据的量足够大,滑窗取到了5,也就是用6周的数据作窗,1周作为滑窗步长(周六日无交易信息,1周5天)
在Test
数据中,因为要画图观察最终结果,需要连贯曲线,滑窗取到了1
# parameters
WIN_LEN = 30 # input 30 days
WIN_STEP = 1 # 1 overlap
FEAT_NUM = 2 # h-l, close
import numpy as np
# NOTE(review): data_concat_all is initialized but never used below — likely a
# leftover from the train/val script where all stocks' windows were concatenated.
data_concat_all = np.zeros((1, FEAT_NUM, WIN_LEN+1))
# Build per-stock sliding windows for the test set. `stock_all` and `win_split`
# come from the previous article's preprocessing code (not shown here).
for key in stock_all.keys():
    stock_dict = stock_all[key]
    # Stack close price and high-low difference into a (2, num_days) array.
    close_list = np.array(stock_dict['close'])
    h_l_diff_list = np.array(stock_dict['h_l_diff'])
    data_concat = np.vstack((close_list, h_l_diff_list))
    # Step-1 sliding windows; each window presumably holds WIN_LEN input days
    # plus 1 label day (inferred from the [:, :, :30] / [:, 0, -1] slicing in
    # the normalization step) — confirm against win_split's definition.
    data_split = win_split(data_in = data_concat, win_len = WIN_LEN, win_step = WIN_STEP)
    # Replace the raw per-stock dict with its windowed array, in place.
    stock_all[key] = data_split
细节说明:
- 滑窗为1,原因如上所述
- 与
trainval_dataset
不同,因为要每支股票单独出结果,所以无concatenate操作
Normalization
# Normalize each stock's windows and split them into model input vs. label.
stock_test_all = {}
for key in stock_all.keys():
    # `normalization` comes from the previous article's preprocessing code.
    stock_out = normalization(stock_all[key])
    # IMPROVED: use WIN_LEN instead of the hard-coded 30 so the slice always
    # matches the window size configured above.
    data_stock = stock_out[:, :, :WIN_LEN]
    # Label = normalized close price (feature 0) on the day after the window.
    label_stock = stock_out[:, 0, -1]
    stock_test_all[key] = {}
    stock_test_all[key]['data'] = data_stock
    stock_test_all[key]['label'] = label_stock

# Persist the normalized test set, mirroring the train/val pickle format.
with open('./dataset_test.pickle', 'wb') as f:
    pickle.dump(stock_test_all, f)
说明:与trainval_dataset
类似,归一化数据,保存到pickle文件中
Test main part
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_path = f'./model/ep{EP_num}.pkl'
# BUGFIX: the original built a fresh StockGRU() that was immediately discarded —
# torch.load restores the whole pickled model. map_location lets a checkpoint
# saved from GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles arbitrary code — only load trusted files.
model = torch.load(model_path, map_location=device)
criterion = nn.MSELoss()
loss_train_test = []
# NOTE(review): `dataset_all` here must be the per-stock test dict from
# dataset_test.pickle, not the train/val dict loaded earlier — confirm it is
# reloaded before this point.
for key in dataset_all:
    key_dataset = dataset_all[key]
    X_test, y_test = key_dataset['data'], key_dataset['label']
    dataset_test = StockDataset(X_test, y_test)
    # One batch holding the whole stock, unshuffled, so predictions stay in
    # date order for plotting.
    dataloader_test = DataLoader(dataset_test, batch_size=X_test.shape[0], shuffle=False)
    model.eval().to(device)
    loss_test = 0.0
    # Inference only — no gradients needed.
    with torch.no_grad():
        for i, (X_b, y_b) in enumerate(dataloader_test):
            X_b = X_b.to(torch.float32).to(device)
            y_b = y_b.to(torch.float32).to(device)
            y_predict = model(X_b)
            loss = criterion(y_predict, y_b)
            loss_test += loss.item()
            # Store predictions back into the dict for later plotting.
            key_dataset['predict'] = y_predict.tolist()
    loss_test /= len(dataloader_test)
    # BUGFIX: store the averaged loss (the original saved it before dividing,
    # so the stored and printed values could disagree).
    key_dataset['loss'] = loss_test
    loss_train_test.append(loss_test)
    print('test--name {:s}, loss {:f}'.format(key, loss_test))
细节说明:
- 遍历每支股票的数据,
DataLoader
中,batch_size
取一只股票的数据长度,且Shuffle=False
,以防顺序错乱,无法画图 - 每支股票的数据保存进原来的
dict
中,方便读取观察最终结果
Draw pictures
import matplotlib.pyplot as plt
import numpy as np

# Plot one stock: the first 30 days are the shared input window, after which
# the real and predicted curves diverge.
key = '600809.SH'
key_dataset = dataset_all[key]
X_test, y_test, predict_list, loss = key_dataset['data'], key_dataset['label'], key_dataset['predict'], key_dataset['loss']
# Prefix both curves with the first window's close-price series (feature 0)
# so each plotted line is continuous from day 0.
real_stock = np.append(X_test[0,0,:], y_test)
predict_stock = np.append(X_test[0,0,:], np.array(predict_list))
fig = plt.figure(figsize=(20,10))
plt.title(key, fontsize='xx-large')
plt.plot(real_stock, label='real')
plt.plot(predict_stock, label='predict')
plt.legend(fontsize = 'xx-large')
plt.show()
说明:
画图,前30天为股票输入数据,后续分叉开两条曲线,为真实股价与预测股价
Analysis Results
走读所有数据后发现,每支股票都出现了预测值滞后真实值的现象,且现象就像是输入的最后一天股价的上下轻微波动。
个人理解,单纯使用简单的,无法解释的时序神经网络,以‘日k’作颗粒度,预测真实世界中极度抽象的股票价格,准确性一定是无法保证的,只能拿来简单看看做参考。
想要得到准确结果,单日小时甚至分钟、秒级别的价格走势、可量化以及可抽象的其他信息比如经济形势,国际关系等,都需要作为元信息提供给复杂模型比如Transformer
或其他变种架构。