银行股价预测——基于pytorch框架RNN神经网络
任务目标
基于csv数据,建立RNN模型,预测股价
数据来源
自己切割的一份股价数据,无需付费直接下载,链接如下:
数据集下载
完整代码
首先贴上完整代码,可自行理解,下文慢慢解读
import pandas as pd
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Data preprocessing
def process_data(type, time_step=8, batch_size=1):
    """Build a DataLoader of sliding-window samples from a price CSV.

    Reads ``./train.csv`` or ``./val.csv``, takes the ``close`` column as the
    stock price, divides it by its own maximum, and converts it into samples
    where ``time_step`` consecutive prices are the input and the following
    price is the target.

    Args:
        type: Which split to load, ``'train'`` or ``'val'``. The name shadows
            the builtin but is kept so existing keyword callers still work.
        time_step: Number of consecutive prices in each input window
            (default 8: the first 8 prices predict the 9th).
        batch_size: Number of samples per batch (default 1).

    Returns:
        A ``DataLoader`` yielding ``(X, y)`` batches with ``X`` of shape
        ``(batch, time_step, 1)`` and ``y`` of shape ``(batch, 1)``, both
        float32. The training split is shuffled; the validation split keeps
        its original order.

    Raises:
        ValueError: If ``type`` is not ``'train'`` or ``'val'``, or if the
            CSV has too few rows to form a single sample.
    """
    csv_paths = {'train': './train.csv', 'val': './val.csv'}
    if type not in csv_paths:
        # An unknown split used to fall through both branches and fail later
        # with an unrelated "data is not defined" error.
        raise ValueError(
            "type must be 'train' or 'val', got {!r}".format(type))
    data = pd.read_csv(csv_paths[type])

    # The closing price is used as the stock price.
    price = data.loc[:, 'close']
    # Normalise to speed up convergence.
    # NOTE: each split is divided by its OWN maximum, so predictions have to
    # be de-normalised with the maximum of the same split.
    price_norm = price / price.max()

    def etract_data(series, time_step):
        """Slice a pandas Series into (window, next value) pairs."""
        # Work on a positional tensor: ``series[i + time_step]`` is a
        # label-based lookup and only works with a default RangeIndex.
        values = torch.tensor(series.to_numpy(), dtype=torch.float32)
        n_samples = len(values) - time_step
        if n_samples < 1:
            raise ValueError(
                "need more than {} rows to build a sample, got {}".format(
                    time_step, len(values)))
        # unfold() yields every window of length ``time_step``; the last one
        # has no following value to predict, so it is dropped.
        X = values.unfold(0, time_step, 1)[:n_samples].contiguous()
        X = X.reshape(-1, time_step, 1)
        y = values[time_step:].reshape(-1, 1)
        return X, y

    X, y = etract_data(price_norm, time_step)
    dataset = TensorDataset(X, y)
    # Shuffle only the training split; validation stays in file order.
    shuffle = type == 'train'
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
# One DataLoader per phase, keyed by the phase name used in the training loop.
dataloader = {phase: process_data(phase) for phase in ('train', 'val')}
# Network design: a single-hidden-layer RNN followed by a linear read-out.
input_size, hidden_size, output_size = 1, 20, 1
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Initial hidden state of the RNN: (num_layers=1, batch_size=1, hidden_size).
hidden_prev = torch.zeros(1, 1, hidden_size, device=device)


class Net(nn.Module):
    """Single-layer RNN that predicts the next price from a price window.

    Args:
        input_size: Features per time step (1: the price itself).
        hidden_size: Number of hidden units of the RNN.
        output_size: Size of the prediction (1: the next price).
    """

    def __init__(self, input_size=input_size, hidden_size=hidden_size,
                 output_size=output_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,    # feature dimension of each time step
            hidden_size=hidden_size,  # number of hidden units (RNN output dim)
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, X, hidden_prev=None):
        """Run the RNN over a window and map the final state to a price.

        Args:
            X: Input of shape ``(batch, time_step, input_size)``.
            hidden_prev: Optional initial hidden state of shape
                ``(1, batch, hidden_size)``. When omitted, ``nn.RNN`` starts
                from zeros, which also allows batch sizes other than 1.

        Returns:
            ``(out, ht)``: ``out`` holds the RNN output at every time step,
            shape ``(batch, time_step, hidden_size)``; ``ht`` is the
            prediction computed from the final hidden state, shape
            ``(batch, output_size)``.
        """
        out, ht = self.rnn(X, hidden_prev)
        # ht is (num_layers=1, batch, hidden_size); take the only layer rather
        # than reshaping with the module-level hidden_size.
        ht = ht[-1]
        ht = self.linear(ht)
        return out, ht
# Hyper-parameters and training loop.
# Keep the model and every batch on the same device as the initial hidden state.
device = hidden_prev.device
model = Net().to(device)
criterion = nn.MSELoss()
learning_rate, epochs = 0.01, 500
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Start from +inf so the first validation epoch always produces a checkpoint.
best_loss = float('inf')
for epoch in range(epochs):
    for phase in ['train', 'val']:
        is_train = phase == 'train'
        model.train(is_train)  # train(False) is the same as eval()
        losses = []
        # Build the autograd graph only while training; validation needs no
        # gradients.
        with torch.set_grad_enabled(is_train):
            for X, y in dataloader[phase]:
                X = X.to(device)
                y = y.to(device)
                _, yy = model(X, hidden_prev)
                # MSELoss takes (prediction, target).
                loss = criterion(yy, y)
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                losses.append(loss.item())
        epoch_loss = sum(losses) / len(losses)
        # Keep the parameters with the lowest validation loss.
        if phase == 'val' and epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), "model.pth")
        # Report progress every 50 epochs.
        if epoch % 50 == 0:
            print("epoch:{},{} loss:{:.8f}".format(epoch+1, phase, epoch_loss))
            if phase == 'val':
                print("the best loss of valuation is:{:.8f}".format(best_loss))
                print('*'*50)
# Load the best checkpoint and plot predictions against the true prices.
device = hidden_prev.device
# map_location lets a checkpoint saved on the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()
Val_y, Val_predict = [], []
# Maximum closing price of the validation split, used to undo the
# normalisation.
Val_max_price = pd.read_csv('./val.csv').loc[:, 'close'].max()
# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for X, y in dataloader['val']:
        X = X.to(device)
        _, predict = model(X, hidden_prev)
        # .item() turns the 0-dim tensors into plain floats for plotting.
        Val_y.append(y[0][0].item() * Val_max_price)
        Val_predict.append(predict[0][0].item() * Val_max_price)
fig = plt.figure(figsize=(8, 5), dpi=80)
# Red dashed line: true price; green line: predicted price.
plt.plot(Val_y, linestyle='--', color='r')
plt.plot(Val_predict, color='g')
plt.title('stock price')
plt.xlabel('time')
plt.ylabel('price')
plt.show()
流程分析
1.导包
import pandas as pd #读取数据
import torch
from torch import nn #继承nn.Module构建rnn网络
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader #构建迭代器
import matplotlib.pyplot as plt #绘图
2.读入数据并做预处理
1. 查看训练集前五行数据样式
# Load the training CSV and print its first five rows.
data = pd.read_csv('./train.csv')
print(data.head(5))
date open high low close volume
0 2016-01-04 30.57 30.57 28.63 28.78 70997200
1 2016-01-05 28.41 29.54 28.23 29.23 87498504
2 2016-01-06 29.03 29.39 28.73 29.26 48012112
3 2016-01-07 28.73 29.25 27.73 28.50 23647604
4 2016-01-08 28.73 29.18 27.63 28.67 98239664
2. 提取出数据中的闭市价格作为股价,并进行归一化处理以便于后续模型加速收敛,可视化数据
# Use the closing price column as the stock price.
price = data['close']
# Normalise by dividing by the largest closing price.
price_norm = price / max(price)
# Visualise the normalised series.
fig = plt.figure(figsize=(8, 5))
plt.plot(price_norm)
plt.xlabel('time')
plt.ylabel('price')
plt.title('stock price')
plt.show()
3. 构建时间序列数据
通过etract_data()函数用长度为8的滑动窗口切分数据:每个样本用连续的前8个数据预测第9个数据,故时间步长time_step=8
def etract_data(data, time_step):
    """Turn a 1-D series into sliding-window samples for next-step prediction.

    Args:
        data: 1-D sequence of numbers (list, NumPy array or pandas Series).
            Values are read in positional order, so a pandas index is
            ignored.
        time_step: Number of consecutive values in each input window.

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(len(data) - time_step, time_step, 1)`` and ``y`` has shape
        ``(len(data) - time_step, 1)``; ``y[i]`` is the value that follows
        window ``X[i]``. Both have zero rows when ``data`` holds no more
        than ``time_step`` values.

    Raises:
        ValueError: If ``time_step`` is smaller than 1.
    """
    if time_step < 1:
        raise ValueError(
            "time_step must be at least 1, got {}".format(time_step))
    # list() walks the values positionally; indexing a pandas Series with
    # ``data[i + time_step]`` is label-based and only works with a default
    # RangeIndex.
    values = torch.tensor(list(data))
    n_samples = len(values) - time_step
    if n_samples <= 0:
        return torch.empty(0, time_step, 1), torch.empty(0, 1)
    # unfold() yields every window of length ``time_step``; the last one has
    # no following value to predict, so it is dropped.
    X = values.unfold(0, time_step, 1)[:n_samples].contiguous()
    X = X.reshape(-1, time_step, 1)
    y = values[time_step:].reshape(-1, 1)
    return X, y
# Build the windows with a time step of 8 and show the shape and first
# element of the inputs and of the targets.
time_step = 8
X, y = etract_data(price_norm, time_step)
for sample in (X, y):
    print(sample.shape, '\n', sample[0])
X,y格式如下:
torch.Size([517, 8, 1])
tensor([[0.3833],
[0.3893],
[0.3897],
[0.3795],
[0.3818],
[0.3564],
[0.3588],
[0.3558]])
torch.Size([517, 1])
tensor([0.3558])
X有517组数据,每组数据有8个信息,每个信息为1个数(股价)
y对应有517组数据,每组数据为第9个信息的股价
4.构建迭代器
# Wrap the tensors in a dataset and iterate over it one shuffled sample at a
# time.
batch_size = 1
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
5.关键步骤整合
由于训练集和验证集在数据预处理部分的步骤几乎相同,不妨将上述关键步骤整合,通过构建字典简化代码
def process_data(type, time_step=8, batch_size=1):
    """Build a DataLoader of sliding-window samples from a price CSV.

    Reads ``./train.csv`` or ``./val.csv``, takes the ``close`` column as the
    stock price, divides it by its own maximum, and converts it into samples
    where ``time_step`` consecutive prices are the input and the following
    price is the target.

    Args:
        type: Which split to load, ``'train'`` or ``'val'``. The name shadows
            the builtin but is kept so existing keyword callers still work.
        time_step: Number of consecutive prices in each input window
            (default 8: the first 8 prices predict the 9th).
        batch_size: Number of samples per batch (default 1).

    Returns:
        A ``DataLoader`` yielding ``(X, y)`` batches with ``X`` of shape
        ``(batch, time_step, 1)`` and ``y`` of shape ``(batch, 1)``, both
        float32. The training split is shuffled; the validation split keeps
        its original order.

    Raises:
        ValueError: If ``type`` is not ``'train'`` or ``'val'``, or if the
            CSV has too few rows to form a single sample.
    """
    csv_paths = {'train': './train.csv', 'val': './val.csv'}
    if type not in csv_paths:
        # An unknown split used to fall through both branches and fail later
        # with an unrelated "data is not defined" error.
        raise ValueError(
            "type must be 'train' or 'val', got {!r}".format(type))
    data = pd.read_csv(csv_paths[type])

    # The closing price is used as the stock price.
    price = data.loc[:, 'close']
    # Normalise to speed up convergence.
    # NOTE: each split is divided by its OWN maximum, so predictions have to
    # be de-normalised with the maximum of the same split.
    price_norm = price / price.max()

    def etract_data(series, time_step):
        """Slice a pandas Series into (window, next value) pairs."""
        # Work on a positional tensor: ``series[i + time_step]`` is a
        # label-based lookup and only works with a default RangeIndex.
        values = torch.tensor(series.to_numpy(), dtype=torch.float32)
        n_samples = len(values) - time_step
        if n_samples < 1:
            raise ValueError(
                "need more than {} rows to build a sample, got {}".format(
                    time_step, len(values)))
        # unfold() yields every window of length ``time_step``; the last one
        # has no following value to predict, so it is dropped.
        X = values.unfold(0, time_step, 1)[:n_samples].contiguous()
        X = X.reshape(-1, time_step, 1)
        y = values[time_step:].reshape(-1, 1)
        return X, y

    X, y = etract_data(price_norm, time_step)
    dataset = TensorDataset(X, y)
    # Shuffle only the training split; validation stays in file order.
    shuffle = type == 'train'
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
# One DataLoader per phase, keyed by the phase name used in the training loop.
dataloader = {phase: process_data(phase) for phase in ('train', 'val')}
3.构建单隐藏层Rnn模型
# Network design: a single-hidden-layer RNN followed by a linear read-out.
input_size, hidden_size, output_size = 1, 20, 1
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Initial hidden state of the RNN: (num_layers=1, batch_size=1, hidden_size).
hidden_prev = torch.zeros(1, 1, hidden_size, device=device)


class Net(nn.Module):
    """Single-layer RNN that predicts the next price from a price window.

    Args:
        input_size: Features per time step (1: the price itself).
        hidden_size: Number of hidden units of the RNN.
        output_size: Size of the prediction (1: the next price).
    """

    def __init__(self, input_size=input_size, hidden_size=hidden_size,
                 output_size=output_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,    # feature dimension of each time step
            hidden_size=hidden_size,  # number of hidden units (RNN output dim)
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, X, hidden_prev=None):
        """Run the RNN over a window and map the final state to a price.

        Args:
            X: Input of shape ``(batch, time_step, input_size)``.
            hidden_prev: Optional initial hidden state of shape
                ``(1, batch, hidden_size)``. When omitted, ``nn.RNN`` starts
                from zeros, which also allows batch sizes other than 1.

        Returns:
            ``(out, ht)``: ``out`` holds the RNN output at every time step,
            shape ``(batch, time_step, hidden_size)``; ``ht`` is the
            prediction computed from the final hidden state, shape
            ``(batch, output_size)``.
        """
        out, ht = self.rnn(X, hidden_prev)
        # ht is (num_layers=1, batch, hidden_size); take the only layer rather
        # than reshaping with the module-level hidden_size.
        ht = ht[-1]
        ht = self.linear(ht)
        return out, ht
4.设计超参数,训练模型
# Hyper-parameters and training loop.
# Keep the model and every batch on the same device as the initial hidden state.
device = hidden_prev.device
model = Net().to(device)
criterion = nn.MSELoss()
learning_rate, epochs = 0.01, 500
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Start from +inf so the first validation epoch always produces a checkpoint.
best_loss = float('inf')
for epoch in range(epochs):
    for phase in ['train', 'val']:
        is_train = phase == 'train'
        model.train(is_train)  # train(False) is the same as eval()
        losses = []
        # Build the autograd graph only while training; validation needs no
        # gradients.
        with torch.set_grad_enabled(is_train):
            for X, y in dataloader[phase]:
                X = X.to(device)
                y = y.to(device)
                _, yy = model(X, hidden_prev)
                # MSELoss takes (prediction, target).
                loss = criterion(yy, y)
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                losses.append(loss.item())
        epoch_loss = sum(losses) / len(losses)
        # Keep the parameters with the lowest validation loss.
        if phase == 'val' and epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), "model.pth")
        # Report progress every 50 epochs.
        if epoch % 50 == 0:
            print("epoch:{},{} loss:{:.8f}".format(epoch+1, phase, epoch_loss))
            if phase == 'val':
                print("the best loss of valuation is:{:.8f}".format(best_loss))
                print('*'*50)
下面为训练过程中打印的loss变化值(每50个epoch输出一次):
epoch:1,train loss:0.01066735
epoch:1,val loss:0.00090897
the best loss of valuation is:0.00090897
**************************************************
epoch:51,train loss:0.00064672
epoch:51,val loss:0.00165981
the best loss of valuation is:0.00031547
**************************************************
epoch:101,train loss:0.00102666
epoch:101,val loss:0.00038310
the best loss of valuation is:0.00031547
**************************************************
epoch:151,train loss:0.00086190
epoch:151,val loss:0.00419089
the best loss of valuation is:0.00030221
**************************************************
epoch:201,train loss:0.00153353
epoch:201,val loss:0.00049194
the best loss of valuation is:0.00030221
**************************************************
epoch:251,train loss:0.00159085
epoch:251,val loss:0.00057046
the best loss of valuation is:0.00030221
**************************************************
epoch:301,train loss:0.00191545
epoch:301,val loss:0.00145204
the best loss of valuation is:0.00030221
**************************************************
epoch:351,train loss:0.00159414
epoch:351,val loss:0.00127127
the best loss of valuation is:0.00030221
**************************************************
epoch:401,train loss:0.00166080
epoch:401,val loss:0.00314704
the best loss of valuation is:0.00030221
**************************************************
epoch:451,train loss:0.00135861
epoch:451,val loss:0.00068874
the best loss of valuation is:0.00030221
**************************************************
5.加载模型,绘图查看模型效果
# Load the best checkpoint and plot predictions against the true prices.
device = hidden_prev.device
# map_location lets a checkpoint saved on the GPU load on a CPU-only machine.
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()
Val_y, Val_predict = [], []
# Maximum closing price of the validation split, used to undo the
# normalisation.
Val_max_price = pd.read_csv('./val.csv').loc[:, 'close'].max()
# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for X, y in dataloader['val']:
        X = X.to(device)
        _, predict = model(X, hidden_prev)
        # .item() turns the 0-dim tensors into plain floats for plotting.
        Val_y.append(y[0][0].item() * Val_max_price)
        Val_predict.append(predict[0][0].item() * Val_max_price)
fig = plt.figure(figsize=(8, 5), dpi=80)
# Red dashed line: true price; green line: predicted price.
plt.plot(Val_y, linestyle='--', color='r')
plt.plot(Val_predict, color='g')
plt.title('stock price')
plt.xlabel('time')
plt.ylabel('price')
plt.show()
下图为验证集股价预测值(绿线)与真实值(红线)折线图: