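# Comparative analysis of the accuracy of LSTM and ARIMA models in forecasting
# Chinese stock indices (LSTM模型和ARIMA模型预测中国股票指数准确性的比较分析)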

#!/usr/bin/env python
# coding: utf-8

# In[3]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

###################### 1. Load the data
# NOTE(review): absolute, machine-specific Windows path -- the script only runs
# where exactly this file exists.
filepath = 'C:\\Users\\14308\\Desktop\\gongshang.csv'
data = pd.read_csv(filepath)
# Sorting by date (to guarantee an increasing time series) is disabled, so the
# rows are used in file order -- presumably the CSV is already chronological,
# TODO confirm.
#data = data.sort_values('Date')
# Print the first few rows
print(data.head())
# Print the (rows, columns) dimensions
print(data.shape)


# In[4]:


###################### 2. Visualise the closing price (Close)
# Set the canvas size
plt.figure(figsize=(15, 9))
plt.plot(data[['Close']])
# Put a tick at every 50th row position and label it with that row's Date.
plt.xticks(range(0, data.shape[0], 50), data['Date'].loc[::50], rotation=45,fontsize=20)
plt.yticks(fontsize=20)
plt.title("SH300 Index ClosePrice", fontsize=20, fontweight='bold')
plt.xlabel('Date', fontsize=20)
plt.ylabel('Close Price', fontsize=20)
# Saving the figure to disk is disabled.
#plt.savefig('StockPrice.jpg')
plt.show()


# In[5]:


###################### 3. Feature engineering
# Use Close as the only feature. Take an explicit copy: the column assignment
# below then modifies an independent frame rather than a selection of `data`,
# which avoids pandas' SettingWithCopy / chained-assignment warning.
price = data[['Close']].copy()
# Print summary information and the raw values
print(price.info())
print(price)

# Min-max scale the closing price into the range [-1, 1].
# NOTE(review): the scaler is fitted on the whole series, i.e. before the
# train/test split, so test-period extremes influence the scaling.
scaler = MinMaxScaler(feature_range=(-1, 1))
# fit_transform returns an (n, 1) array; flatten it to match the column.
price['Close'] = scaler.fit_transform(price['Close'].values.reshape(-1, 1)).ravel()
print(price['Close'].shape)
# Bare expression: displays the frame in a notebook, no effect in a script.
price


# In[6]:


######################4. 数据集制作
# 今天的收盘价预测明天的收盘价
# lookback表示观察的跨度
def split_data(stock, lookback, test_ratio=0.2):
    """Cut a table into sliding windows and split them into train/test sets.

    Every window holds ``lookback`` consecutive rows of ``stock``. Within a
    window the first ``lookback - 1`` rows are the model input and the last
    row is the target. Windows keep their original order; the leading share
    becomes the training set and the trailing ``test_ratio`` share the test
    set. The very last row of ``stock`` is not part of any window.

    Args:
        stock: Two-dimensional table exposing ``to_numpy()`` (for example a
            pandas DataFrame), one row per time step.
        lookback: Window length in rows (input rows plus the one target row).
        test_ratio: Fraction of the windows reserved for testing. Defaults to
            0.2, the value that used to be hard-coded.

    Returns:
        ``[x_train, y_train, x_test, y_test]``. The ``x`` arrays have shape
        ``(n, lookback - 1, n_columns)``, the ``y`` arrays ``(n, n_columns)``.

    Raises:
        ValueError: If ``stock`` has no more than ``lookback`` rows, so that
            not a single window can be built.
    """
    data_raw = stock.to_numpy()

    n_windows = len(data_raw) - lookback
    if n_windows <= 0:
        raise ValueError(
            "stock needs more than lookback={} rows, got {}".format(
                lookback, len(data_raw)
            )
        )

    # Stack the overlapping windows: (n_windows, lookback, n_columns).
    data = np.array(
        [data_raw[start:start + lookback] for start in range(n_windows)]
    )
    print(type(data))

    # Ordered split: the leading windows train, the trailing ones test.
    test_set_size = int(np.round(test_ratio * data.shape[0]))
    train_set_size = data.shape[0] - test_set_size

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :]

    x_test = data[train_set_size:, :-1, :]
    y_test = data[train_set_size:, -1, :]

    return [x_train, y_train, x_test, y_test]

# Window length in rows passed to split_data.
lookback = 20
# Build the windows from the scaled price frame and split them into train/test.
x_train, y_train, x_test, y_test = split_data(price, lookback)
# Report the resulting array shapes.
print('x_train.shape = ', x_train.shape)
print('y_train.shape = ', y_train.shape)
print('x_test.shape = ', x_test.shape)
print('y_test.shape = ', y_test.shape)


# In[7]:


######################5. 模型构建
import torch
import torch.nn as nn

# Convert the NumPy inputs to torch tensors; the names x_train / x_test are
# rebound from arrays to tensors here.
x_train = torch.from_numpy(x_train).type(torch.Tensor)
x_test = torch.from_numpy(x_test).type(torch.Tensor)
# Ground-truth targets
y_train_lstm = torch.from_numpy(y_train).type(torch.Tensor)
y_test_lstm = torch.from_numpy(y_test).type(torch.Tensor)
# NOTE(review): the *_gru copies are not used anywhere in the visible script.
y_train_gru = torch.from_numpy(y_train).type(torch.Tensor)
y_test_gru = torch.from_numpy(y_test).type(torch.Tensor)

# Input dimension is 1: only the Close price
input_dim = 1
# Size of the hidden-state feature vector
hidden_dim = 32
# Number of stacked recurrent layers
num_layers = 1
# One output value: the predicted close
output_dim = 1
# Number of training epochs
num_epochs = 100


class LSTM(nn.Module):
    """LSTM regressor: an ``nn.LSTM`` followed by a linear read-out.

    Only the output of the final time step is passed to the linear layer, so
    the module maps one input sequence to one output vector.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # batch_first=True: inputs are laid out as (batch, seq_len, input_dim).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the prediction for each sequence in the batch.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # Zero initial hidden/cell states, created with the input's dtype and
        # device so the module also works on GPU or in float64. They are plain
        # constants (no gradient), which is what the former
        # requires_grad_()/detach() pair amounted to.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # Read out from the last time step only.
        return self.fc(out[:, -1, :])



# Instantiate the network with the hyperparameters defined above.
model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
# Mean-squared-error loss, optimised with Adam at learning rate 0.01.
criterion = torch.nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)


# In[8]:


######################6.模型训练
import time

# Per-epoch training loss, filled in by the loop below.
hist = np.zeros(num_epochs)
start_time = time.time()
# Collects [train RMSE, test RMSE, training time]; filled in a later cell.
lstm = []

# Full-batch training: each epoch is one forward/backward pass over the whole
# training set.
for t in range(num_epochs):
    y_train_pred = model(x_train)

    loss = criterion(y_train_pred, y_train_lstm)
    print("Epoch ", t, "MSE: ", loss.item())
    hist[t] = loss.item()

    # Clear old gradients, back-propagate, then update the weights.
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

training_time = time.time() - start_time
print("Training time: {}".format(training_time))

# y_train_pred holds the output of the last forward pass, i.e. it was computed
# before the final optimiser step. Undo the scaling to get price units.
predict = pd.DataFrame(scaler.inverse_transform(y_train_pred.detach().numpy()))
print(predict)  # predicted values
original = pd.DataFrame(scaler.inverse_transform(y_train_lstm.detach().numpy()))
print(original)  # actual values


# In[9]:


import seaborn as sns
sns.set_style("darkgrid")

fig = plt.figure()
fig.subplots_adjust(hspace=0.2, wspace=0.2)

# Left panel: actual vs. predicted training values (price units).
plt.subplot(1, 2, 1)
ax = sns.lineplot(x = original.index, y = original[0], label="Data", color='royalblue')
ax = sns.lineplot(x = predict.index, y = predict[0], label="Training Prediction (LSTM)", color='tomato')
# Debug output: the prediction index, a separator string and the values.
print(predict.index)
print("aaaa")
print(predict[0])


ax.set_title('Stock price', size = 14, fontweight='bold')
ax.set_xlabel("Days", size = 14)
ax.set_ylabel("Cost (USD)", size = 14)
# Passes an empty label sequence -- presumably to blank the x tick labels.
ax.set_xticklabels('', size=10)


# Right panel: training loss per epoch.
plt.subplot(1, 2, 2)
ax = sns.lineplot(data=hist, color='royalblue')
ax.set_xlabel("Epoch", size = 14)
ax.set_ylabel("Loss", size = 14)
ax.set_title("Training Loss", size = 14, fontweight='bold')
fig.set_figheight(6)
fig.set_figwidth(16)
plt.show()


# In[10]:


import math, time
from sklearn.metrics import mean_squared_error

# make predictions
# Predict on the held-out windows; inference needs no autograd graph.
with torch.no_grad():
    y_test_pred = model(x_test)

# Undo the scaling so that all values are in price units again.
y_train_pred = scaler.inverse_transform(y_train_pred.detach().numpy())
y_train = scaler.inverse_transform(y_train_lstm.detach().numpy())
y_test_pred = scaler.inverse_transform(y_test_pred.detach().numpy())
y_test = scaler.inverse_transform(y_test_lstm.detach().numpy())

# Root mean squared error on the train and test sets
trainScore = math.sqrt(mean_squared_error(y_train[:,0], y_train_pred[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(y_test[:,0], y_test_pred[:,0]))
print('Test Score: %.2f RMSE' % (testScore))
lstm.append(trainScore)
lstm.append(testScore)
lstm.append(training_time)

# Place the predictions on the full-length time axis for plotting; rows
# without a prediction stay NaN.
# Window i built by split_data covers rows i .. i+lookback-1 and its target is
# the last of them, so prediction i belongs at row i + lookback - 1. The train
# slice used to start at `lookback`, one row too late and overlapping the
# first test row.
trainPredictPlot = np.full(price.shape, np.nan)
trainPredictPlot[lookback-1:len(y_train_pred)+lookback-1, :] = y_train_pred

# The test predictions continue directly after the train predictions.
testPredictPlot = np.full(price.shape, np.nan)
testPredictPlot[len(y_train_pred)+lookback-1:len(price)-1, :] = y_test_pred

# Actual closing prices in price units
original = scaler.inverse_transform(price['Close'].values.reshape(-1,1))

# Columns: 0 = train prediction, 1 = test prediction, 2 = actual value
predictions = np.append(trainPredictPlot, testPredictPlot, axis=1)
predictions = np.append(predictions, original, axis=1)
result = pd.DataFrame(predictions)

import plotly.express as px
import plotly.graph_objects as go

# Interactive figure with one line per column of `result`.
fig = go.Figure()
fig.add_trace(go.Scatter(x=result.index, y=result[0],
                    mode='lines',
                    name='Train prediction'))
fig.add_trace(go.Scatter(x=result.index, y=result[1],
                    mode='lines',
                    name='Test prediction'))
fig.add_trace(go.Scatter(x=result.index, y=result[2],
                    mode='lines',
                    name='Actual Value'))
fig.update_layout(
    xaxis=dict(
        showline=True,
        showgrid=True,
        showticklabels=False,
        linecolor='black',
        linewidth=2
    ),
    yaxis=dict(
        # Axis title given as title=dict(text=..., font=...): the older
        # `titlefont` attribute is deprecated and no longer accepted by
        # current plotly releases.
        title=dict(
            text='Close (USD)',
            font=dict(
                family='Rockwell',
                size=12,
                color='black',
            ),
        ),
        showline=True,
        showgrid=True,
        showticklabels=True,
        linecolor='black',
        linewidth=2,
        ticks='outside',
        tickfont=dict(
            family='Rockwell',
            size=12,
            color='black',
        ),
    ),
    showlegend=True,
    template = 'simple_white'

)



# Figure heading, drawn as an annotation above the top-left of the plot area.
annotations = []
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Results (LSTM)',
                              font=dict(family='Rockwell',
                                        size=26,
                                        color='black'),
                              showarrow=False))
fig.update_layout(annotations=annotations)

fig.show()


# Note: the LSTM results can differ from one training run to the next, so the
# scores are not stable. Example output of one run:
#     Train Score: 62.85 RMSE
#     Test Score: 52.77 RMSE

# ARIMA section

# In[12]:


from statsmodels.tsa.stattools import arma_order_select_ic
# Reload the same CSV, this time with the Date column as the row index.
data2 = pd.read_csv('C:\\Users\\14308\\Desktop\\gongshang.csv',index_col = 'Date')


# In[13]:


# Augmented Dickey-Fuller (ADF) test on the undifferenced series
from statsmodels.tsa.stattools import adfuller
adf = adfuller(data2)
# Bare expression: displays the test result in a notebook, no effect in a script.
adf


# In[14]:


# First-difference the series. diff() leaves NaN in the first row; back-fill it
# with the following value so the length stays unchanged.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases.
Ddata2 = data2.diff().bfill()
# ADF test on the differenced series
adf2 = adfuller(Ddata2)
# Bare expression: displays the test result in a notebook, no effect in a script.
adf2


# In[15]:


# Compare ARMA orders up to AR 4 / MA 4 on the differenced series by AIC and
# BIC. The result is not stored; it is only displayed in a notebook.
arma_order_select_ic(Ddata2, ic=['aic', 'bic'], max_ar=4, max_ma=4)


# In[27]:


import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Reload the series; rows are indexed by the Date column.
data2_ = pd.read_csv('C:\\Users\\14308\\Desktop\\gongshang.csv',index_col = 'Date')
# In-sample window: at most the first 782 rows.
data2 = data2_.iloc[:782,:]
# First difference. diff() leaves NaN in the first row; back-fill it so the
# length stays unchanged. DataFrame.bfill() replaces fillna(method='bfill'),
# whose `method` argument is deprecated in recent pandas releases.
Ddata2 = data2.diff().bfill()
# NOTE(review): the input is already differenced and the order also has d=1,
# so the series is differenced twice overall -- confirm this is intended.
# `model` and `result` are rebound here (they held the LSTM objects before).
model = sm.tsa.arima.ARIMA(Ddata2, order=(1,1,2))
result = model.fit()
result.summary() # model coefficients etc., three decimals; summary2 keeps four
# In-sample prediction for every row of the window. The end index follows the
# data length (it was hard-coded to 440), so the prediction always has as many
# rows as data2, which the plot and the error computation below require.
predict = pd.DataFrame(result.predict(start=0 ,end=len(Ddata2) - 1))
print(predict)

# Rebuild price levels: cumulative sum of the predicted differences, anchored
# at the first observed value.
# NOTE(review): prediction errors of successive rows accumulate in this sum.
predict_origin = predict.cumsum()
predict_origin.iloc[:,:] = predict_origin.iloc[:,:]+data2.iloc[0,0]
print(predict_origin)

# Predicted vs. actual series, with a major x tick every 200 positions.
ticker_spacing = 200
fig,ax = plt.subplots(1,1)
ax.xaxis.set_major_locator(ticker.MultipleLocator(ticker_spacing ))
ax.plot(data2.index,predict_origin.values,label='predict',c = 'r')
ax.plot(data2.index,data2.values,label='real',c = 'b')
plt.xlabel('Date')
plt.ylabel('ClosePrice',rotation=90)
ax.legend()
plt.show()

import math
error = predict_origin.values - data2.values  # the first in-sample prediction is anomalous
print(error[:10])
# Root mean squared error; the printed label now says RMSE (it said MSE).
RMSE_arma = math.sqrt((error ** 2).mean())
print('arma 80% 样本内预测的均方根误差是:',RMSE_arma)

# Ljung-Box test on the reconstruction errors, up to lag 10.
from statsmodels.stats.diagnostic import acorr_ljungbox as lb_test
white_test = lb_test(error, lags=10)
print(white_test)


# In[26]:


###########ARMA模型预测对比
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import arch
import statsmodels.api as sm
# Reload the series; rows are indexed by the Date column.
data2 = pd.read_csv('C:\\Users\\14308\\Desktop\\gongshang.csv',index_col = 'Date')
# First difference. diff() leaves NaN in the first row; back-fill it so the
# length stays unchanged. DataFrame.bfill() replaces fillna(method='bfill'),
# whose `method` argument is deprecated in recent pandas releases.
Ddata2 = data2.diff().bfill()
X = Ddata2.values
# Fixed split position; the proportional 80% split is disabled.
#size = int(len(X) * 0.8)
size = 320

train, test = X[0:size], X[size:len(X)]
history = list(train)
predictions = list()

# Walk-forward evaluation: refit on everything seen so far, forecast one step
# ahead, then append the true observation to the history.
for obs in test:
    model = sm.tsa.arima.ARIMA(history, order=(0,1,0))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    history.append(obs)
    #print('predicted=%f, expected=%f' % (yhat, obs))


index_use = data2[size:]
# Rebuild price levels from the forecast differences.
predict = pd.DataFrame(predictions)
predict_origin = predict.cumsum()
# predictions[0] estimates close[size] - close[size-1], so the cumulative sum
# has to be anchored at close[size-1]. It was anchored at size-2, one row too
# early.
# NOTE(review): forecast errors still accumulate in the cumulative sum; adding
# each forecast to the previous actual close would be the per-step alternative.
predict_origin.iloc[:,:] = predict_origin.iloc[:,:]+data2.iloc[size-1,0]
print(predict_origin.shape)
print(data2.values[size:].shape)

ticker_spacing = 30
# Create the figure that is actually drawn on with the requested size; the
# separate plt.figure(figsize=...) call only produced an extra, empty figure.
fig,ax = plt.subplots(1,1, figsize=(30, 20))
ax.xaxis.set_major_locator(ticker.MultipleLocator(ticker_spacing ))
ax.plot(index_use.index,predict_origin.values,label='predicted')
ax.plot(index_use.index,data2.values[size:],label='real')
plt.xlabel('Date')
plt.ylabel('ClosePrice',rotation=90)
ax.legend()
plt.show()

import math
# Root mean squared error of the reconstructed out-of-sample prices
error = predict_origin.values - data2.iloc[size:,:].values
print(error[:10])
RMSE_arma = math.sqrt((error ** 2).mean())
print('arma 20% 样本外滚动-均方根误差是:',RMSE_arma)
# Rows around the split position, for checking the anchor by eye.
print(data2.iloc[size-2:size+2,:])


# In[ ]:


# Debug output: the first value and the values around the split position.
print(data2.iloc[0,0])
print(data2.iloc[size-2,0])
print(data2.iloc[size-1,0])
print(data2.iloc[size,0])


# In[ ]:


from statsmodels.tsa.stattools import arma_order_select_ic
# Load a second data file, with the Date column as the row index.
data3 = pd.read_csv("C:\\Users\\14308\\Desktop\\投资学(全英文)\\IDX_Idxtrd - 副本.csv",index_col = 'Date')
# Augmented Dickey-Fuller (ADF) test
from statsmodels.tsa.stattools import adfuller
# Keep at most the first 778 rows.
data3 = data3.iloc[:778,:]
adf = adfuller(data3)
# Bare expression: displays the test result in a notebook, no effect in a script.
adf


# 不做差分的处理:

# In[ ]:


# Compare ARMA orders up to AR 4 / MA 4 on the undifferenced series by AIC and
# BIC. The result is not stored; it is only displayed in a notebook.
arma_order_select_ic(data3, ic=['aic', 'bic'], max_ar=4, max_ma=4)


# In[ ]:


import statsmodels.api as sm

# AR(1) model fitted directly on the undifferenced series.
model = sm.tsa.arima.ARIMA(data3, order=(1,0,0))
result = model.fit()
result.summary() # model coefficients etc., three decimals; summary2 keeps four
# In-sample prediction for every row. The end index follows the data length
# (it was hard-coded to 777), so the prediction always has as many rows as
# data3, which the plot and the error computation below require.
predict = pd.DataFrame(result.predict(start=0 ,end=len(data3) - 1))
# Overwrite the first prediction with the first observed value.
predict.iloc[0,0] = data3.iloc[0,0]
print(predict)

# Predicted vs. actual series, with a major x tick every 200 positions.
# `ticker` and `math` are the modules imported by earlier cells.
ticker_spacing = 200
fig,ax = plt.subplots(1,1)
ax.xaxis.set_major_locator(ticker.MultipleLocator(ticker_spacing ))
ax.plot(data3.index,predict.values,label='predict',c = 'r')
ax.plot(data3.index,data3.values,label='real',c = 'b')
plt.xlabel('Date')
plt.ylabel('ClosePrice',rotation=90)
ax.legend()
plt.show()

error = predict.values - data3.values  # the first in-sample prediction is anomalous
print(error[:10])
# Root mean squared error; the printed label now says RMSE (it said MSE).
RMSE_arma = math.sqrt((error ** 2).mean())
print('arma 80% 样本内预测的均方根误差是:',RMSE_arma)


# In[ ]:





# In[ ]:




# (Web-page residue removed: like/favourite/comment counters, a feedback
# survey and payment-dialog text that were not part of the script.)