利用LSTM对股票的收盘价进行回归预测

基于机器学习的股票分析

三、利用神经网络对股票的日收盘价进行回归预测
import tushare as ts
import pandas as pd 
import numpy as np
from datetime import datetime
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from numpy import concatenate
from sklearn.metrics import mean_squared_error
from math import sqrt


#定义一个时间日期转换格式函数
def parse(x):
    """Convert a 'YYYY MM DD HH' string (space separated) to a datetime."""
    date_format = '%Y %m %d %H'
    parsed = datetime.strptime(x, date_format)
    return parsed
In [3]:
# Enlarge the dataset to cover 2015 through 2017. This is still too little
# data (it should really go back to the 1990s); it is only a demonstration.
# The result is sorted by its index right after download.
df_CB = ts.get_hist_data(
    '601988', start='2015-01-01', end='2017-12-01'
).sort_index()
In [4]:
# Optional: save to disk and reload (kept disabled, as in the original).
# df_CB.to_csv('G:\\Project\\data\\CB.csv', sep=',' ,index=True)
# df_CB_path = 'G:\\Project\\data\\CB.csv'
# df_CB = pd.read_csv(df_CB_path)

# Collect the column labels.
cols = list(df_CB)
# Move the closing price to the first column.
cols.insert(0, cols.pop(cols.index('close')))
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
# label-based replacement and selects the same columns in the same order.
df_CB = df_CB.loc[:, cols]
# Save to the local file CB_close3 (disabled).
#df_CB.to_csv('G:\\Project\\data\\CB_close3.csv', encoding = "utf-8",sep=',', header=True, index=True)
#绘图查看
from pandas import read_csv
from matplotlib import pyplot

# Load the saved table; the first CSV column is used as the index.
dataset = read_csv('G:\\Project\\data\\CB_close3.csv', header=0, index_col=0)
values = dataset.values

# Column positions to plot; per the original note these are
# close, open, high, low, volume, ma5.
groups = [0, 1, 2, 3, 4, 7]

pyplot.figure()
# One stacked subplot per selected column, titled with the column name.
for i, group in enumerate(groups, start=1):
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()

(图:close、open、high、low、volume、ma5 各列随时间变化的曲线)

# 把时间序列转换为有监督学习问题,转换数据集的函数如下
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table.

    Each output row holds the lagged observations (t-n_in ... t-1) followed
    by the current and future observations (t ... t+n_out-1).

    Args:
        data: Sequence of observations; anything ``DataFrame`` accepts
            (1-D list, list of rows, 1-D or 2-D array, DataFrame).
        n_in: Number of lag steps used as input columns.
        n_out: Number of steps (starting at t) used as output columns.
        dropnan: When true, drop rows containing NaN introduced by shifting.

    Returns:
        A DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
        and ``var<j>(t+<i>)``.
    """
    df = DataFrame(data)
    # Take the variable count from the constructed frame rather than from the
    # input's type: a list of multi-column rows or a 1-D array would otherwise
    # get the wrong count (or fail on ``data.shape[1]``).
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence (t-n, ... t-1).
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # Forecast sequence (t, t+1, ... t+n).
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))
    # Put everything side by side.
    agg = concat(cols, axis=1)
    agg.columns = names
    # Drop the rows that shifting left incomplete.
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# Load the raw time-series data; the first CSV column is used as the index.
dataset = read_csv('G:\\Project\\data\\CB_close3.csv',
                   header=0, index_col=0)
values = dataset.values

# Scale every column into the range [0, 1].
# NOTE(review): the scaler is fitted on the full table, before the
# train/test split, so test-set statistics leak into the scaling.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# Convert to a supervised-learning layout: one lag step in, one step out.
reframed = series_to_supervised(scaled, 1, 1)

# Only the first variable at time t (close, per the original note) is
# predicted, so keep the lagged columns plus var1(t) and drop every other
# time-t column. The cut-off is derived from the feature count instead of
# hard-coded positions 15..27, which were only right for 14 features.
n_features = scaled.shape[1]
reframed.drop(reframed.columns[n_features + 1:], axis=1, inplace=True)
print(reframed.head())

var1(t-1) var2(t-1) var3(t-1) var4(t-1) var5(t-1) var6(t-1) \
1 0.522267 0.433735 0.480427 0.516432 0.447571 0.777778
2 0.578947 0.514056 0.565836 0.563380 0.448413 0.646465
3 0.570850 0.546185 0.530249 0.638498 0.297727 0.484848
4 0.485830 0.582329 0.505338 0.577465 0.286033 0.292929
5 0.542510 0.473896 0.572954 0.539906 0.441491 0.646465

var7(t-1) var8(t-1) var9(t-1) var10(t-1) var11(t-1) var12(t-1) \
1 0.825848 0.375218 0.311634 0.279312 0.506360 0.677639
2 0.659182 0.439791 0.358726 0.307973 0.526423 0.718259
3 0.479042 0.500000 0.388735 0.332465 0.502962 0.649757
4 0.269960 0.534031 0.416898 0.347577 0.497065 0.605001
5 0.662176 0.561955 0.458449 0.365294 0.497030 0.631418

var13(t-1) var14(t-1) var1(t)
1 0.792126 0.462500 0.578947
2 0.807557 0.462500 0.570850
3 0.799205 0.308333 0.485830
4 0.790240 0.295833 0.542510
5 0.802311 0.454167 0.514170

values = reframed.values

# Chronological split: the first 80% of rows train, the last 20% test.
n_train_days = int(0.8 * len(reframed))
train, test = values[:n_train_days, :], values[n_train_days:, :]

# The last column is the target; everything before it is input.
train_X = train[:, :-1]
train_y = train[:, -1]
test_X = test[:, :-1]
test_y = test[:, -1]

# LSTM input must be 3-D: [samples, time steps, features], with a single
# time step here.
n_train_rows, n_train_cols = train_X.shape
n_test_rows, n_test_cols = test_X.shape
train_X = train_X.reshape((n_train_rows, 1, n_train_cols))
test_X = test_X.reshape((n_test_rows, 1, n_test_cols))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(568, 1, 14) (568,) (143, 1, 14) (143,)

# Design the network architecture.

from keras.models import Sequential
# LSTM and Dense come from the public `keras.layers` namespace. The old
# `keras.layers.recurrent` / `keras.layers.core` module paths are private
# and are not available in current Keras releases.
from keras.layers import LSTM, Dense
from matplotlib import pyplot

# The first hidden layer is an LSTM with 50 units; the output layer is a
# single neuron that predicts close. The input shape is taken from train_X:
# 1 time step, with as many features as train_X carries (14 in the run shown
# in this post, not 5).
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
# Mean absolute error loss, Adam optimizer.
model.compile(loss='mae', optimizer='adam')

# The model is trained with the mean absolute error (MAE) loss for 50 epochs
# with a batch size of 32. Passing validation_data to fit() records the loss
# on the test split after every epoch alongside the training loss.
# (Author's note: epochs and batch_size were tuned for a long time without
# getting good results.)
history = model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=32,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,  # keep the chronological order of the samples
)

# Plot the training loss curve against the test loss curve.
losses = history.history
pyplot.plot(losses['loss'], label='train')
pyplot.plot(losses['val_loss'], label='test')
pyplot.legend()
pyplot.show()

Using TensorFlow backend.
….
….
Epoch 50/50
0s - loss: 0.0211 - val_loss: 0.0133

数据太少,效果不好

# Compute the root mean squared error between predictions and actual values,
# after mapping both back through the scaler.
yhat = model.predict(test_X)

# Flatten test_X from [samples, 1, features] back to [samples, features].
n_rows, n_cols = test_X.shape[0], test_X.shape[2]
test_X = test_X.reshape((n_rows, n_cols))

# The scaler was fitted on full-width rows, so the target column is padded
# with the remaining feature columns before inverting; only column 0 is kept.
padding = test_X[:, 1:]

inv_yhat = scaler.inverse_transform(concatenate((yhat, padding), axis=1))[:, 0]

test_y = test_y.reshape((len(test_y), 1))
inv_y = scaler.inverse_transform(concatenate((test_y, padding), axis=1))[:, 0]

rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

# Plot the predicted values and the actual values.
pyplot.plot(inv_yhat)
pyplot.plot(inv_y)
pyplot.show()

Test RMSE: 0.046
(图:测试集上预测收盘价与实际收盘价的对比曲线)

关于LSTM这篇不错,http://blog.csdn.net/mylove0414/article/details/55805974

  • 3
    点赞
  • 44
    收藏
    觉得还不错? 一键收藏
  • 8
    评论
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值