# --- Imports (stdlib / third-party, grouped per PEP 8) ---
import os
import pickle
import random
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import utils
from tensorflow.keras.models import Model, Sequential
# BUG FIX: LSTM was previously imported from the standalone `keras` package
# while every other layer comes from `tensorflow.keras`; mixing the two
# packages can fail when the layer is added to a tf.keras Sequential model.
from tensorflow.keras.layers import (LSTM, Dropout, Dense, Activation, Input,
                                     Permute, Reshape)
from tensorflow.keras.callbacks import CSVLogger, ReduceLROnPlateau
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Fix all RNG seeds for reproducible runs.
# NOTE(review): setting PYTHONHASHSEED at runtime does not affect hash
# randomization of the *current* process, only subprocesses.
seed = 1024
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# Load the raw daily quotes; .head() is a notebook-style peek (no-op in a script).
df = pd.read_csv('sh.600000.csv')
df.head()
def createSequence(data, windows, pre_timesteps=1):
    """Slice an OHLC frame into sliding windows and next-step targets.

    Args:
        data: DataFrame containing at least the columns open/low/close/high.
        windows: number of consecutive timesteps per input sequence.
        pre_timesteps: size of the label span after each window; only its
            first row is kept, so the effective horizon is always 1 step.

    Returns:
        (x, y): float64 arrays of shape (n, windows, 4) and (n, 4).
    """
    values = data[["open", "low", "close", "high"]].values
    n_samples = len(values) - windows - pre_timesteps
    seqs = [values[start:start + windows, :] for start in range(n_samples)]
    # The target is the first row immediately following each window.
    targets = [values[start + windows] for start in range(n_samples)]
    return np.array(seqs, dtype="float64"), np.array(targets, dtype="float64")
def create_pkl(window=10, pre_timesteps=1, train_split=0.9, csv_path="sh.600000.csv"):
    """Build train/val sequence datasets from the raw CSV and cache them as pickles.

    Args:
        window: number of past timesteps per input sequence.
        pre_timesteps: label span width (createSequence keeps only its first row).
        train_split: fraction of the chronological data used for training.
        csv_path: source CSV file (new optional parameter; default keeps the
            original hard-coded behavior).

    Returns:
        (train_seq, train_label, val_seq, val_label) float64 ndarrays.
    """
    train = pd.read_csv(csv_path, index_col=0)
    # Scale the four OHLC columns into [0, 1] so the LSTM trains stably.
    transfer = MinMaxScaler(feature_range=(0, 1))
    train[["open", "low", "close", "high"]] = transfer.fit_transform(
        train[["open", "low", "close", "high"]])
    trainval_seq, trainval_label = createSequence(train, window, pre_timesteps)
    # Chronological split: earliest train_split fraction is training data.
    train_num = int(len(trainval_seq) * train_split)
    train_seq = trainval_seq[:train_num]
    train_label = trainval_label[:train_num]
    val_seq = trainval_seq[train_num:]
    val_label = trainval_label[train_num:]
    # BUG FIX: ensure the cache directory exists (open() crashed when absent).
    os.makedirs('data', exist_ok=True)
    # Save each split under a name encoding the generation parameters.
    tag = f'{window}_{pre_timesteps}_{train_split}'
    for stem, arr in (('train_seq', train_seq), ('train_label', train_label),
                      ('val_seq', val_seq), ('val_label', val_label)):
        with open(f'data/{stem}_{tag}.pkl', 'wb') as f:
            pickle.dump(arr, f)
    return train_seq, train_label, val_seq, val_label
def get_data(train_split=0.9, batchsize=32, window=10, pre_timesteps=1, overwrite=True):
    """Load (building if missing or forced) the cached datasets, trimmed to batchsize.

    Returns:
        (train_seq, train_label, val_seq, val_label), each truncated so its
        length is an exact multiple of batchsize.
    """
    tag = f'{window}_{pre_timesteps}_{train_split}'
    # Reuse cached pickles only when allowed and the marker file exists.
    if not overwrite and os.path.exists(f'data/train_seq_{tag}.pkl'):
        print(f'发现 window={window},pre_timesteps={pre_timesteps},train_split={train_split}的数据')
    else:
        print(f'正在创建window={window},pre_timesteps={pre_timesteps},train_split={train_split}的数据')
        _ = create_pkl(window=window, pre_timesteps=pre_timesteps, train_split=train_split)
    # Load the four cached arrays in a fixed order.
    arrays = []
    for stem in ('train_seq', 'train_label', 'val_seq', 'val_label'):
        with open(f'data/{stem}_{tag}.pkl', 'rb') as f:
            arrays.append(pickle.load(f))
    train_seq, train_label, val_seq, val_label = arrays
    # Sample counts must divide evenly by batchsize, or training errors out.
    n_train = len(train_seq) - len(train_seq) % batchsize
    n_val = len(val_seq) - len(val_seq) % batchsize
    return (train_seq[:n_train], train_label[:n_train],
            val_seq[:n_val], val_label[:n_val])
# Build (overwrite=True forces regeneration) and cache the datasets once.
d = get_data(train_split=0.9, batchsize=32, window=10,
             pre_timesteps=1, overwrite=True)
# Notebook-style bare expression: displays each array's shape interactively.
[part.shape for part in d]
def lstm(window, featuresize, pre_t):
    """Build a 3-layer stacked LSTM regressor over OHLC windows.

    Args:
        window: timesteps per input sequence.
        featuresize: number of features per timestep (4: open/low/close/high).
        pre_t: prediction horizon; unused in the architecture — only 1 is
            supported downstream.

    Returns:
        An uncompiled tf.keras Sequential model mapping
        (batch, window, featuresize) -> (batch, featuresize).
    """
    # BUG FIX: import LSTM from tensorflow.keras here so the model works even
    # if the file-level import still pulls it from the standalone `keras`
    # package (mixing the two breaks tf.keras model construction).
    from tensorflow.keras.layers import LSTM
    model = Sequential()
    model.add(LSTM(40, input_shape=(window, featuresize),
                   return_sequences=True))
    model.add(LSTM(40, return_sequences=True))
    model.add(LSTM(40, return_sequences=False))
    model.add(Dense(featuresize))
    # ReLU is safe for the output because features are MinMax-scaled to [0, 1].
    model.add(Activation("relu"))
    return model
# ---- Hyper-parameters ----
epochs = 200          # training epochs
batchsize = 32        # mini-batch size
learning_rate = 0.01  # initial Adam learning rate
featuresize = 4       # input features per timestep (open/low/close/high)
train_split = 0.9     # fraction of data used for training
window = 50           # lookback window length
pre_timesteps = 1     # prediction horizon; this LSTM head only supports 1

# Load (or reuse, overwrite=False) the cached datasets for these parameters.
train_seq, train_label, val_seq, val_label = get_data(
    train_split, batchsize, window, pre_timesteps, False)
for part in (train_seq, train_label, val_seq, val_label):
    print(part.shape)
# ---- Model, optimizer and training callbacks ----
model = lstm(window, featuresize, pre_timesteps)

# Adam optimizer with the configured initial learning rate.
adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# NOTE(review): 'acc' is not a meaningful metric for regression; it is kept
# unchanged because the CSV log columns and the LR scheduler monitor it.
model.compile(loss='mse', optimizer=adam, metrics=['acc'])

# Append per-epoch stats [epoch, acc, loss, val_acc, val_loss] to a CSV file.
log = CSVLogger("./lstm_log.csv", separator=",", append=False)

# Halve the learning rate when val_acc stops improving for 10 epochs,
# down to a floor of 0.001.
reduce = ReduceLROnPlateau(monitor='val_acc',
                           factor=0.5,
                           patience=10,
                           verbose=1,
                           mode='auto',
                           min_delta=0.001,
                           cooldown=0,
                           min_lr=0.001)
# ---- Train and evaluate ----
use_time = time.time()
# NOTE(review): use_multiprocessing only affects generator/Sequence inputs;
# it is a no-op for in-memory arrays but is kept for compatibility.
model.fit(train_seq,
          train_label,
          validation_data=(val_seq, val_label),  # tuple, per Keras docs
          epochs=epochs,
          batch_size=batchsize,
          verbose=1,
          use_multiprocessing=True,
          callbacks=[log, reduce])
use_time = time.time() - use_time

# Single validation pass (the original evaluated and printed twice verbatim).
loss, acc = model.evaluate(val_seq, val_label, verbose=1)
print('Loss : {}, Accuracy: {}'.format(loss, acc))
print(f'lstm use time :{use_time:.5f}')
# ---- Predict on the validation set and report regression metrics ----
pre_y = model.predict(val_seq)
# Compare only the last feature column ("high") of labels vs predictions.
y = val_label[:, -1]
pre = pre_y[:, -1]
mae = mean_absolute_error(y, pre)
mape = mean_absolute_percentage_error(y, pre)
mse = mean_squared_error(y, pre)
rmse = mse ** 0.5
r2 = r2_score(y, pre)
print(f'MAE :{mae}')
print(f'MSE :{mse}')
print(f'RMSE:{rmse}')  # BUG FIX: was computed but never reported
print(f'MAPE:{mape}')
print(f'R2  :{r2}')    # BUG FIX: was computed but never reported
import matplotlib.pyplot as plt

# Plot label vs prediction for the last feature ("high") on samples 200-249.
name = ["open", "low", "close", "high"]
plt.figure(dpi=100)
plt.plot(range(200, 250), val_label[200:250, -1], label='label')
# BUG FIX: the prediction slice must match the 50-point x range; the original
# plotted pre_y[100:250] (150 points) against 50 x-values, which raises a
# ValueError for mismatched dimensions in matplotlib.
plt.plot(range(200, 250), pre_y[200:250, -1], label='pred')
plt.title(name[-1])
plt.legend()
plt.show()
import matplotlib.pyplot as plt

# One figure per OHLC feature, comparing labels against predictions
# over the same 50-sample slice of the validation set.
name = ["open", "low", "close", "high"]
len_slice = slice(200, 250)
for col in range(val_label.shape[1]):
    plt.figure(dpi=100)
    plt.plot(val_label[len_slice, col], label='label')
    plt.plot(pre_y[len_slice, col], label='pred')
    plt.title(name[col])
    plt.legend()
    plt.show()