在PyTorch中,当设置batch_first=True时,LSTM的输入格式为(batch_size, seq_len, input_size)(默认情况下为(seq_len, batch_size, input_size))。seq_len原意是指LSTM处理翻译问题中每条句子序列的长度,在时序数据处理中对应着一条样本中包含前多少时刻的样本数据,具体数据处理的代码如下:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
class MyDataSet(Dataset):
    """Minimal map-style Dataset pairing feature windows with their targets.

    Assumes ``x`` and ``y`` are indexable arrays sharing the same leading
    (sample) dimension — TODO confirm with callers.
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = x  # feature windows
        self.y = y  # matching targets

    def __len__(self):
        # Number of samples = size of the leading dimension.
        return len(self.x)

    def __getitem__(self, idx):
        # Return the (features, target) pair for one sample.
        return self.x[idx], self.y[idx]
def data_loader(filepath='data/dataset.xlsx', seq_len=6, train_rate=0.7):
    """Load a time-series spreadsheet and build train/valid/test Datasets.

    Reads the sheet at *filepath*, drops the first column, min-max scales
    all remaining columns to [0, 1], then slides a window of length
    *seq_len* over the rows: features are every column but the last,
    targets are the last column, both over the same window.

    Args:
        filepath: path to the .xlsx data file.
        seq_len: number of past time steps per sample window.
        train_rate: fraction of windows used for training; the remainder
            is split evenly between validation and test.

    Returns:
        (dataset_train, dataset_val, dataset_test) as MyDataSet instances.
    """
    ori_data = pd.read_excel(filepath).iloc[:, 1:].values
    # NOTE(review): the scaler is fit on the FULL series before splitting,
    # which leaks validation/test statistics into training — consider
    # fitting on the training rows only.
    scaler = MinMaxScaler(feature_range=(0, 1))
    norm_data = scaler.fit_transform(ori_data)

    # Build sliding windows: x = all-but-last columns, y = last column.
    data_x = []
    data_y = []
    for i in range(len(norm_data) - seq_len):
        data_x.append(norm_data[i:i + seq_len, :-1])
        data_y.append(norm_data[i:i + seq_len, -1:])
    data_x = np.asarray(data_x, dtype=np.float32)
    data_y = np.asarray(data_y, dtype=np.float32)

    # Split over the number of WINDOWS (len(data_x)), not raw rows:
    # the original indices were computed from len(norm_data), which is
    # seq_len larger, so they over-ran the arrays and silently shrank or
    # emptied the test split. Also honour train_rate, which was ignored.
    n_samples = len(data_x)
    train_end = int(train_rate * n_samples)
    val_end = train_end + (n_samples - train_end) // 2

    train_x, valid_x, test_x = data_x[:train_end], data_x[train_end:val_end], data_x[val_end:]
    train_y, valid_y, test_y = data_y[:train_end], data_y[train_end:val_end], data_y[val_end:]

    dataset_train = MyDataSet(train_x, train_y)
    dataset_val = MyDataSet(valid_x, valid_y)
    dataset_test = MyDataSet(test_x, test_y)
    return dataset_train, dataset_val, dataset_test