import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
"""
Github: Yonv1943 Zen4 Jia1 hao2
https://github.com/Yonv1943/DL_RL_Zoo/blob/master/RNN
The source of training data
https://github.com/L1aoXingyu/
code-of-learn-deep-learning-with-pytorch/blob/master/
chapter5_RNN/time-series/lstm-time-series.ipynb
"""
def run_train_gru():
    inp_dim = 3
    out_dim = 1
    batch_size = 12 * 4

    '''load data'''
    data = load_data()
    data_x = data[:-1, :]
    data_y = data[+1:, 0]
    assert data_x.shape[1] == inp_dim

    train_size = int(len(data_x) * 0.75)

    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    train_x = train_x.reshape((train_size, inp_dim))
    train_y = train_y.reshape((train_size, out_dim))

    '''build model'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegGRU(inp_dim, out_dim, mod_dim=12, mid_layers=2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    '''train'''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)

    batch_var_x = list()
    batch_var_y = list()

    # build batch_size suffixes of the training data, each starting at a
    # different offset from the end
    for i in range(batch_size):
        j = train_size - i
        batch_var_x.append(var_x[j:])
        batch_var_y.append(var_y[j:])

    # pad the variable-length suffixes into one (seq, batch, feature) tensor
    # (see _demo_pad_sequence right after this function)
    from torch.nn.utils.rnn import pad_sequence
    batch_var_x = pad_sequence(batch_var_x)
    batch_var_y = pad_sequence(batch_var_y)

    with torch.no_grad():
        # position-dependent loss weights, shaped (seq_len, 1, 1) so they
        # broadcast along the time axis (see _demo_loss_weights further below)
        seq_len = len(batch_var_y)
        weights = np.tanh(np.arange(seq_len) * (np.e / seq_len))
        weights = torch.tensor(weights, dtype=torch.float32, device=device).view(-1, 1, 1)

    for e in range(256):
        out = net(batch_var_x)

        # loss = criterion(out, batch_var_y)
        loss = (out - batch_var_y) ** 2 * weights
        loss = loss.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 100 == 0:
            print('Epoch: {}, Loss: {:.5f}'.format(e, loss.item()))
    '''eval'''
    net = net.eval()

    test_x = data_x.copy()
    test_x[train_size:, 0] = 0  # hide the ground truth after the train split
    test_x = test_x[:, np.newaxis, :]
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)
    # autoregressive rollout: the last output of net(test_x[:i]) predicts the
    # passenger count at row i, so write it back into row i as the next input
    for i in range(train_size, len(data) - 2):
        test_y = net(test_x[:i])
        test_x[i, 0, 0] = test_y[-1]

    pred_y = test_x[1:, 0, 0]
    pred_y = pred_y.cpu().data.numpy()

    diff_y = pred_y[train_size:] - data_y[train_size:-1]
    l1_loss = np.mean(np.abs(diff_y))
    l2_loss = np.mean(diff_y ** 2)
    print("L1: {:.3f} L2: {:.3f}".format(l1_loss, l2_loss))

    plt.plot(pred_y, 'r', label='pred')
    plt.plot(data_y, 'b', label='real')
    plt.legend(loc='best')
    plt.pause(4)
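

# A minimal sketch (ours, not part of the original training code) of what
# pad_sequence does to the variable-length suffixes built above. The function
# name _demo_pad_sequence is added purely for illustration.
def _demo_pad_sequence():
    from torch.nn.utils.rnn import pad_sequence
    # three sequences of lengths 1, 2, 3, each with 3 features
    seqs = [torch.ones((n, 3)) for n in (1, 2, 3)]
    # right-padded with zeros and stacked into (max_seq_len, batch, feature),
    # matching nn.GRU/nn.LSTM's default batch_first=False layout
    batch = pad_sequence(seqs)
    print(batch.shape)     # torch.Size([3, 3, 3])
    print(batch[:, 0, 0])  # tensor([1., 0., 0.])  -> the short seq is padded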
def run_train_lstm():
    inp_dim = 3
    out_dim = 1
    mid_dim = 8
    mid_layers = 1
    batch_size = 12 * 4
    mod_dir = '.'

    '''load data'''
    data = load_data()
    # drop the last row: it has no "next month" left to predict
    data_x = data[:-1, :]
    # shift by one row and take column 0: the label of each input row is the
    # next month's passenger count, so the first row is only ever an input
    data_y = data[+1:, 0]  # column 0 of (seq_number, seq_year, seq_month)
    # fail fast if the feature dimension of data_x does not match inp_dim
    assert data_x.shape[1] == inp_dim

    train_size = int(len(data_x) * 0.75)
    # the first 75% of the series is the training split
    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    train_x = train_x.reshape((train_size, inp_dim))
    train_y = train_y.reshape((train_size, out_dim))
    '''build model: use the GPU if available, otherwise the CPU'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegLSTM(inp_dim, out_dim, mid_dim, mid_layers).to(device)
    # MSE would be the stock choice; the loop below uses a hand-weighted
    # squared error instead (the criterion call is kept commented out)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    '''train: convert the ndarrays to tensors; `device` decides CPU vs GPU'''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)

    batch_var_x = list()
    batch_var_y = list()

    # only batch_size=48 of the ~107 possible suffixes are used, so the
    # earliest training months never appear in a batch; arguably wasteful
    for i in range(batch_size):
        # each batch element is a suffix cut at a different offset, from the
        # end backwards: 0, 1, 2, ... up to 47 months long
        j = train_size - i
        # inputs: (passenger count this month, year index, month index)
        batch_var_x.append(var_x[j:])
        # labels: next month's passenger count
        batch_var_y.append(var_y[j:])

    from torch.nn.utils.rnn import pad_sequence
    # padded to (47, 48, 3): 48 sequences, max length 47, 3 features
    # (see _demo_pad_sequence above)
    batch_var_x = pad_sequence(batch_var_x)
    # labels padded to (47, 48, 1)
    batch_var_y = pad_sequence(batch_var_y)

    with torch.no_grad():
        # weights grow along the time axis so that later (more recent) steps
        # dominate the loss; shaped (seq_len, 1, 1) to broadcast over the
        # batch and feature dims (see _demo_loss_weights after this function)
        seq_len = len(batch_var_y)
        weights = np.tanh(np.arange(seq_len) * (np.e / seq_len))
        weights = torch.tensor(weights, dtype=torch.float32, device=device).view(-1, 1, 1)
print("Training Start")
for e in range(304):
# 相当于384次batch, 训练的时候,将seq_number, seq_year, seq_month 都放入模型进行了训练,感觉就像几个有规律组合成的单词,只不过跟前后的单词都有关
out = net(batch_var_x)
# loss = criterion(out, batch_var_y)
# 手动计算mse
loss = (out - batch_var_y) ** 2 * weights
# loss对样本的个数求平均值
loss = loss.mean()
# 设置所有梯度优化器tensor为0
optimizer.zero_grad()
# 反向传播计算梯度
loss.backward()
# 通过梯度下降法进行梯度更新
optimizer.step()
# 每64次batch打印一下当前的损失函数,state_dict() 返回模型所有的状态。
if e % 64 == 0:
print('Epoch: {:4}, Loss: {:.5f}'.format(e, loss.item()))
torch.save(net.state_dict(), '{}/net.pth'.format(mod_dir))
print("Save in:", '{}/net.pth'.format(mod_dir))
    '''eval: load the weights saved above with torch.save'''
    net.load_state_dict(torch.load('{}/net.pth'.format(mod_dir), map_location=lambda storage, loc: storage))
    # switch the model to evaluation mode
    net = net.eval()

    # data_x covers all 12 years (9 for training + 3 held out)
    test_x = data_x.copy()
    # zero out the passenger counts after the training split; shape stays (143, 3)
    test_x[train_size:, 0] = 0
    # insert a batch dimension in the middle: (143, 3) -> (143, 1, 3)
    test_x = test_x[:, np.newaxis, :]
    # ndarray -> tensor
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)

    '''simple but not elegant way: re-run the whole prefix at every step'''
    # for i in range(train_size, len(data) - 2):
    #     test_y = net(test_x[:i])
    #     test_x[i, 0, 0] = test_y[-1]

    '''elegant but slightly more involved way: carry the hidden state forward
    instead (see _demo_stateful_stepping after the RegGRU class below)'''
    eval_size = 1
    # initial hidden and cell states, shape (mid_layers, eval_size, mid_dim) = (1, 1, 8)
    zero_ten = torch.zeros((mid_layers, eval_size, mid_dim), dtype=torch.float32, device=device)
    # warm up on the real training rows; hc carries (h, c) forward, and the
    # last output is the prediction for row train_size, written back there
    test_y, hc = net.output_y_hc(test_x[:train_size], (zero_ten, zero_ten))
    test_x[train_size, 0, 0] = test_y[-1]
    for i in range(train_size, len(data) - 2):
        # one time step at a time: the carried hc plus input row i yield the
        # prediction for row i + 1
        test_y, hc = net.output_y_hc(test_x[i:i + 1], hc)
        # write the prediction back as the passenger-count feature of the next
        # input row; its year/month features stay real
        test_x[i + 1, 0, 0] = test_y[-1]
    # drop row 0: it is pure input, never a prediction
    pred_y = test_x[1:, 0, 0]
    # back to numpy for scoring and plotting
    pred_y = pred_y.cpu().data.numpy()
    # ground truth for the held-out window
    print(train_size, data_y[train_size:-1])
    # the forecast window should be 3 years (36 months), but only 35 points
    # are compared: data_x dropped the last row and data_y dropped the first
    diff_y = pred_y[train_size:] - data_y[train_size:-1]
    print(diff_y)
    print(pred_y[train_size:])
    l1_loss = np.mean(np.abs(diff_y))  # mean absolute error (L1)
    l2_loss = np.mean(diff_y ** 2)  # mean squared error (L2)
    print("L1: {:.3f} L2: {:.3f}".format(l1_loss, l2_loss))

    plt.plot(pred_y, 'r', label='pred')
    plt.plot(data_y, 'b', label='real', alpha=0.3)
    # vertical divider between train and prediction; [-1, 2] is its y-extent
    plt.plot([train_size, train_size], [-1, 2], color='k', label='train | pred')
    plt.legend(loc='best')
    plt.savefig('lstm_reg.png')
    plt.pause(4)
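

# A minimal sketch (ours) of the tanh weighting used in both training loops:
# the weights rise from ~0 toward ~1 along the time axis, so recent steps
# dominate the squared error. The name and default length are illustrative.
def _demo_loss_weights(seq_len=107):
    weights = np.tanh(np.arange(seq_len) * (np.e / seq_len))
    print(weights[0], weights[seq_len // 2], weights[-1])  # ~0.00 ~0.87 ~0.99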
def run_origin():
    inp_dim = 2
    out_dim = 1
    mod_dir = '.'

    '''load data'''
    data = load_data()  # axis1: number, year, month
    # 2-lag sliding window: two consecutive months form the input and the
    # third is the label (see _demo_sliding_window after this function)
    data_x = np.concatenate((data[:-2, 0:1], data[+1:-1, 0:1]), axis=1)
    data_y = data[2:, 0]

    train_size = int(len(data_x) * 0.75)
    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    train_x = train_x.reshape((-1, 1, inp_dim))
    train_y = train_y.reshape((-1, 1, out_dim))

    '''build model'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegLSTM(inp_dim, out_dim, mid_dim=4, mid_layers=2).to(device)
    criterion = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    '''train'''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)

    print('var_x.size():', var_x.size())
    print('var_y.size():', var_y.size())

    for e in range(512):
        out = net(var_x)
        loss = criterion(out, var_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (e + 1) % 100 == 0:  # print the loss every 100 epochs
            print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.item()))
    torch.save(net.state_dict(), '{}/net.pth'.format(mod_dir))
    '''eval'''
    # net.load_state_dict(torch.load('{}/net.pth'.format(mod_dir), map_location=lambda storage, loc: storage))
    net = net.eval()  # switch to evaluation mode

    """
    inappropriate way of seq prediction:
    use all real data to predict the number of the next month
    """
    test_x = data_x.reshape((-1, 1, inp_dim))
    var_data = torch.tensor(test_x, dtype=torch.float32, device=device)
    eval_y = net(var_data)  # predictions over the full series
    pred_y = eval_y.view(-1).cpu().data.numpy()

    plt.plot(pred_y[1:], 'r', label='pred inappr', alpha=0.3)
    plt.plot(data_y, 'b', label='real', alpha=0.3)
    plt.plot([train_size, train_size], [-1, 2], label='train | pred')

    """
    appropriate way of seq prediction:
    use real+pred data to predict the numbers of the next 3 years
    """
    test_x = data_x.reshape((-1, 1, inp_dim))
    test_x[train_size:] = 0  # delete the data of the last 3 years
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)
    for i in range(train_size, len(data) - 2):
        test_y = net(test_x[:i])
        # shift the window forward: the previous "next month" becomes the
        # current month, and the fresh prediction fills the "next month" slot
        test_x[i, 0, 0] = test_x[i - 1, 0, 1]
        test_x[i, 0, 1] = test_y[-1, 0]
    pred_y = test_x.cpu().data.numpy()
    pred_y = pred_y[:, 0, 0]

    plt.plot(pred_y[2:], 'g', label='pred appr')
    plt.legend(loc='best')
    plt.savefig('lstm_origin.png')
    plt.pause(4)
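

# A minimal sketch (ours) of the 2-lag window built in run_origin(): two
# consecutive values form the input row and the third is the label. The toy
# values are illustrative only.
def _demo_sliding_window():
    values = np.arange(6, dtype=np.float32)[:, np.newaxis]  # 0..5 as a column
    data_x = np.concatenate((values[:-2, 0:1], values[+1:-1, 0:1]), axis=1)
    data_y = values[2:, 0]
    print(data_x)  # [[0. 1.] [1. 2.] [2. 3.] [3. 4.]]
    print(data_y)  # [2. 3. 4. 5.]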
class RegLSTM(nn.Module):
    def __init__(self, inp_dim, out_dim, mid_dim, mid_layers):
        super(RegLSTM, self).__init__()
        # inp_dim: feature size of the LSTM input; our data fixes it to 3
        # mid_dim: width of the LSTM gates, and the size of its hidden/output
        # mid_layers: number of stacked recurrent layers
        # From the nn.LSTM docstring:
        # input_size: The number of expected features in the input `x`
        # hidden_size: The number of features in the hidden state `h`
        # num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
        #     would mean stacking two LSTMs together to form a `stacked LSTM`,
        #     with the second LSTM taking in outputs of the first LSTM and
        #     computing the final results. Default: 1
        # bias: If ``False``, then the layer does not use bias weights `b_ih`
        #     and `b_hh`. Default: ``True``
        # batch_first: If ``True``, then the input and output tensors are
        #     provided as (batch, seq, feature). Default: ``False``
        # dropout: If non-zero, introduces a `Dropout` layer on the outputs of
        #     each LSTM layer except the last layer, with dropout probability
        #     equal to :attr:`dropout`. Default: 0
        # bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        # proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0
        self.rnn = nn.LSTM(inp_dim, mid_dim, mid_layers)  # rnn

        # For time-series regression we append two fully connected layers
        # after the LSTM (one would also work) and project the output down to
        # out_dim=1, since we predict a single value: the passenger count.
        # The head after the LSTM can be seen as a regression step.
        #
        # Why two layers? By the Universal Approximation Theorem (the quoted
        # answer credits Barron, 1993), a sufficiently wide two-layer network
        # with any "squashing" activation can approximate any continuous
        # function on a compact domain. In practice "sufficiently wide" is
        # hard to satisfy for very complex functions, and good generalization
        # additionally requires sound training practice (i.i.d. batches, a
        # well-chosen loss, Lipschitz continuity, etc.).
        # (Paraphrased from an answer by a maths enthusiast on Zhihu.)
        self.reg = nn.Sequential(
            nn.Linear(mid_dim, mid_dim),
            nn.Tanh(),
            nn.Linear(mid_dim, out_dim),
        )  # regression
    def forward(self, x):
        y = self.rnn(x)[0]  # y, (h, c) = self.rnn(x)

        seq_len, batch_size, hid_dim = y.shape
        # flatten time and batch so the Linear head sees (seq*batch, hid_dim);
        # see _demo_linear_on_3d after this class for an equivalent shortcut
        y = y.view(-1, hid_dim)
        y = self.reg(y)
        y = y.view(seq_len, batch_size, -1)
        return y

    """
    PyCharm Ctrl+click on nn.LSTM() jumps to the PyTorch source:
    Examples::
        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
    """

    # hc = (h, c): h is the hidden/output state, c is the cell (memory) state
    def output_y_hc(self, x, hc):
        # hc: a pair of (mid_layers, batch, mid_dim) tensors, e.g. (1, 1, 8)
        y, hc = self.rnn(x, hc)  # y, (h, c) = self.rnn(x, hc)

        # y: (107, 1, 8) during warm-up, (1, 1, 8) per eval step
        seq_len, batch_size, hid_dim = y.size()
        y = y.view(-1, hid_dim)  # (107, 8) / (1, 8)
        y = self.reg(y)  # (107, 1) / (1, 1)
        y = y.view(seq_len, batch_size, -1)  # (107, 1, 1) / (1, 1, 1)
        return y, hc
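

# A minimal sketch (ours, not the author's) showing that the view -> reg ->
# view dance in RegLSTM.forward is equivalent to applying the Linear head to
# the 3D tensor directly: nn.Linear acts on the last dimension of any rank.
def _demo_linear_on_3d():
    reg = nn.Linear(8, 1)
    y = torch.randn(5, 2, 8)                  # (seq_len, batch, hid_dim)
    flat = reg(y.view(-1, 8)).view(5, 2, -1)  # the flatten/unflatten route
    direct = reg(y)                           # Linear on the 3D tensor
    print(torch.allclose(flat, direct))       # True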
class RegGRU(nn.Module):
    def __init__(self, inp_dim, out_dim, mod_dim, mid_layers):
        super(RegGRU, self).__init__()

        self.rnn = nn.GRU(inp_dim, mod_dim, mid_layers)
        self.reg = nn.Linear(mod_dim, out_dim)

    def forward(self, x):
        x, h = self.rnn(x)  # (seq, batch, hidden)

        seq_len, batch_size, hid_dim = x.shape
        x = x.view(-1, hid_dim)
        x = self.reg(x)
        x = x.view(seq_len, batch_size, -1)
        return x

    def output_y_h(self, x, h):
        y, h = self.rnn(x, h)

        seq_len, batch_size, hid_dim = y.size()
        y = y.view(-1, hid_dim)
        y = self.reg(y)
        y = y.view(seq_len, batch_size, -1)
        return y, h
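

# A minimal sketch (ours) of why output_y_h / output_y_hc support the
# autoregressive eval above: feeding the sequence one step at a time while
# carrying the hidden state reproduces a single full pass over the sequence.
def _demo_stateful_stepping():
    torch.manual_seed(0)
    net = RegGRU(inp_dim=3, out_dim=1, mod_dim=12, mid_layers=2).eval()
    x = torch.randn(6, 1, 3)   # (seq_len, batch=1, feature)
    full = net(x)              # one pass over the whole sequence
    h = torch.zeros(2, 1, 12)  # (mid_layers, batch, mod_dim), like zero_ten
    steps = []
    for t in range(6):
        y, h = net.output_y_h(x[t:t + 1], h)  # one time step, carry h along
        steps.append(y)
    print(torch.allclose(full, torch.cat(steps), atol=1e-6))  # True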
def load_data():
    # monthly international airline passenger counts, 1949-01 ~ 1960-12
    seq_number = np.array(
        [112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
         118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
         114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
         162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
         209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
         272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
         302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
         315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
         318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
         348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
         362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
         342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
         417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
         432.], dtype=np.float32)
    # assert seq_number.shape == (144, )
    # plt.plot(seq_number)
    # plt.ion()
    # plt.pause(1)

    # (144,) row vector -> (144, 1) column vector
    seq_number = seq_number[:, np.newaxis]

    # print(repr(seq))
    # 1949~1960, 12 years, 12*12==144 months
    seq_year = np.arange(12)
    seq_month = np.arange(12)
    # np.repeat(seq_year, 12) -> 0 0 ... 0 1 1 ... 1 ... 11 11 ... 11
    # (each year index repeated 12 times: np.repeat copies element by element,
    # with axis choosing the dimension to copy along)
    # np.tile(seq_month, 12) -> 0 1 ... 11 0 1 ... 11 ...
    # (np.tile repeats the whole array)
    # stacking the two and transposing yields their Cartesian product, (144, 2)
    seq_year_month = np.transpose(
        [np.repeat(seq_year, len(seq_month)),
         np.tile(seq_month, len(seq_year))],
    )  # Cartesian product

    seq = np.concatenate((seq_number, seq_year_month), axis=1)

    # standardization (z-score) per column; note the statistics are discarded,
    # so predictions stay in standardized units (see _demo_destandardize below)
    seq = (seq - seq.mean(axis=0)) / seq.std(axis=0)
    return seq
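

# A minimal sketch (ours): load_data() standardizes each column but discards
# the statistics, so every prediction above lives in standardized units. To
# report real passenger counts you would keep mean/std and invert the z-score.
def _demo_destandardize():
    raw = np.array([112., 118., 132., 129.], dtype=np.float32)
    mean, std = raw.mean(), raw.std()
    standardized = (raw - mean) / std     # what load_data() returns
    restored = standardized * std + mean  # the inverse transform
    print(np.allclose(restored, raw))     # True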
if __name__ == '__main__':
    run_train_lstm()
    # run_train_gru()
    # run_origin()