视频链接: 30、PyTorch LSTM和LSTMP的原理及其手写复现_哔哩哔哩_bilibili
PyTorch LSTM API:https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html?highlight=lstm#torch.nn.LSTM
LSTM API
首先实例化一些参数:
import torch
import torch.nn as nn
# Problem dimensions shared by the snippets below.
batch_size = 2
seq_len = 3
input_size = 4
h_size = 5

# Randomly initialised input sequence, [batch_size, seq_len, input_size].
input = torch.randn((batch_size, seq_len, input_size))
# Initial cell / hidden states, [batch_size, h_size]; plain tensors, not trained.
c_0 = torch.randn((batch_size, h_size))
h_0 = torch.randn((batch_size, h_size))
调用PyTorch中的 LSTM API:
# Reference implementation: the official LSTM API (num_layers defaults to 1).
lstm_layer = nn.LSTM(input_size=input_size, hidden_size=h_size, batch_first=True)
# The initial states need a leading axis: (D*num_layers=1, b, hidden_size).
output, (h_n, c_n) = lstm_layer(input, (h_0[None], c_0[None]))
看一下返回的结果的形状:
# One shape per line, in this order:
#   output: [2,3,5]  [b, seq_len, hidden_size]
#   h_n:    [1,2,5]  [num_layers, b, hidden_size]
#   c_n:    [1,2,5]  [num_layers, b, hidden_size]
print(output.shape, h_n.shape, c_n.shape, sep="\n")
这里输出一下lstm_layer中的参数名称及其形状:
# List every parameter of the layer together with its shape.
for param_name, param in lstm_layer.named_parameters():
    print(param_name, param.shape)
输出结果如下:
weight_ih_l0 torch.Size([20, 4]) # [4*hidden_size, input_size]
weight_hh_l0 torch.Size([20, 5]) # [4*hidden_size, hidden_size]
bias_ih_l0 torch.Size([20]) # [4*hidden_size]
bias_hh_l0 torch.Size([20]) # [4*hidden_size]
手写 lstm_forward 函数
手写一个lstm_forward函数,实现LSTM的计算原理。官网上的计算公式,如下:
\begin{align}
&i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1} + b_{hi}) \\
&f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1} + b_{hf}) \\
&g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1} + b_{hg}) \\
&o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1} + b_{ho}) \\
&c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
&h_t = o_t \odot \tanh(c_t)
\end{align}
这里先将lstm_forward函数中的每个参数的维度写出来:
def lstm_forward(input, initial_states, w_ih, w_hh, b_ih, b_hh):
    """Run a single-layer, unidirectional LSTM over a batch-first sequence.

    Hand-written counterpart of ``nn.LSTM(..., batch_first=True)`` with one
    layer, meant to be fed the parameters of such a layer.

    Args:
        input: Input sequence, [batch, seq_len, input_size].
        initial_states: Tuple ``(h_0, c_0)``, each [batch, hidden_size].
        w_ih: Input-to-hidden weights, [4*hidden_size, input_size], with the
            four gates stacked along dim 0 in the order i, f, g, o.
        w_hh: Hidden-to-hidden weights, [4*hidden_size, hidden_size].
        b_ih: Input-to-hidden bias, [4*hidden_size].
        b_hh: Hidden-to-hidden bias, [4*hidden_size].

    Returns:
        ``(output, (h_n, c_n))`` where ``output`` is
        [batch, seq_len, hidden_size] and ``h_n`` / ``c_n`` are
        [1, batch, hidden_size] (a leading axis is added to match the
        three-dimensional states returned by the official API).
    """
    h_prev, c_prev = initial_states
    b_size, seq_len, _ = input.shape
    h_size = h_prev.shape[-1]

    # Allocate on the input's dtype and device; a bare torch.zeros would
    # silently downcast float64 results and drop GPU results onto the CPU.
    output = input.new_zeros(b_size, seq_len, h_size)

    # Transposes are views, so hoisting them costs nothing. A plain matmul
    # broadcasts over the batch, so the weights need no per-sample copy.
    w_ih_t = w_ih.t()  # [input_size, 4*hidden_size]
    w_hh_t = w_hh.t()  # [hidden_size, 4*hidden_size]

    for t in range(seq_len):
        x = input[:, t, :]  # input vector at the current step, [b, input_size]
        # Pre-activations of all four gates at once, [b, 4*hidden_size].
        gates = x @ w_ih_t + h_prev @ w_hh_t + b_ih + b_hh
        # Input (i), forget (f), cell (g) and output (o) parts, each [b, hidden_size].
        i_t, f_t, g_t, o_t = gates.chunk(4, dim=-1)
        i_t = torch.sigmoid(i_t)
        f_t = torch.sigmoid(f_t)
        g_t = torch.tanh(g_t)
        o_t = torch.sigmoid(o_t)

        c_prev = f_t * c_prev + i_t * g_t
        h_prev = o_t * torch.tanh(c_prev)
        output[:, t, :] = h_prev

    return output, (h_prev.unsqueeze(0), c_prev.unsqueeze(0))
验证一下 lstm_forward 的准确性:
# Feed the hand-written function the parameters of lstm_layer.
# The "_me" suffix marks results from the hand-written implementation.
output_me, (h_n_me, c_n_me) = lstm_forward(
    input,
    (h_0, c_0),
    w_ih=lstm_layer.weight_ih_l0,
    w_hh=lstm_layer.weight_hh_l0,
    b_ih=lstm_layer.bias_ih_l0,
    b_hh=lstm_layer.bias_hh_l0,
)
打印一下,看两个的计算结果是否相同:
# Official API results: output [2,3,5] [b, seq_len, hidden_size], then
# h_n and c_n, each [1,2,5] [num_layers, b, hidden_size].
print("PyTorch API output:", output, h_n, c_n, sep="\n")
# Hand-written results, printed in the same order.
print("\nlstm_forward function output:", output_me, h_n_me, c_n_me, sep="\n")
结果如下,完全一致,说明手写的是对的:
PyTorch API output:
tensor([[[ 0.1671, 0.2493, 0.2603, -0.1448, -0.1951],
[-0.0680, 0.0478, 0.0218, 0.0735, -0.0604],
[ 0.0144, 0.0507, -0.0556, -0.2600, 0.1234]],
[[ 0.4561, -0.0015, -0.0776, -0.0644, -0.5319],
[ 0.1667, 0.0111, 0.0114, -0.1227, -0.2369],
[-0.0220, 0.0637, -0.2353, 0.0404, -0.1309]]],
grad_fn=<TransposeBackward0>)
tensor([[[ 0.0144, 0.0507, -0.0556, -0.2600, 0.1234],
[-0.0220, 0.0637, -0.2353, 0.0404, -0.1309]]],
grad_fn=<StackBackward0>)
tensor([[[ 0.0223, 0.1574, -0.1572, -0.4663, 0.2110],
[-0.0382, 0.6440, -0.4334, 0.0779, -0.3198]]],
grad_fn=<StackBackward0>)
lstm_forward function output:
tensor([[[ 0.1671, 0.2493, 0.2603, -0.1448, -0.1951],
[-0.0680, 0.0478, 0.0218, 0.0735, -0.0604],
[ 0.0144, 0.0507, -0.0556, -0.2600, 0.1234]],
[[ 0.4561, -0.0015, -0.0776, -0.0644, -0.5319],
[ 0.1667, 0.0111, 0.0114, -0.1227, -0.2369],
[-0.0220, 0.0637, -0.2353, 0.0404, -0.1309]]], grad_fn=<CopySlices>)
tensor([[[ 0.0144, 0.0507, -0.0556, -0.2600, 0.1234],
[-0.0220, 0.0637, -0.2353, 0.0404, -0.1309]]],
grad_fn=<UnsqueezeBackward0>)
tensor([[[ 0.0223, 0.1574, -0.1572, -0.4663, 0.2110],
[-0.0382, 0.6440, -0.4334, 0.0779, -0.3198]]],
grad_fn=<UnsqueezeBackward0>)
LSTMP
# Problem dimensions for the LSTMP example.
batch_size = 2
seq_len = 3
input_size = 4
h_size = 5
proj_size = 3  # must be smaller than hidden_size

input = torch.randn((batch_size, seq_len, input_size))
c_0 = torch.randn((batch_size, h_size))
# Note: the hidden state now has proj_size features instead of h_size.
h_0 = torch.randn((batch_size, proj_size))

# Reference implementation: the official LSTM API with projection enabled.
lstm_layer = nn.LSTM(
    input_size=input_size,
    hidden_size=h_size,
    batch_first=True,
    proj_size=proj_size,
)
output, (h_n, c_n) = lstm_layer(input, (h_0[None], c_0[None]))
打印一下返回的结果的形状:
# One shape per line, in this order:
#   output: [2,3,3]  [b, seq_len, proj_size]
#   h_n:    [1,2,3]  [num_layers, b, proj_size]
#   c_n:    [1,2,5]  [num_layers, b, hidden_size]
print(output.shape, h_n.shape, c_n.shape, sep="\n")
这里输出一下lstm_layer中的参数名称及其形状:
# List every parameter of the projected layer together with its shape.
for param_name, param in lstm_layer.named_parameters():
    print(param_name, param.shape)
输出结果如下:
weight_ih_l0 torch.Size([20, 4]) # [4*hidden_size, input_size]
weight_hh_l0 torch.Size([20, 3]) # [4*hidden_size, proj_size]
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5]) # 这个参数就是对 hidden_state 进行压缩的 [proj_size, hidden_size]
修改 lstm_forward 函数
修改lstm_forward函数,从而能够实现LSTMP:
def lstm_forward(input, initial_states, w_ih, w_hh, b_ih, b_hh, w_hr=None):
    """Run a single-layer, unidirectional LSTM (optionally LSTMP) over a batch-first sequence.

    Hand-written counterpart of ``nn.LSTM(..., batch_first=True)`` with one
    layer. When ``w_hr`` is given, the hidden state is projected down after
    every step, as ``nn.LSTM`` does with ``proj_size > 0``.

    Args:
        input: Input sequence, [batch, seq_len, input_size].
        initial_states: Tuple ``(h_0, c_0)``; ``h_0`` is [batch, proj_size]
            with projection and [batch, hidden_size] without, ``c_0`` is
            always [batch, hidden_size].
        w_ih: Input-to-hidden weights, [4*hidden_size, input_size], with the
            four gates stacked along dim 0 in the order i, f, g, o.
        w_hh: Hidden-to-hidden weights, [4*hidden_size, proj_size] with
            projection, [4*hidden_size, hidden_size] without.
        b_ih: Input-to-hidden bias, [4*hidden_size].
        b_hh: Hidden-to-hidden bias, [4*hidden_size].
        w_hr: Optional projection weights, [proj_size, hidden_size].

    Returns:
        ``(output, (h_n, c_n))`` where ``output`` is
        [batch, seq_len, out_size], ``h_n`` is [1, batch, out_size] and
        ``c_n`` is [1, batch, hidden_size]; ``out_size`` is proj_size with
        projection and hidden_size otherwise.
    """
    h_prev, c_prev = initial_states
    b_size, seq_len, _ = input.shape
    h_size = c_prev.shape[-1]  # c keeps hidden_size even when h is projected
    output_size = h_size if w_hr is None else w_hr.shape[0]

    # Allocate on the input's dtype and device; a bare torch.zeros would
    # silently downcast float64 results and drop GPU results onto the CPU.
    output = input.new_zeros(b_size, seq_len, output_size)

    # Transposes are views, so hoisting them costs nothing. A plain matmul
    # broadcasts over the batch, so the weights need no per-sample copy.
    w_ih_t = w_ih.t()  # [input_size, 4*hidden_size]
    w_hh_t = w_hh.t()  # [proj_size or hidden_size, 4*hidden_size]
    w_hr_t = None if w_hr is None else w_hr.t()  # [hidden_size, proj_size]

    for t in range(seq_len):
        x = input[:, t, :]  # input vector at the current step, [b, input_size]
        # Pre-activations of all four gates at once, [b, 4*hidden_size].
        gates = x @ w_ih_t + h_prev @ w_hh_t + b_ih + b_hh
        # Input (i), forget (f), cell (g) and output (o) parts, each [b, hidden_size].
        i_t, f_t, g_t, o_t = gates.chunk(4, dim=-1)
        i_t = torch.sigmoid(i_t)
        f_t = torch.sigmoid(f_t)
        g_t = torch.tanh(g_t)
        o_t = torch.sigmoid(o_t)

        c_prev = f_t * c_prev + i_t * g_t
        h_prev = o_t * torch.tanh(c_prev)  # [b, hidden_size]
        if w_hr_t is not None:
            # Projection: compress the hidden state to [b, proj_size].
            h_prev = h_prev @ w_hr_t
        output[:, t, :] = h_prev

    return output, (h_prev.unsqueeze(0), c_prev.unsqueeze(0))
验证一下 lstm_forward 的准确性:
# Feed the hand-written function the parameters of the projected layer.
output_me, (h_n_me, c_n_me) = lstm_forward(
    input,
    (h_0, c_0),
    w_ih=lstm_layer.weight_ih_l0,
    w_hh=lstm_layer.weight_hh_l0,
    b_ih=lstm_layer.bias_ih_l0,
    b_hh=lstm_layer.bias_hh_l0,
    w_hr=lstm_layer.weight_hr_l0,
)
# Official API results: output [2,3,3] [b, seq_len, proj_size],
# h_n [1,2,3] [num_layers, b, proj_size], c_n [1,2,5] [num_layers, b, hidden_size].
print("PyTorch API output:", output, h_n, c_n, sep="\n")
# Hand-written results, printed in the same order.
print("\nlstm_forward function output:", output_me, h_n_me, c_n_me, sep="\n")
输出的结果如下,完全一致,说明手写的是对的:
PyTorch API output:
tensor([[[ 0.0392, -0.3149, -0.1264],
[ 0.0141, -0.2619, -0.0760],
[ 0.0306, -0.2166, 0.0915]],
[[-0.0777, -0.1205, -0.0555],
[-0.0646, -0.0926, 0.0391],
[-0.0456, -0.0576, 0.1849]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.0306, -0.2166, 0.0915],
[-0.0456, -0.0576, 0.1849]]], grad_fn=<StackBackward0>)
tensor([[[ 1.9913, -0.2683, -0.1221, 0.1751, -0.6072],
[-0.2383, -0.2253, -0.0385, -0.8820, -0.1794]]],
grad_fn=<StackBackward0>)
lstm_forward function output:
tensor([[[ 0.0392, -0.3149, -0.1264],
[ 0.0141, -0.2619, -0.0760],
[ 0.0306, -0.2166, 0.0915]],
[[-0.0777, -0.1205, -0.0555],
[-0.0646, -0.0926, 0.0391],
[-0.0456, -0.0576, 0.1849]]], grad_fn=<CopySlices>)
tensor([[[ 0.0306, -0.2166, 0.0915],
[-0.0456, -0.0576, 0.1849]]], grad_fn=<UnsqueezeBackward0>)
tensor([[[ 1.9913, -0.2683, -0.1221, 0.1751, -0.6072],
[-0.2383, -0.2253, -0.0385, -0.8820, -0.1794]]],
grad_fn=<UnsqueezeBackward0>)