1. 模型框架
1.1 记忆单元分类
记忆单元主要是为了存储过去的信息;常见的记忆单元如下:
- RNN
- GRU
- LSTM
1.2 模型类别分类
- 单向循环
- 双向循环
- 多层单向或双向循环
1.3 优缺点
- 优点:
(1)可以处理变长序列
(2)模型大小与序列长度无关
(3)计算量与序列长度呈线性增长
(4)考虑历史信息
(5)便于流式输出
(6)权重是时不变的(各时刻共享同一组参数)
- 缺点:
(1)串行计算比较慢
(2)无法获取太长的历史信息
1.4 应用场景
- AI诗歌生成;属于生成任务,one2many型
- 文本情感分类:many2one
- 词法识别:many2many
- 机器翻译:many2many
- 语音识别
- 语言模型
2. 公式说明
2.1 公式
- $x_t$:表示第 t 时刻的输入
- $h_t$:表示第 t 时刻的隐藏状态
- $h_{t-1}$:表示第 t-1 时刻的隐藏状态
2.2 输入输出大小
- 输入:$x$ 的形状为 (batch_size, sequence_length, input_size)(batch_first=True 时);初始隐藏状态 $h_0$ 的形状为 (num_layers*num_directions, batch_size, hidden_size)
- 输出:output 的形状为 (batch_size, sequence_length, num_directions*hidden_size);h_n 的形状为 (num_layers*num_directions, batch_size, hidden_size)
3. 代码
以下主要有两种实现方式
- 基于pytorch API 的实现单/双循环神经网络
- 自己根据公式手写的单/双循环神经网络
import torch
from torch import nn
# hyper-parameters for a tiny demo RNN
hidden_in = 4   # input feature size per time step
hidden_out = 3  # hidden-state size
num_layers = 1
# define the RNN layer; batch_first=True means input is (batch, seq, feature)
rnn_layer = nn.RNN(input_size=hidden_in, hidden_size=hidden_out, num_layers=num_layers,
                   batch_first=True)
batch_size = 2
sequence_length = 4
# random init the input: (batch_size, sequence_length, hidden_in)
my_input = torch.randn(batch_size, sequence_length, hidden_in)
# zero-init the initial hidden state: (batch_size, hidden_out)
h_prev = torch.zeros(batch_size, hidden_out)
# my_output holds the hidden state of every time step: (batch, seq, hidden_out)
# h_n is the final hidden state: (num_layers, batch, hidden_out)
# nn.RNN expects h_0 as (num_layers, batch, hidden_out), hence unsqueeze(0)
my_output, h_n = rnn_layer(my_input, h_prev.unsqueeze(0))
# print(f"my_output={my_output}")
# print(f"my_output.shape={my_output.shape}")
# print(f"h_n={h_n}")
# print(f"h_n.shape={h_n.shape}")
# custom_rnn_function
def custom_rnn_function(input, w_ih, w_hh, b_ih, b_hh, h_prev):
    """
    Hand-written single-layer, unidirectional RNN forward pass,
    numerically equivalent to ``nn.RNN`` with ``batch_first=True``.

    formula:
        h_t = tanh(w_ih @ x_t + b_ih + w_hh @ h_{t-1} + b_hh)
    where x_t is the input at time t.

    :param input: input sequence, (batch_size, sequence_length, hidden_in)
    :param w_ih: input-to-hidden weight, (hidden_out, hidden_in)
    :param w_hh: hidden-to-hidden weight, (hidden_out, hidden_out)
    :param b_ih: input-to-hidden bias, (hidden_out,)
    :param b_hh: hidden-to-hidden bias, (hidden_out,)
    :param h_prev: initial hidden state, (batch_size, hidden_out)
                   (2-D here, unlike nn.RNN's (1, batch, hidden_out) h_0)
    :return: (output, h_n) — output is (batch_size, sequence_length, hidden_out);
             h_n is (1, batch_size, hidden_out) to match nn.RNN's layout
    """
    batch_size, sequence_length, _ = input.shape
    hidden_out = w_ih.shape[0]
    output = torch.zeros(batch_size, sequence_length, hidden_out)
    for t in range(sequence_length):
        # x_t: (batch_size, hidden_in)
        x_t = input[:, t, :]
        # Broadcasting matmul replaces the original per-step tile + bmm:
        # x_t @ w_ih.T -> (batch_size, hidden_out), h_prev @ w_hh.T likewise,
        # so no (batch, out, in) weight copies are materialized each step.
        h_prev = torch.tanh(x_t @ w_ih.T + b_ih + h_prev @ w_hh.T + b_hh)
        output[:, t, :] = h_prev
    return output, h_prev.unsqueeze(0)
# grab the trained weights/biases out of the PyTorch RNN layer
custom_w_ih = rnn_layer.weight_ih_l0    # (hidden_out, hidden_in)
custom_w_hh = rnn_layer.weight_hh_l0    # (hidden_out, hidden_out)
custom_bias_ih = rnn_layer.bias_ih_l0   # (hidden_out,)
custom_bias_hh = rnn_layer.bias_hh_l0   # (hidden_out,)
# feed the nn.RNN weights into custom_rnn_function:
# if its output and h_n match the nn.RNN results,
# then our hand-written implementation is correct.
custom_output, custom_hn = custom_rnn_function(my_input, custom_w_ih, custom_w_hh,
                                               custom_bias_ih, custom_bias_hh, h_prev)
# print(f"custom_output={custom_output}")
# print(f"custom_hn={custom_hn}")
# print(f"my_output={my_output}")
# print(f"h_n={h_n}")
# print("check whether custom_output is equal to my_output")
# print(torch.isclose(custom_output, my_output))
# print("check whether custom_hn is equal to h_n")
# print(torch.isclose(custom_hn, h_n))
# custom bidirectional rnn function
def bicstm_rnn_function(input, w_ih, w_hh, b_ih, b_hh, h_prev,
                        w_ih_reverse, w_hh_reverse, b_ih_reverse, b_hh_reverse):
    """
    Hand-written single-layer bidirectional RNN forward pass, numerically
    equivalent to ``nn.RNN(..., batch_first=True, bidirectional=True)``.

    Runs custom_rnn_function once left-to-right and once on the
    time-reversed input, then concatenates the two along the feature axis.

    :param input: input sequence, (batch_size, sequence_length, hidden_in)
    :param w_ih, w_hh, b_ih, b_hh: forward-direction weights/biases
    :param h_prev: initial hidden state, (batch_size, hidden_out)
                   (NOTE: the same initial state is reused for both
                   directions — fine here because the caller passes zeros)
    :param w_ih_reverse, w_hh_reverse, b_ih_reverse, b_hh_reverse:
        reverse-direction weights/biases
    :return: (output, h_n) — output is (batch, seq, 2*hidden_out) with the
             forward features first; h_n is (2, batch, hidden_out), stacking
             the last state of each direction as nn.RNN does
    """
    batch_size, sequence_length, _ = input.shape
    hidden_out = w_ih.shape[0]
    output = torch.zeros(batch_size, sequence_length, hidden_out * 2)
    # forward direction: ordinary left-to-right pass
    forward_output = custom_rnn_function(input, w_ih, w_hh, b_ih, b_hh, h_prev)[0]
    # backward direction: flip the time axis, run "forward", flip back below
    backward_output = custom_rnn_function(torch.flip(input, [1]), w_ih_reverse,
                                          w_hh_reverse, b_ih_reverse,
                                          b_hh_reverse, h_prev)[0]
    output[:, :, :hidden_out] = forward_output
    output[:, :, hidden_out:] = torch.flip(backward_output, [1])
    # h_n = last state of each direction, stacked: (2, batch, hidden_out)
    h_n = torch.cat([forward_output[:, -1, :].unsqueeze(0),
                     backward_output[:, -1, :].unsqueeze(0)], dim=0)
    return output, h_n
# reference implementation: PyTorch's bidirectional RNN
bi_rnn_layer = nn.RNN(input_size=hidden_in, hidden_size=hidden_out, num_layers=num_layers,
                      batch_first=True, bidirectional=True)
# one initial hidden state per direction: (2, batch_size, hidden_out)
bi_h_prev = torch.zeros(2, batch_size, hidden_out)
bi_my_output, bi_h_n = bi_rnn_layer(my_input, bi_h_prev)
print(f"bi_my_output={bi_my_output}")
print(f"bi_h_n={bi_h_n}")
# inspect every parameter (forward weights plus the *_reverse set)
for k, v in bi_rnn_layer.named_parameters():
    print(k, v, v.shape)
# pull out the forward-direction weights/biases...
bicstm_weight_ih_l0 = bi_rnn_layer.weight_ih_l0
bicstm_weight_hh_l0 = bi_rnn_layer.weight_hh_l0
bicstm_bias_ih_l0 = bi_rnn_layer.bias_ih_l0
bicstm_bias_hh_l0 = bi_rnn_layer.bias_hh_l0
# ...and the reverse-direction ones
bicstm_weight_ih_l0_reverse = bi_rnn_layer.weight_ih_l0_reverse
bicstm_weight_hh_l0_reverse = bi_rnn_layer.weight_hh_l0_reverse
bicstm_bias_ih_l0_reverse = bi_rnn_layer.bias_ih_l0_reverse
bicstm_bias_hh_l0_reverse = bi_rnn_layer.bias_hh_l0_reverse
# run the hand-written bidirectional RNN with the same weights;
# bi_h_prev[0] is the (batch, hidden_out) zero state both directions share
bicstm_output, bicstm_h_n = bicstm_rnn_function(my_input, bicstm_weight_ih_l0, bicstm_weight_hh_l0, bicstm_bias_ih_l0,
                                                bicstm_bias_hh_l0,
                                                bi_h_prev[0], bicstm_weight_ih_l0_reverse, bicstm_weight_hh_l0_reverse,
                                                bicstm_bias_ih_l0_reverse,
                                                bicstm_bias_hh_l0_reverse)
print("pytorch API rnn")
# bi_my_output, bi_h_n
print(f"bi_my_output={bi_my_output}")
print(f"bi_h_n={bi_h_n}")
# element-wise comparison: all True means the two implementations agree
print("bicstm_output is equal to bi_my_output")
print(torch.isclose(bicstm_output,bi_my_output))
print(torch.isclose(bicstm_output,bi_my_output).shape)
print("bicstm_h_n is equal to bi_h_n")
print(torch.isclose(bicstm_h_n,bi_h_n))
print(torch.isclose(bicstm_h_n,bi_h_n).shape)
print("custom bidirectional rnn")
print(f"bicstm_output={bicstm_output}")
print(f"bicstm_output.shape={bicstm_output.shape}")
print(f"bicstm_h_n={bicstm_h_n}")
print(f"bicstm_h_n.shape={bicstm_h_n.shape}")
print("*"*50)
print(f"bi_my_output={bi_my_output}")
print(f"bi_my_output.shape={bi_my_output.shape}")
print(f"bi_h_n={bi_h_n}")
print(f"bi_h_n.shape={bi_h_n.shape}")
- 结果:
bi_my_output=tensor([[[-0.3309, -0.6514, 0.8314, -0.7133, -0.8930, -0.7533],
[-0.3520, -0.7544, 0.8153, -0.4129, -0.8242, -0.8933],
[ 0.0365, -0.5342, 0.9046, -0.4952, -0.2890, -0.1609],
[-0.3345, -0.4509, 0.6656, -0.7710, -0.4374, -0.2100]],
[[-0.2685, -0.5694, 0.7102, -0.6100, -0.8374, -0.8259],
[ 0.3823, -0.2883, 0.6636, -0.0259, 0.0331, -0.9175],
[ 0.0728, -0.1796, 0.6884, -0.3441, -0.0081, -0.3160],
[ 0.2953, -0.1599, 0.7536, -0.1300, -0.3686, 0.1407]]],
grad_fn=<TransposeBackward1>)
bi_h_n=tensor([[[-0.3345, -0.4509, 0.6656],
[ 0.2953, -0.1599, 0.7536]],
[[-0.7133, -0.8930, -0.7533],
[-0.6100, -0.8374, -0.8259]]], grad_fn=<StackBackward>)
weight_ih_l0 Parameter containing:
tensor([[-0.2153, -0.3879, -0.2379, 0.3674],
[-0.3065, -0.1250, -0.3957, 0.0352],
[ 0.4447, -0.1330, 0.3051, 0.5727]], requires_grad=True) torch.Size([3, 4])
weight_hh_l0 Parameter containing:
tensor([[-0.4278, 0.4842, 0.4345],
[ 0.1279, 0.3849, -0.0556],
[-0.2065, -0.4020, 0.4729]], requires_grad=True) torch.Size([3, 3])
bias_ih_l0 Parameter containing:
tensor([-0.4691, -0.4634, 0.5497], requires_grad=True) torch.Size([3])
bias_hh_l0 Parameter containing:
tensor([ 0.1844, -0.0852, 0.3367], requires_grad=True) torch.Size([3])
weight_ih_l0_reverse Parameter containing:
tensor([[-0.0869, -0.5565, -0.0156, 0.2845],
[-0.4279, -0.1971, -0.3512, 0.3958],
[ 0.4047, 0.1749, -0.4136, 0.5187]], requires_grad=True) torch.Size([3, 4])
weight_hh_l0_reverse Parameter containing:
tensor([[-0.3453, 0.2386, 0.5089],
[-0.3314, 0.3804, 0.4880],
[ 0.5452, -0.0493, 0.3664]], requires_grad=True) torch.Size([3, 3])
bias_ih_l0_reverse Parameter containing:
tensor([-0.3213, -0.5536, 0.1178], requires_grad=True) torch.Size([3])
bias_hh_l0_reverse Parameter containing:
tensor([-0.0876, -0.2065, -0.4133], requires_grad=True) torch.Size([3])
pytorch API rnn
bi_my_output=tensor([[[-0.3309, -0.6514, 0.8314, -0.7133, -0.8930, -0.7533],
[-0.3520, -0.7544, 0.8153, -0.4129, -0.8242, -0.8933],
[ 0.0365, -0.5342, 0.9046, -0.4952, -0.2890, -0.1609],
[-0.3345, -0.4509, 0.6656, -0.7710, -0.4374, -0.2100]],
[[-0.2685, -0.5694, 0.7102, -0.6100, -0.8374, -0.8259],
[ 0.3823, -0.2883, 0.6636, -0.0259, 0.0331, -0.9175],
[ 0.0728, -0.1796, 0.6884, -0.3441, -0.0081, -0.3160],
[ 0.2953, -0.1599, 0.7536, -0.1300, -0.3686, 0.1407]]],
grad_fn=<TransposeBackward1>)
bi_h_n=tensor([[[-0.3345, -0.4509, 0.6656],
[ 0.2953, -0.1599, 0.7536]],
[[-0.7133, -0.8930, -0.7533],
[-0.6100, -0.8374, -0.8259]]], grad_fn=<StackBackward>)
# 重点:我们自己计算的结果跟官方定义的API结果一致为True
bicstm_output is equal to bi_my_output
tensor([[[True, True, True, True, True, True],
[True, True, True, True, True, True],
[True, True, True, True, True, True],
[True, True, True, True, True, True]],
[[True, True, True, True, True, True],
[True, True, True, True, True, True],
[True, True, True, True, True, True],
[True, True, True, True, True, True]]])
torch.Size([2, 4, 6])
bicstm_h_n is equal to bi_h_n
tensor([[[True, True, True],
[True, True, True]],
[[True, True, True],
[True, True, True]]])
torch.Size([2, 2, 3])
custom bidirectional rnn
bicstm_output=tensor([[[-0.3309, -0.6514, 0.8314, -0.7133, -0.8930, -0.7533],
[-0.3520, -0.7544, 0.8153, -0.4129, -0.8242, -0.8933],
[ 0.0365, -0.5342, 0.9046, -0.4952, -0.2890, -0.1609],
[-0.3345, -0.4509, 0.6656, -0.7710, -0.4374, -0.2100]],
[[-0.2685, -0.5694, 0.7102, -0.6100, -0.8374, -0.8259],
[ 0.3823, -0.2883, 0.6636, -0.0259, 0.0331, -0.9175],
[ 0.0728, -0.1796, 0.6884, -0.3441, -0.0081, -0.3160],
[ 0.2953, -0.1599, 0.7536, -0.1300, -0.3686, 0.1407]]],
grad_fn=<CopySlices>)
bicstm_output.shape=torch.Size([2, 4, 6])
bicstm_h_n=tensor([[[-0.3345, -0.4509, 0.6656],
[ 0.2953, -0.1599, 0.7536]],
[[-0.7133, -0.8930, -0.7533],
[-0.6100, -0.8374, -0.8259]]], grad_fn=<CatBackward>)
bicstm_h_n.shape=torch.Size([2, 2, 3])
**************************************************
bi_my_output=tensor([[[-0.3309, -0.6514, 0.8314, -0.7133, -0.8930, -0.7533],
[-0.3520, -0.7544, 0.8153, -0.4129, -0.8242, -0.8933],
[ 0.0365, -0.5342, 0.9046, -0.4952, -0.2890, -0.1609],
[-0.3345, -0.4509, 0.6656, -0.7710, -0.4374, -0.2100]],
[[-0.2685, -0.5694, 0.7102, -0.6100, -0.8374, -0.8259],
[ 0.3823, -0.2883, 0.6636, -0.0259, 0.0331, -0.9175],
[ 0.0728, -0.1796, 0.6884, -0.3441, -0.0081, -0.3160],
[ 0.2953, -0.1599, 0.7536, -0.1300, -0.3686, 0.1407]]],
grad_fn=<TransposeBackward1>)
bi_my_output.shape=torch.Size([2, 4, 6])
bi_h_n=tensor([[[-0.3345, -0.4509, 0.6656],
[ 0.2953, -0.1599, 0.7536]],
[[-0.7133, -0.8930, -0.7533],
[-0.6100, -0.8374, -0.8259]]], grad_fn=<StackBackward>)
bi_h_n.shape=torch.Size([2, 2, 3])
Process finished with exit code 0
4. 小结
对于单向循环神经网络来说,我们需要注意各个权重矩阵的大小以及输入的大小。对于双向循环神经网络来说,有两处反向操作:第一处是把输入沿时间轴反向后送入反向分支,第二处是把反向分支的输出再沿时间轴反向后拼接。具体详见上述代码。