1. Model Principle
Seq2Seq is an implementation of the encoder-decoder architecture built on recurrent neural networks. It uses two LSTMs (or GRUs) with the same number of hidden units, one as the encoder and one as the decoder, as shown in the figure below:
To work with the Mini-batch SGD¹ training strategy, the implementation needs two mask mechanisms to remove the noise introduced by padding both the input sequences and the label sequences to a common length. The classic Seq2Seq implementation introduces a dynamic loss mechanism (masked softmax) to remove the noise caused by padding values in the label sequences.
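For concreteness, here is a minimal sketch of such a masked loss in PyTorch. It is illustrative only, not the implementation referenced above; the function name masked_cross_entropy, the argument layout, and pad_idx are assumptions made for this example.

import torch
import torch.nn.functional as F

def masked_cross_entropy(logits, labels, pad_idx=0):
    """Cross-entropy that ignores padded positions in the label sequence.

    logits: (batch, time, vocab) unnormalized scores
    labels: (batch, time) token indices, padded with pad_idx
    """
    # Per-token loss, no reduction yet
    loss = F.cross_entropy(
        logits.reshape(-1, logits.shape[-1]),  # (batch*time, vocab)
        labels.reshape(-1),                    # (batch*time,)
        reduction='none'
    )
    mask = (labels.reshape(-1) != pad_idx).float()  # 1 for real tokens, 0 for padding
    # Average only over the real (non-padding) label positions
    return (loss * mask).sum() / mask.sum().clamp(min=1.0)

PyTorch's built-in F.cross_entropy(..., ignore_index=pad_idx) achieves the same effect; the explicit mask is written out here only to make the mechanism visible.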
Removing the noise from padding values in the input sequences, in contrast, is done by adding a mask matrix inside the attention mechanism to correct the attention distribution. This is described in detail in the author's article "Text Generation: A Seq2Seq Model with an Attention Mechanism" (文本生成:加入注意力机制(Attention)的Seq2Seq模型).
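A generic sketch of that idea (again illustrative, not the exact Attention module used in the code below): padded input positions are pushed to negative infinity before the softmax, so they receive (near-)zero attention weight. The function name and shape conventions are assumptions.

import torch
import torch.nn.functional as F

def masked_dot_attention(enc_hiddens, query, input_mask):
    """Dot-product attention over encoder states with a padding mask.

    enc_hiddens: (batch, src_len, hidden) encoder output states
    query:       (batch, hidden)          decoder state used as the query
    input_mask:  (batch, src_len)         1 for real tokens, 0 for padding
    """
    # Dot-product scores between the query and every encoder state: (batch, src_len)
    scores = torch.bmm(enc_hiddens, query.unsqueeze(-1)).squeeze(-1)
    # Correct the attention distribution: padded positions get -inf, hence ~0 weight
    scores = scores.masked_fill(input_mask == 0, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    # Context vector: weighted sum of encoder states, (batch, hidden)
    return torch.bmm(weights.unsqueeze(1), enc_hiddens).squeeze(1)

The masked_fill before the softmax is what "corrects the attention distribution": after normalization, padded positions contribute nothing to the context vector.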
2. Code Implementation
2.1 Implementation Based on the TensorFlow Framework
import tensorflow as tf
from tensorflow.keras import layers

from MyEncoderDecoder import *


class Seq2Seq(EncoderDecoder):
    """readme:
    v1.0 Initial version: basic Seq2Seq model with an attention mechanism (dot-product scoring function)
    v1.1 Updated the mask-matrix generation method so it also covers multi-head attention and multi-head self-attention
    """
    def __init__(self, vocab_size, embed_size, word2index, index2word,
                 num_hiddens, num_encds=1, num_decds=1,
                 enc_dropout=0, enc_rtseq=True, enc_rtstate=True,
                 dec_dropout=0, dec_rtseq=True, dec_rtstate=True,
                 **kwargs):
        super().__init__(**kwargs)
        self.word2index = word2index
        self.index2word = index2word
        self.Hidden = tf.keras.layers.GRU
        # model
        self.Embed = layers.Embedding(vocab_size, embed_size)
        self.Encoders = [
            self.Hidden(num_hiddens, dropout=enc_dropout,
                        return_sequences=enc_rtseq, return_state=enc_rtstate)
            for _ in range(num_encds)]
        self.Decoders = [
            self.Hidden(num_hiddens, dropout=dec_dropout,
                        return_sequences=dec_rtseq, return_state=dec_rtstate)
            for _ in range(num_decds)]
        # Keras expects the lowercase activation name 'softmax'
        self.Output = layers.Dense(vocab_size, activation='softmax')
    def call(self, X, **kwargs):
        """
        Input:
            :X: label-encoded (token-index) feature sequences
            :X_len: true (unpadded) lengths of the feature sequences
            :y: label-encoded (token-index) label sequences
        Output:
            :y_hat: predicted token indices, shape (batch_size, num_steps)
            :outputs: per-step probability distributions over the vocabulary,
                      shape (batch_size, num_steps, vocab_size)
        """
        X_len = kwargs['X_len']
        y = kwargs['y']
        last_enc_hidden = None
        last_dec_hidden = None
        outputs = []
        # Encoder
        enc_hiddens = self.Embed(X)
        for Encd in self.Encoders:
            enc_hiddens, last_enc_hidden = Encd(enc_hiddens)
        # Decoder: start every sequence with the <bos> token
        y_hat = tf.constant([self.word2index['<bos>']] * y.shape[0],
                            shape=[y.shape[0], 1], dtype=y.dtype)
        for _ in range(y.shape[1]):
            dec_hiddens = self.Embed(y_hat)
            for Decd in self.Decoders:
                dec_hiddens, last_dec_hidden = Decd(dec_hiddens, initial_state=last_enc_hidden)
            # Output: project the last decoder state onto the vocabulary
            output = self.Output(last_dec_hidden)
            token = tf.cast(tf.reshape(tf.argmax(output, axis=1), shape=[-1, 1]), dtype=y.dtype)
            y_hat = tf.concat([y_hat, token], axis=1)
            outputs.append(tf.reshape(output, shape=[output.shape[0], 1, output.shape[1]]))
        y_hat = y_hat[:, 1:]  # drop the leading <bos> column
        outputs = tf.concat(outputs, axis=1)
        return y_hat, outputs
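A quick smoke test of the class above might look like the following. This is a hypothetical example: the toy vocabulary, shapes, and hyperparameters are made up, and it assumes that EncoderDecoder subclasses tf.keras.Model so that the extra keyword arguments are forwarded to call.

import tensorflow as tf

# Toy vocabulary; indices and size are invented for this example
word2index = {'<pad>': 0, '<bos>': 1, '<eos>': 2, 'hello': 3, 'world': 4}
index2word = {v: k for k, v in word2index.items()}

model = Seq2Seq(vocab_size=len(word2index), embed_size=8,
                word2index=word2index, index2word=index2word,
                num_hiddens=16, num_encds=1, num_decds=1)

X = tf.constant([[3, 4, 2, 0], [4, 3, 2, 0]], dtype=tf.int64)  # padded source batch
y = tf.constant([[4, 3, 2, 0], [3, 4, 2, 0]], dtype=tf.int64)  # padded target batch
X_len = tf.constant([3, 3])                                    # true source lengths

y_hat, outputs = model(X, X_len=X_len, y=y)
print(y_hat.shape, outputs.shape)  # expected: (2, 4) and (2, 4, 5)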
2.2 Implementation Based on the PyTorch Framework
import torch
import torch.nn as nn
import torch.nn.functional as F

from .RNN import *
from .Attention import *
from .EncoderDecoder import *


class Seq2Seq(EncoderDecoder):
    """readme:
    v1.0 Initial version: basic Seq2Seq model with an attention mechanism (dot-product scoring function)
    v1.1 Updated the mask-matrix generation method so it also covers multi-head attention and multi-head self-attention
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim
                 , word2index, index2word, pad_idx=0
                 , hidden_cell=GRU_Cell
                 , encoder_layers=1, decoder_layers=1
                 , enc_bid=False, enc_pad_pos='post'
                 # , enc_dropout=0
                 , dec_bid=False, dec_pad_pos='post'
                 # , dec_dropout=0
                 , device="cpu", **kwargs):
        super().__init__()
        #
        self.HiddenCell = hidden_cell
        self.word2index = word2index
        self.index2word = index2word
        #
        self.Embed = nn.Embedding(
            vocab_size, embed_dim, padding_idx=pad_idx, dtype=torch.float32
            , device=device
        )
        self.Encoders = nn.Sequential(
            RNN_Layer(
                self.HiddenCell(embed_dim, hidden_dim, **kwargs, device=device)
                , bidirectional=enc_bid, pad_position=enc_pad_pos
            )
        )
        if encoder_layers > 1:
            for i in range(encoder_layers - 1):
                self.Encoders.add_module(
                    str(i + 1), RNN_Layer(
                        self.HiddenCell(hidden_dim, hidden_dim, **kwargs, device=device)
                        , bidirectional=enc_bid, pad_position=enc_pad_pos
                    )
                )
        self.Decoders = nn.Sequential(
            RNN_Layer(
                self.HiddenCell(embed_dim, hidden_dim, **kwargs, device=device)
                , bidirectional=dec_bid, pad_position=dec_pad_pos
            )
        )
        if decoder_layers > 1:
            for i in range(decoder_layers - 1):
                self.Decoders.add_module(
                    str(i + 1), RNN_Layer(
                        self.HiddenCell(hidden_dim, hidden_dim, **kwargs, device=device)
                        , bidirectional=dec_bid, pad_position=dec_pad_pos
                    )
                )
        self.Attention = Attention()
        # Concatenated [decoder state ; attention context] -> vocabulary distribution
        self.Output = nn.Linear(hidden_dim + hidden_dim, vocab_size, device=device)
    def forward(self, inputs, input_mask, target_time_steps, target_type, **kwargs):
        """
        Input:
            :inputs: label-encoded (token-index) feature sequences
            :input_mask: mask marking real tokens vs. padding in the feature sequences
            :target_time_steps: number of decoding steps (length of the label sequences)
            :target_type: dtype of the label sequences
            :bos (kwargs): initial <bos> column used to start decoding
        Output:
            :y_hat: predicted token indices, shape (batch_size, target_time_steps)
            :outputs: per-step probability distributions over the vocabulary,
                      shape (batch_size, target_time_steps, vocab_size)
        """
        enc_last_state = None
        dec_last_state = None
        outputs = []
        # Input
        enc_hiddens = self.Embed(inputs)
        # Encoder
        for Encd in self.Encoders:
            enc_hiddens, enc_last_state = Encd(enc_hiddens, input_mask)
        # Decoder: start every sequence with the <bos> token
        y_hat = kwargs['bos']
        for _ in range(target_time_steps):
            dec_hiddens = self.Embed(y_hat)
            for Decd in self.Decoders:
                dec_hiddens, dec_last_state = Decd(dec_hiddens, initial_state=enc_last_state)
            # Attention: the last decoder state queries the encoder states;
            # input_mask removes padded input positions from the attention distribution
            attention = self.Attention(enc_hiddens, queries=dec_last_state[-1], input_mask=input_mask)
            output_features = torch.concat([dec_last_state[-1], attention], dim=1)
            # Output
            output = F.softmax(self.Output(output_features), dim=1)
            token = torch.unsqueeze(
                torch.argmax(output, dim=1)
                , dim=-1
            ).type(target_type)
            y_hat = torch.concat([y_hat, token], dim=1)
            outputs.append(torch.unsqueeze(output, dim=1))
        y_hat = y_hat[:, 1:]  # drop the leading <bos> column
        outputs = torch.concat(outputs, dim=1)
        return y_hat, outputs
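Analogously, a hypothetical usage sketch for the PyTorch version. It depends on the author's GRU_Cell, RNN_Layer, Attention, and EncoderDecoder modules, so it is illustrative only; the vocabulary and shapes are invented.

import torch

# Toy vocabulary; indices and size are invented for this example
word2index = {'<pad>': 0, '<bos>': 1, '<eos>': 2, 'hello': 3, 'world': 4}
index2word = {v: k for k, v in word2index.items()}

model = Seq2Seq(vocab_size=len(word2index), embed_dim=8, hidden_dim=16,
                word2index=word2index, index2word=index2word, pad_idx=0)

inputs = torch.tensor([[3, 4, 2, 0], [4, 3, 2, 0]])          # padded source batch
input_mask = (inputs != 0).long()                            # 1 for real tokens, 0 for padding
bos = torch.full((inputs.shape[0], 1), word2index['<bos>'])  # initial <bos> column

y_hat, outputs = model(inputs, input_mask,
                       target_time_steps=4, target_type=inputs.dtype,
                       bos=bos)
print(y_hat.shape, outputs.shape)  # expected: (2, 4) and (2, 4, 5)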
Mini-batch Stochastic Gradient Descent ↩︎