Text Generation: Seq2Seq

1. Model Principle

Seq2Seq is an encoder-decoder architecture built on recurrent neural networks. It uses two LSTMs (or GRUs) with the same number of hidden units, one as the encoder and one as the decoder, as shown in the figure below:

Figure 1  The Seq2Seq model

To work with the mini-batch SGD 1 training strategy, the implementation adds two masking mechanisms that remove the noise introduced by padding the input sequences and the label sequences to a common length. In the classic Seq2Seq implementation, a dynamic loss function (masked softmax cross-entropy) is introduced to remove the noise from padding values in the label sequences.
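
A minimal sketch of such a masked loss follows (the function name and the from_logits setting are assumptions for illustration, not the exact implementation used here): padded label positions are zeroed out of the cross-entropy before averaging over the valid time steps.

import tensorflow as tf

def masked_softmax_ce_loss(y_true, y_pred, y_len):
    """Cross-entropy that ignores padded positions in the label sequences.
    y_true: (batch, num_steps) token ids; y_pred: (batch, num_steps, vocab_size)
    probabilities; y_len: (batch,) actual label-sequence lengths."""
    # 1.0 for real tokens, 0.0 for padding
    mask = tf.sequence_mask(y_len, maxlen=tf.shape(y_true)[1], dtype=tf.float32)
    ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
    unweighted = ce(y_true, y_pred)  # per-token loss, shape (batch, num_steps)
    # average only over the valid (unpadded) steps of each sequence
    return tf.reduce_sum(unweighted * mask, axis=1) / tf.reduce_sum(mask, axis=1)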

Removing the noise from padding values in the input sequences, in turn, is done by adding a mask matrix inside the attention mechanism to correct the attention distribution; this is covered in detail in the author's article 文本生成:加入注意力机制(Attention)的Seq2Seq模型.
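
In a hedged sketch (names are illustrative, not the implementation from that article), the idea is to push the scores of padded key positions toward negative infinity before the softmax, so that they receive essentially zero attention weight:

import tensorflow as tf

def masked_attention_weights(scores, input_mask):
    """scores: (batch, num_queries, num_keys) raw attention scores;
    input_mask: (batch, num_keys) with 1 for real tokens, 0 for padding."""
    mask = tf.cast(input_mask, scores.dtype)[:, tf.newaxis, :]  # broadcast over queries
    scores = scores + (1.0 - mask) * -1e9  # very large negative score on padded keys
    return tf.nn.softmax(scores, axis=-1)  # padded positions get ~0 weight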

2. Code Implementation

2.1 Implementation Based on TensorFlow

import tensorflow as tf
from tensorflow.keras import layers

from MyEncoderDecoder import *

class Seq2Seq(EncoderDecoder):
    """readme:
    v1.0  初代版本,实现了基础版本的Seq2Seq模型,并引入了注意力机制(选用点积打分函数)
    v1.1  更新掩码矩阵生成成方法,令其可以满足多头注意力机制和多头自注意力机制的任务需求
    """

    def __init__(self, vocab_size, embed_size, word2index, index2word,
                 num_hiddens, num_encds=1, num_decds=1,
                 enc_dropout=0, enc_rtseq=True, enc_rtstate=True,
                 dec_dropout=0, dec_rtseq=True, dec_rtstate=True,
                 **kwargs):
        super().__init__(**kwargs)
        self.word2index = word2index
        self.index2word = index2word
        self.Hidden = tf.keras.layers.GRU
        # model
        self.Embed = layers.Embedding(vocab_size, embed_size)
        self.Encoders = [
            self.Hidden(num_hiddens, dropout=enc_dropout, return_sequences=enc_rtseq, return_state=enc_rtstate) for i in
            range(num_encds)]
        self.Decoders = [
            self.Hidden(num_hiddens, dropout=dec_dropout, return_sequences=dec_rtseq, return_state=dec_rtstate) for i in
            range(num_decds)]        
        self.Output = layers.Dense(vocab_size, activation='softmax')

    def call(self, X, **kwargs):
        """
        Input:
            :X_len:   样本特征序列的实际长度
            :y:       采用序号编码的样本标签序列
            :y_len:   样本标签序列的实际长度
        Output:
            :y_hat:   a
            :outputs: a
        """
        X_len = kwargs['X_len']
        y = kwargs['y']
        enc_hiddens = []
        last_enc_hidden = None
        dec_hiddens = []
        last_dec_hidden = None
        outputs = []
        # Encoder
        enc_hiddens = self.Embed(X)
        for Encd in self.Encoders:
            enc_hiddens, last_enc_hidden = Encd(enc_hiddens)
        # Decoder
        y_hat = tf.constant([self.word2index['<bos>']] * y.shape[0], shape=[y.shape[0], 1], dtype=y.dtype)
        for _ in range(y.shape[1]):
            dec_hiddens = self.Embed(y_hat)
            for Decd in self.Decoders:
                dec_hiddens, last_dec_hidden = Decd(dec_hiddens, initial_state=last_enc_hidden)
            # Output
            output = self.Output(last_dec_hidden)  # final decoder hidden state as output feature
            token = tf.cast(tf.reshape(tf.argmax(output, axis=1), shape=[-1, 1]), dtype=y.dtype)
            y_hat = tf.concat([y_hat, token], axis=1)
            outputs.append(tf.reshape(output, shape=[output.shape[0], 1, output.shape[1]]))
        y_hat = y_hat[:, 1:]
        outputs = tf.concat(outputs, axis=1)
        return y_hat, outputs
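
A hypothetical toy invocation of the class above might look as follows. It assumes the EncoderDecoder base class behaves like a tf.keras.Model and that the vocabulary contains the special tokens the model expects; the toy vocabulary, shapes, and hyperparameters are illustrative only.

# toy vocabulary (illustrative only)
word2index = {'<pad>': 0, '<bos>': 1, '<eos>': 2, 'hello': 3, 'world': 4}
index2word = {i: w for w, i in word2index.items()}

model = Seq2Seq(vocab_size=len(word2index), embed_size=8,
                word2index=word2index, index2word=index2word, num_hiddens=16)

X = tf.constant([[3, 4, 2, 0], [4, 3, 2, 0]])  # padded input sequences
y = tf.constant([[4, 3, 2, 0], [3, 4, 2, 0]])  # padded label sequences
y_hat, outputs = model(X, X_len=tf.constant([3, 3]), y=y)
print(y_hat.shape, outputs.shape)              # (2, 4) and (2, 4, 5)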

2.2 Implementation Based on PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F

from .RNN import *
from .Attention import *
from .EncoderDecoder import *


class Seq2Seq(EncoderDecoder):
    """readme:
    v1.0  初代版本,实现了基础版本的Seq2Seq模型,并引入了注意力机制(选用点积打分函数)
    v1.1  更新掩码矩阵生成成方法,令其可以满足多头注意力机制和多头自注意力机制的任务需求
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim
                 , word2index, index2word, pad_idx=0
                 , hidden_cell=GRU_Cell
                 , encoder_layers=1, decoder_layers=1
                 , enc_bid=False, enc_pad_pos='post'
                 # , enc_dropout=0
                 , dec_bid=False, dec_pad_pos='post'
                 # , dec_dropout=0
                 , device="cpu", **kwargs):
        super().__init__()
        #
        self.HiddenCell = hidden_cell
        self.word2index = word2index
        self.index2word = index2word
        #
        self.Embed = nn.Embedding(
            vocab_size, embed_dim, padding_idx=pad_idx, dtype=torch.float32
            , device=device
        )
        self.Encoders = nn.Sequential(
            RNN_Layer(
                self.HiddenCell(embed_dim, hidden_dim, **kwargs, device=device)
                , bidirectional=enc_bid, pad_position=enc_pad_pos
            )
        )
        if encoder_layers > 1:
            for i in range(encoder_layers-1):
                self.Encoders.add_module(
                    str(i+1), RNN_Layer(
                        self.HiddenCell(hidden_dim, hidden_dim, **kwargs, device=device)
                        , bidirectional=enc_bid, pad_position=enc_pad_pos
                    )
                )
        self.Decoders = nn.Sequential(
            RNN_Layer(
                self.HiddenCell(embed_dim, hidden_dim, **kwargs, device=device)
                , bidirectional=dec_bid, pad_position=dec_pad_pos
            )
        )
        if decoder_layers > 1:
            for i in range(decoder_layers-1):
                self.Decoders.add_module(
                    str(i+1), RNN_Layer(
                        self.HiddenCell(hidden_dim, hidden_dim, **kwargs, device=device)
                        , bidirectional=dec_bid, pad_position=dec_pad_pos
                    )
                )
        self.Attention = Attention()
        self.Output = nn.Linear(hidden_dim + hidden_dim, vocab_size, device=device)

    def forward(self, inputs, input_mask, target_time_steps, target_type, **kwargs):
        """
        Input:
            :X_len:   样本特征序列的实际长度
            :y:       采用序号编码的样本标签序列
        Output:
            :y_hat:   a
            :outputs: a
        """
        enc_last_state = None
        dec_last_state = None
        outputs = []
        # Input
        enc_hiddens = self.Embed(inputs)
        # Encoder
        for Encd in self.Encoders:
            enc_hiddens, enc_last_state = Encd(enc_hiddens, input_mask)
        # Decoder
        y_hat = kwargs['bos']
        for _ in range(target_time_steps):
            dec_hiddens = self.Embed(y_hat)
            for Decd in self.Decoders:
                dec_hiddens, dec_last_state = Decd(dec_hiddens, initial_state=enc_last_state)
            # Attention
            attention = self.Attention(enc_hiddens, queries=dec_last_state[-1], input_mask=input_mask)
            output_features = torch.concat([dec_last_state[-1], attention], dim=1)
            # Output
            output = F.softmax(self.Output(output_features), dim=-1)
            token = torch.unsqueeze(
                torch.argmax(output, dim=1)
                , dim=-1
            ).type(target_type)
            y_hat = torch.concat([y_hat, token], dim=1)
            outputs.append(torch.unsqueeze(output, dim=1))
        y_hat = y_hat[:, 1:]
        outputs = torch.concat(outputs, dim=1)
        return y_hat, outputs
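
To train the PyTorch version with the masked-loss idea from Section 1, a sketch along the following lines could be applied to the per-step probability distributions returned by forward() (the function name and the clamping constant are assumptions for illustration):

import torch
import torch.nn.functional as F

def masked_nll_loss(outputs, targets, target_mask):
    """outputs: (batch, num_steps, vocab_size) probabilities from forward();
    targets: (batch, num_steps) token ids; target_mask: (batch, num_steps)
    with 1 for real tokens, 0 for padding."""
    target_mask = target_mask.float()
    log_probs = torch.log(outputs.clamp_min(1e-9))  # avoid log(0) on exact zeros
    nll = F.nll_loss(log_probs.transpose(1, 2), targets, reduction='none')  # (batch, num_steps)
    nll = nll * target_mask  # zero out padded label positions
    # average only over the valid (unpadded) steps of each sequence
    return nll.sum(dim=1) / target_mask.sum(dim=1).clamp_min(1.0)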
       

  1. Mini-batch Stochastic Gradient Descent ↩︎
