Implementing the Decoder Layer and Decoder in the Transformer

Decoder Layer

The decoder consists of multiple decoder layers. Each decoder layer extracts features from its input conditioned on the target, a process referred to as decoding. The decoder's job is to use the encoder's output together with the previously generated predictions to produce the feature representation for the next prediction step.
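
Concretely, each decoder layer applies three residual sub-layers in order: (1) a masked multi-head self-attention over the target-side input, where Q = K = V and target_mask blocks disallowed positions; (2) a multi-head cross-attention in which the query comes from the output of the first sub-layer while the keys and values come from the encoder output, masked by source_mask; and (3) a position-wise feed-forward network. Each sub-layer is wrapped in the residual-plus-normalization connection used throughout this series, as the code below shows.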

import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy

torch.set_printoptions(sci_mode=False)


class DecoderLayer(nn.Module):
    def __init__(self, embedding_dim, self_mha_attention, mha_attention, feed_forward, dropout):
        """初始化函数的参数有5个,分别是:
        - embedding_dim: 词嵌入的维度大小,也是解码器层的尺寸
        - self_mha_attention: 多头自注意力机制对象,Q=K=V
        - mha_attention: 多头注意力机制对象,Q!=K=V
        - feed_forward: 前馈全连接层对象
        - dropout: 置0比率
        """
        super(DecoderLayer, self).__init__()
        self.embedding_dim = embedding_dim
        self.self_mha_attention = self_mha_attention
        self.mha_attention = mha_attention
        self.feed_forward = feed_forward
        # Clone three sublayer-connection objects, one for each sub-layer
        self.sublayers = clones(SublayerConnectionWithNormalization(embedding_dim, dropout), 3)

    def forward(self, target_input, encoder_output, source_mask, target_mask):
        """forward函数的参数有4个:
        - target_input: 上一层的输入
        - encoder_output: 编码器层的输出
        - source_mask: 源数据掩码张量
        - target_mask: 目标数据掩码张量
        """
        target_input = self.sublayers[0](target_input, lambda x: self.self_mha_attention(x, x, x, target_mask))
        target_input = self.sublayers[1](target_input, lambda x: self.mha_attention(x, encoder_output, encoder_output, source_mask))
        return self.sublayers[2](target_input, self.feed_forward)


if __name__ == "__main__":
    # Hyperparameters
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Text embedding layer
    text_embeddings = TextEmbeddings(test_vocab_size, test_embedding_dim)
    test_input_tensor = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 1]])
    text_embeddings_output = text_embeddings(test_input_tensor)

    # Add positional encoding
    positional_encoding = PositionalEncoding(test_embedding_dim, dropout=0.1,
                                             max_sequence_length=test_max_len)
    positional_encoded_output = positional_encoding(text_embeddings_output)

    # Multi-head attention (the first is multi-head self-attention, the second is multi-head cross-attention)
    # In practice the source mask (source_mask) and the target mask (target_mask) usually differ; they are made identical here for convenience
    test_mask = torch.zeros(8, 4, 4)
    src_mask = tar_mask = test_mask
    self_mha = mha = MultiHeadedAttention(test_heads, test_embedding_dim, test_dropout)

    # Feed-forward network
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Build the encoder layer
    el = EncoderLayer(size, deepcopy(self_mha), deepcopy(ffl), test_dropout)

    # Build the encoder
    test_num_layers = 4   # number of encoder layers
    encoder = TransformerEncoder(el, test_num_layers)
    en_result = encoder(positional_encoded_output, test_mask)

    # Build the decoder layer (deep-copy the sub-modules so the two attention blocks do not share weights)
    dl = DecoderLayer(test_embedding_dim, deepcopy(self_mha), deepcopy(mha), deepcopy(ffl), test_dropout)
    dl_output = dl(positional_encoded_output, en_result, src_mask, tar_mask)
    print("解码器层输出:\n", dl_output)
    print("输出维度:", dl_output.shape)

Decoder layer output:
 tensor([[[-38.6796, -41.5885, -43.0422,  ...,   4.0466,   6.3220,  39.0872],
         [-15.4554,  -7.5811, -24.9081,  ...,   2.3801,  14.3226, -13.1563],
         [  0.5455,  12.1200,  25.7042,  ...,   5.7617,  -0.4356, -21.9242],
         [ -9.9778,  -2.3377, -41.4373,  ...,  -1.1107,  12.1996, -26.3887]],

        [[ -9.8397,   0.7901, -42.0429,  ...,  22.3260,  12.7510, -26.6649],
         [ 24.1362,  -0.9023,  25.7608,  ...,   5.7735, -20.5763, -23.1112],
         [-15.5547,  -0.4478, -25.5314,  ...,   2.6086,  14.0167, -13.0798],
         [-38.8152, -43.0345, -42.7138,  ...,   4.4873,   6.4754,  39.2665]]],
       grad_fn=<AddBackward0>)
Output shape: torch.Size([2, 4, 512])
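
The code above assumes several components defined in earlier sections of this series: TextEmbeddings, PositionalEncoding, MultiHeadedAttention, FeedForwardLayer, EncoderLayer, and TransformerEncoder, as well as the generic helpers clones, SublayerConnectionWithNormalization, and NormalizationLayer. For reference, here is a minimal sketch of those three helpers with the interface used above; the earlier sections may implement them somewhat differently (for example post-norm rather than pre-norm residual connections), which would change the printed values but not the output shapes.

import copy
import torch
import torch.nn as nn


def clones(module, n):
    """Produce n independent deep copies of a module in a ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])


class NormalizationLayer(nn.Module):
    """Layer normalization over the last (feature) dimension."""
    def __init__(self, feature_size, eps=1e-6):
        super(NormalizationLayer, self).__init__()
        self.scale = nn.Parameter(torch.ones(feature_size))
        self.shift = nn.Parameter(torch.zeros(feature_size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.scale * (x - mean) / (std + self.eps) + self.shift


class SublayerConnectionWithNormalization(nn.Module):
    """Residual connection around a sub-layer: x + Dropout(sublayer(Norm(x)))."""
    def __init__(self, feature_size, dropout=0.1):
        super(SublayerConnectionWithNormalization, self).__init__()
        self.norm = NormalizationLayer(feature_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))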

Decoder
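
The decoder stacks num_layers identical decoder layers and applies a final normalization layer to the output of the last one; each layer takes the previous layer's output together with the encoder output and the two masks.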

import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy

torch.set_printoptions(sci_mode=False)


class TransformerDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers):
        """初始化函数的参数有两个:
        - decoder_layer: 解码器层
        - num_layers: 解码器层的个数
        """
        super(TransformerDecoder, self).__init__()
        self.layers = clones(decoder_layer, num_layers)
        self.norm = NormalizationLayer(decoder_layer.embedding_dim)

    def forward(self, target_input, encoder_output, source_mask, target_mask):
        """forward函数中的参数有4个:
        - target_input: 目标数据的嵌入表示
        - encoder_output: 编码器层的输出
        - source_mask: 源数据掩码张量
        - target_mask: 目标数据掩码张量
        """
        for layer in self.layers:
            target_input = layer(target_input, encoder_output, source_mask, target_mask)
        return self.norm(target_input)


if __name__ == "__main__":
    # Hyperparameters
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Text embedding layer
    text_embeddings = TextEmbeddings(test_vocab_size, test_embedding_dim)
    test_input_tensor = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 1]])
    text_embeddings_output = text_embeddings(test_input_tensor)

    # Add positional encoding
    positional_encoding = PositionalEncoding(test_embedding_dim, dropout=0.1,
                                             max_sequence_length=test_max_len)
    positional_encoded_output = positional_encoding(text_embeddings_output)

    # Multi-head attention (the first is multi-head self-attention, the second is multi-head cross-attention)
    # In practice the source mask (source_mask) and the target mask (target_mask) usually differ; they are made identical here for convenience
    test_mask = torch.zeros(8, 4, 4)
    src_mask = tar_mask = test_mask
    self_mha = mha = MultiHeadedAttention(test_heads, test_embedding_dim, test_dropout)

    # Feed-forward network
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Encoder layer
    el = EncoderLayer(size, deepcopy(self_mha), deepcopy(ffl), test_dropout)

    # Encoder
    test_num_layers = 4  # number of encoder layers and decoder layers
    encoder = TransformerEncoder(el, test_num_layers)
    en_result = encoder(positional_encoded_output, test_mask)

    # Decoder layer
    dl = DecoderLayer(test_embedding_dim, deepcopy(self_mha), deepcopy(mha), deepcopy(ffl), test_dropout)

    # Decoder
    decoder = TransformerDecoder(dl, test_num_layers)
    de_result = decoder(positional_encoded_output, en_result, src_mask, tar_mask)
    print("解码器输出:\n", de_result)
    print("输出维度:", de_result.shape)

Decoder output:
 tensor([[[ 0.1566, -1.5408, -0.6142,  ..., -0.2367, -1.0645, -0.0048],
         [-1.8799, -0.0538, -0.3267,  ..., -0.8985, -0.4490,  0.3226],
         [-0.5468, -0.2171,  1.4543,  ..., -1.8045,  1.4159, -0.7944],
         [-0.6021, -0.0140, -2.6551,  ..., -0.7373,  1.5643,  0.2795]],

        [[-0.5871,  0.1262, -2.6315,  ..., -0.5955,  1.6100,  0.3421],
         [-0.5836, -0.1597, -0.0855,  ..., -1.7812,  1.4634, -0.7508],
         [-1.8776,  0.0244, -0.2953,  ..., -0.8799, -0.5054,  0.4284],
         [-0.1860, -1.4688, -0.7688,  ...,  2.3630, -1.1300,  0.0530]]],
       grad_fn=<AddBackward0>)
Output shape: torch.Size([2, 4, 512])
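
Unlike the raw decoder-layer output shown earlier, the values printed here are roughly zero-mean and unit-variance along the feature dimension, because TransformerDecoder applies a final NormalizationLayer after the last decoder layer. Note also that both test scripts use an all-zero placeholder mask for source_mask and target_mask; in a real decoder the target mask is normally a causal (subsequent-position) mask so that position i can only attend to positions <= i. A minimal sketch of building such a mask is shown below; the helper name subsequent_mask is illustrative and not part of the original code, and exactly how the mask is consumed depends on the MultiHeadedAttention implementation from the earlier section.

import torch


def subsequent_mask(size):
    """Return a (1, size, size) mask that is 1 on and below the diagonal and
    0 above it, so each position may attend only to itself and earlier ones."""
    upper = torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)
    return upper == 0


# Example: a causal mask for the 4-token target sequences used above.
# tar_mask = subsequent_mask(4)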

 

 
