Encoder Layer
The encoder extracts features from the input data by stacking multiple encoder layers; the layers operate in sequence, each feeding its output to the next, to complete the encoding process.
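The EncoderLayer below relies on two helpers from earlier sections: a clones function that deep-copies a module N times, and SublayerConnectionWithNormalization, which wraps a sublayer in a residual connection with layer normalization. As a refresher, here is a minimal sketch of what they might look like, following the pre-norm convention of The Annotated Transformer; the exact definitions given earlier may differ in detail.

import copy
import torch
import torch.nn as nn

def clones(module, num_copies):
    # Produce num_copies independently parameterized copies of a module.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num_copies)])

class NormalizationLayer(nn.Module):
    # Layer normalization with a learnable scale and shift.
    def __init__(self, feature_dim, eps=1e-6):
        super(NormalizationLayer, self).__init__()
        self.scale = nn.Parameter(torch.ones(feature_dim))
        self.shift = nn.Parameter(torch.zeros(feature_dim))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.scale * (x - mean) / (std + self.eps) + self.shift

class SublayerConnectionWithNormalization(nn.Module):
    # Residual connection around a normalized sublayer: x + Dropout(sublayer(Norm(x))).
    def __init__(self, embedding_dim, dropout):
        super(SublayerConnectionWithNormalization, self).__init__()
        self.norm = NormalizationLayer(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))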
import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.set_printoptions(sci_mode=False)

# Build the encoder layer.
# clones and SublayerConnectionWithNormalization are assumed to be defined
# in earlier sections (see the sketch above).
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim, self_attention, feed_forward, dropout):
        """Initialize the encoder layer.

        Args:
        - embedding_dim (int): Size of the embedding dimension, also used as the layer size.
        - self_attention (nn.Module): An instance of the multi-head self-attention mechanism.
        - feed_forward (nn.Module): An instance of the position-wise feed-forward layer.
        - dropout (float): Dropout rate used for regularization.
        """
        super(EncoderLayer, self).__init__()
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        # Use the clones helper to create two sublayer-connection structures.
        self.sublayers = clones(SublayerConnectionWithNormalization(embedding_dim, dropout), 2)
        self.embedding_dim = embedding_dim

    def forward(self, input_tensor, mask):
        """Forward pass of the encoder layer.

        Args:
        - input_tensor (torch.Tensor): Input tensor from the previous layer.
        - mask (torch.Tensor): Mask tensor for the attention mechanism.

        Returns:
        - torch.Tensor: Output tensor after processing by this layer.
        """
        # First sublayer connection: multi-head self-attention.
        input_tensor = self.sublayers[0](input_tensor, lambda x: self.self_attention(x, x, x, mask))
        # Second sublayer connection: position-wise feed-forward network.
        return self.sublayers[1](input_tensor, self.feed_forward)
if __name__ == "__main__":
    # Set the parameters.
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Text embedding layer (TextEmbeddings is defined in an earlier section).
    text_embeddings = TextEmbeddings(test_vocab_size, test_embedding_dim)
    test_input_tensor = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 1]])
    text_embeddings_output = text_embeddings(test_input_tensor)

    # Add positional encoding (PositionalEncoding is defined in an earlier section).
    positional_encoding = PositionalEncoding(test_embedding_dim, dropout=0.1,
                                             max_sequence_length=test_max_len)
    positional_encoded_output = positional_encoding(text_embeddings_output)

    # Multi-head attention; let the mask be an 8x4x4 zero tensor.
    test_mask = torch.zeros(8, 4, 4)
    self_mha = MultiHeadedAttention(test_heads, d_model)

    # Position-wise feed-forward layer.
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Build the encoder layer.
    el = EncoderLayer(size, self_mha, ffl, test_dropout)
    el_result = el(positional_encoded_output, test_mask)
    print("Encoder Layer Output:\n", el_result)
    print("Shape of Encoder Layer Output:", el_result.shape)
Encoder
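The encoder itself simply stacks N clones of a single encoder layer and applies one final normalization to the result.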
import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy

torch.set_printoptions(sci_mode=False)

# clones and NormalizationLayer are assumed to be defined in earlier sections.
class TransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers):
        """Initialize the Transformer encoder.

        Args:
        - encoder_layer (nn.Module): An instance of a single encoder layer.
        - num_layers (int): Number of encoder layers.
        """
        super(TransformerEncoder, self).__init__()
        # Clone the encoder layer num_layers times.
        self.encoder_layers = clones(encoder_layer, num_layers)
        # Initialize a final normalization layer.
        self.norm_layer = NormalizationLayer(encoder_layer.embedding_dim)

    def forward(self, input_tensor, mask):
        """Forward pass of the encoder.

        Args:
        - input_tensor (torch.Tensor): Output of the previous layer.
        - mask (torch.Tensor): Mask tensor.

        Returns:
        - torch.Tensor: Output of the encoder.
        """
        # Pass the input through each encoder layer in turn.
        for layer in self.encoder_layers:
            input_tensor = layer(input_tensor, mask)
        # Apply the final normalization layer.
        return self.norm_layer(input_tensor)
if __name__ == "__main__":
    # Set the parameters.
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Text embedding layer (TextEmbeddings is defined in an earlier section).
    text_embeddings = TextEmbeddings(test_vocab_size, test_embedding_dim)
    test_input_tensor = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 1]])
    text_embeddings_output = text_embeddings(test_input_tensor)

    # Add positional encoding (PositionalEncoding is defined in an earlier section).
    positional_encoding = PositionalEncoding(test_embedding_dim, dropout=0.1,
                                             max_sequence_length=test_max_len)
    positional_encoded_output = positional_encoding(text_embeddings_output)

    # Multi-head attention; let the mask be an 8x4x4 zero tensor.
    test_mask = torch.zeros(8, 4, 4)
    self_mha = MultiHeadedAttention(test_heads, d_model)

    # Position-wise feed-forward layer.
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Build the encoder layer.
    # The sublayer objects must be deep-copied so they are not shared.
    el = EncoderLayer(size, deepcopy(self_mha), deepcopy(ffl), test_dropout)

    # Build the encoder.
    test_num_layers = 4  # Number of encoder layers.
    encoder = TransformerEncoder(el, test_num_layers)
    en_result = encoder(positional_encoded_output, test_mask)
    print(f"Encoder Output:\n{en_result}")
    print(f"Shape of Encoder Output: {en_result.shape}")
Encoder Output:
tensor([[[-0.6755, -1.1236, -0.1131,  ..., -0.0815,  0.3898,  0.7918],
         [ 0.7663, -0.6230, -1.2164,  ...,  0.0106, -0.7013,  0.2994],
         [-0.2404,  1.0819,  1.5586,  ..., -0.1301,  1.7509,  0.1889],
         [-0.3832, -0.0331, -0.4277,  ...,  1.1210, -0.9636, -2.4338]],

        [[-0.3616, -1.8817, -0.4713,  ...,  1.2407, -0.8797, -2.4673],
         [-0.2006,  0.0076,  1.6668,  ..., -0.1171,  1.7598,  0.2362],
         [ 0.7781,  0.0376, -1.2090,  ..., -1.1441, -0.7755,  0.2996],
         [-0.6776, -1.2891, -0.2176,  ..., -0.0245,  0.4007,  0.7322]]],
       grad_fn=<AddBackward0>)
Shape of Encoder Output: torch.Size([2, 4, 512])
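Because clones deep-copies the encoder layer, each of the four stacked layers holds its own parameters, and the deepcopy calls when constructing el likewise keep self_mha and ffl reusable elsewhere (for example, in a decoder) without weight sharing. A quick illustrative check, continuing the script above:

    # The cloned layers hold distinct tensor objects, so training updates
    # one layer's weights without affecting the others.
    p_first = next(encoder.encoder_layers[0].parameters())
    p_second = next(encoder.encoder_layers[1].parameters())
    print(p_first is p_second)  # False: separate parameter tensors
    print(f"Total encoder parameters: {sum(p.numel() for p in encoder.parameters()):,}")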