Introduction to the Transformer
Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        """
        size: the embedding dimension d_model
        self_attn: multi-head self-attention object, where Q = K = V
        src_attn: multi-head attention object over the encoder output, where Q != K = V
        feed_forward: position-wise feed-forward network object
        dropout: dropout rate
        """
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # One decoder layer has three sublayer connections: masked self-attention,
        # encoder-decoder attention, and the feed-forward network.
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, source_mask, target_mask):
        # memory is the output of the encoder stack
        m = memory
        # 1. Masked multi-head self-attention over the target sequence
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, target_mask))
        # 2. Encoder-decoder attention: queries from the decoder, keys/values from memory
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, source_mask))
        # 3. Position-wise feed-forward network
        x = self.sublayer[2](x, self.feed_forward)
        return x
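DecoderLayer relies on the clones helper and the SublayerConnection residual wrapper defined in the encoder part of this series; for reference, a minimal sketch of the two, assuming the pre-norm residual form used in The Annotated Transformer:

import copy
import torch.nn as nn

def clones(module, N):
    # Return N independent deep copies of module, wrapped in a ModuleList.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)        # LayerNorm is also from the encoder part
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # Residual connection: x + dropout(sublayer(norm(x)))
        return x + self.dropout(sublayer(self.norm(x)))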
# Instantiate a single decoder layer and run a forward pass.
head = 8
size = 512
d_model = 512
d_ff = 2048
dropout = 0.2
self_attn = src_attn = MultiHeadedAttention(head, d_model, dropout)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
# out_pe (target embedding with positional encoding) and out_en (encoder output)
# come from the earlier positional-encoding and encoder examples.
x = out_pe
memory = out_en
mask = torch.zeros(8, 4, 4)
source_mask = target_mask = mask
dl = DecoderLayer(size, self_attn, src_attn, ff, dropout)
out_dl = dl(x, memory, source_mask, target_mask)
print(out_dl)
print(out_dl.shape)
tensor([[[-16.8412, 32.9487, 14.4696, ..., -22.2736, -0.1756, -0.5874],
[ 14.0010, 3.7020, 0.8705, ..., 16.0435, -0.7342, -27.5716],
[ -6.6832, -50.7786, 0.3991, ..., 35.4034, -34.8524, -10.2925],
[ 0.4025, -27.5343, 0.0897, ..., -4.3228, -52.6830, -26.3140]],
[[-36.1756, -15.5990, -6.8882, ..., -27.8795, -22.8598, 9.3413],
[-13.2061, -39.8130, 31.6627, ..., -17.9774, 3.5719, 0.4909],
[ -0.2878, -2.6208, -0.1447, ..., -6.8129, -6.6932, -56.2726],
[ -0.5078, 15.9361, 0.1945, ..., 5.3691, 1.5384, -0.4271]]],
grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
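Note that the all-zero mask above is only a placeholder for the demo. In actual decoding, target_mask is normally a subsequent (causal) mask so that position i can only attend to positions up to i; a small sketch, assuming the usual lower-triangular construction:

def subsequent_mask(size):
    # Mark the upper triangle (future positions) and invert it, so allowed
    # positions are True and future positions are False.
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return mask == 0

# Causal mask for a target sequence of length 4
print(subsequent_mask(4))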
Decoder
class Decoder(nn.Module):
    def __init__(self, layer, N):
        """
        layer: a DecoderLayer instance used as the prototype
        N: number of decoder layers to stack
        """
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        # Final layer normalization applied after the whole stack
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, source_mask, target_mask):
        # Run the target representation through all N decoder layers in turn
        for layer in self.layers:
            x = layer(x, memory, source_mask, target_mask)
        return self.norm(x)
# Build an N-layer decoder from a prototype DecoderLayer and run a forward pass.
size = 512
d_model = 512
head = 8
d_ff = 64
dropout = 0.2
c = copy.deepcopy
attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
# Each layer receives its own deep copy of the attention and feed-forward modules.
layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
N = 8
x = out_pe
memory = out_en
mask = torch.zeros(8, 4, 4)
source_mask = target_mask = mask
de = Decoder(layer, N)
out_de = de(x, memory, source_mask, target_mask)
print(out_de)
print(out_de.shape)
tensor([[[-0.3795, 1.4082, 0.6244, ..., -0.9057, -0.2942, -0.0643],
[ 0.9174, 0.6259, 0.1907, ..., 0.7216, -0.0158, -0.9123],
[-0.0695, -1.6939, 0.0654, ..., 1.3314, -1.5960, -0.5077],
[ 0.2412, -0.7457, 0.1086, ..., -0.2554, -2.3369, -1.0643]],
[[-1.5241, -0.2561, -0.2190, ..., -1.2681, -0.9377, 0.4868],
[-0.5661, -1.0945, 1.4063, ..., -0.7858, 0.1505, 0.1036],
[-0.1120, 0.0913, -0.0218, ..., -0.4020, -0.3154, -2.0716],
[-0.1565, 0.9397, -0.0604, ..., 0.1730, 0.0154, 0.1646]]],
grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
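The final self.norm in Decoder (and the norms inside each SublayerConnection) are instances of the LayerNorm class from the encoder part; a minimal sketch, assuming a learnable per-feature scale and shift:

class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # Learnable scale (a_2) and shift (b_2), one value per feature
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        # Normalize over the last (feature) dimension
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2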
Output Part
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        # Linear projection from d_model to the target vocabulary size
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # Log-softmax over the last dimension yields log-probabilities per token
        return F.log_softmax(self.linear(x), dim=-1)
# Project the decoder output to vocabulary-sized log-probabilities.
d_model = 512
vocab_size = 1000
x = out_de
gen = Generator(d_model, vocab_size)
out_gen = gen(x)
print(out_gen)
print(out_gen.shape)
tensor([[[-7.2107, -6.8135, -7.3745, ..., -6.8875, -7.7995, -6.8735],
[-7.2454, -6.7102, -6.5739, ..., -7.4684, -7.8100, -6.5677],
[-6.7945, -6.4811, -7.0429, ..., -6.1130, -7.7186, -8.6953],
[-6.4648, -8.1545, -6.5278, ..., -6.5628, -8.1096, -6.6423]],
[[-6.6874, -6.4551, -7.2109, ..., -7.2206, -7.1038, -6.5141],
[-7.0045, -7.2391, -6.9019, ..., -7.3764, -7.8051, -6.4741],
[-7.1714, -6.9198, -7.1506, ..., -5.9916, -6.7347, -7.1622],
[-7.9641, -7.2477, -7.8616, ..., -6.5662, -8.9810, -7.2106]]],
grad_fn=<LogSoftmaxBackward>)
torch.Size([2, 4, 1000])
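Because Generator returns log-probabilities, the most likely token at each position can be read off with argmax, and the same output plugs directly into nn.NLLLoss during training; a brief illustration (target_ids below is a hypothetical tensor of gold token indices):

# Greedy prediction: index of the largest log-probability at each position
pred_ids = out_gen.argmax(dim=-1)                    # shape [2, 4]
print(pred_ids)

# Training-time loss: NLLLoss expects log-probabilities and class indices
criterion = nn.NLLLoss()
target_ids = torch.randint(0, vocab_size, (2, 4))    # hypothetical gold tokens
loss = criterion(out_gen.view(-1, vocab_size), target_ids.view(-1))
print(loss)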