This article is mainly based on an open-source seq2seq tutorial on GitHub, with a few modifications of my own:
https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb
1. The Seq2Seq Model
The code for this is in my previous article. The basic idea is that the decoder starts from the encoder's hidden state and generates the output tokens one by one.
2. The Attention Mechanism
Although a Seq2Seq model can generate the output tokens from the context produced by the encoder, it cannot capture which parts of the source sequence are relevant to each output token. For example, when translating an English verb into Chinese, we should pay more attention to the verb in the English sentence. This is exactly what the attention mechanism provides: at every decoding step it tells us which parts of the source sentence to focus on.
There are many ways to compute the attention weights; here we cover one of the most basic: the concat (additive) score.
The idea is to score the encoder's final state against the encoder output at every time step, which yields a weight vector over the source positions. As shown in the figure, the encoder outputs are concatenated with s0 and passed through two linear transformations to obtain the weight vector. These weights are recomputed at every decoding step, so attention is considerably more expensive than a plain Seq2Seq model.
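In my notation (an informal summary of the code below, not from the original post): for the initial query s0 and the encoder output h_t at source position t,

e_t = v · tanh(W [s0 ; h_t])
a = softmax(e_1, ..., e_T)

where [ ; ] denotes concatenation, W corresponds to self.attn, and v corresponds to self.v in the Attention class below.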
3. Code
First, the Encoder. It is almost identical to the one in a plain Seq2Seq model, except that it additionally returns a transformed version of the last layer's final state.
import torch
import torch.nn as nn
import torch.nn.functional as F

# This example assumes the encoder and decoder share the same hidden_size.
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_size, n_layers, dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)  # input_dim equals the source-language vocabulary size
        self.gru = nn.GRU(emb_dim, hidden_size, n_layers, dropout=dropout, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size*2, hidden_size)

    def forward(self, input_seqs):
        # input_seqs [seq_len, batch]
        embedded = self.embedding(input_seqs)
        # embedded [seq_len, batch, emb_dim]
        outputs, hidden = self.gru(embedded)
        # outputs [seq_len, batch, hidden_size * 2]
        # hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # outputs are always from the last layer
        h_hat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        # h_hat [batch, hidden_size*2]
        # h_hat is the last layer's final state (forward and backward concatenated),
        # used later to compute the attention weights
        # a linear layer projects h_hat down to the size of a single hidden state
        h_hat = torch.tanh(self.fc(h_hat))
        # h_hat [batch, hidden_size]
        return outputs, hidden, h_hat
Let's feed some arbitrary data through the Encoder:
INPUT_DIM = 8
OUTPUT_DIM = 6
HIDDEN_SIZE = 10
N_LAYERS = 2
EMB_SIZE = 12
BATCH_SIZE = 2
SEQ_LEN = 5
encoder = Encoder(INPUT_DIM, EMB_SIZE, HIDDEN_SIZE, N_LAYERS)
x = torch.randint(1, 7, (SEQ_LEN, BATCH_SIZE))
en_out, hidden, h_hat = encoder(x)
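With these hyperparameters, the returned shapes should look like this (a sanity check I added, not in the original tutorial):

print(en_out.shape)  # expected: torch.Size([5, 2, 20])  -> [seq_len, batch, hidden_size*2]
print(hidden.shape)  # expected: torch.Size([4, 2, 10])  -> [n_layers*2, batch, hidden_size]
print(h_hat.shape)   # expected: torch.Size([2, 10])     -> [batch, hidden_size]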
Now for the key part: the Attention module. The code below involves quite a few concatenation operations.
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(3*hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, h_hat, encoder_outputs):
        # h_hat [batch, hidden_size]
        # encoder_outputs [seq_len, batch, hidden_size*2]
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]  # sequence length (number of time steps)
        # h_hat is concatenated with the encoder output of every time step,
        # so it has to be repeated src_len times
        h_hat = h_hat.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # h_hat [batch, seq_len, hidden_size]
        # encoder_outputs [batch, seq_len, hidden_size * 2]
        energy = torch.tanh(self.attn(torch.cat((h_hat, encoder_outputs), dim=2)))
        # energy [batch, seq_len, hidden_size]
        attention = self.v(energy).squeeze(2)
        # attention [batch, seq_len]
        # return the weight of each time step for each batch element
        return F.softmax(attention, dim=1)
We can test the output of the Attention module:
atten = Attention(HIDDEN_SIZE)
print(atten(h_hat, en_out))
"""
tensor([[0.1896, 0.1891, 0.2035, 0.2095, 0.2083],
[0.1951, 0.2025, 0.2011, 0.1986, 0.2026]], grad_fn=<SoftmaxBackward>)
"""
Finally, the Decoder:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_size, n_layers, attention, dropout=0.5, bidirectional=True):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(2*hidden_size + emb_dim, hidden_size, n_layers, dropout=dropout, bidirectional=bidirectional)
        # fc_out assumes a bidirectional decoder: output (2*hidden) + weighted (2*hidden) + embedded (emb_dim)
        self.fc_out = nn.Linear(4*hidden_size + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, h_hat, hidden, encoder_outputs):
        # input [batch]
        # h_hat [batch, hidden_size]
        # encoder_outputs [seq_len, batch, hidden_size * 2]
        # hidden [directions * n_layers, batch, hidden_size]
        input = input.unsqueeze(0)
        # input [1, batch]
        embedded = self.dropout(self.embedding(input))
        # embedded [1, batch, emb_dim]
        a = self.attention(h_hat, encoder_outputs)
        # a [batch, seq_len]
        a = a.unsqueeze(1)
        # a [batch, 1, seq_len]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs [batch, seq_len, hidden_size * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted [batch, 1, hidden_size * 2] -- the context vector for this step
        weighted = weighted.permute(1, 0, 2)
        # weighted [1, batch, hidden_size * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input [1, batch, 2*hidden_size + emb_dim]
        output, hidden = self.rnn(rnn_input, hidden)
        # output [1, batch, hidden_size*2]
        # hidden [n_layers * directions, batch, hidden_size]
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction [batch, output_dim]
        # hidden [n_layers * directions, batch, hidden_size]
        return prediction, hidden
Finally, let's test it:
decoder = Decoder(OUTPUT_DIM, EMB_SIZE, HIDDEN_SIZE, N_LAYERS, atten)
decoder_input = torch.tensor([1, 2])
pre, hidden = decoder(decoder_input, h_hat, hidden, en_out)
print(pre.shape)
print(hidden.shape)
"""
torch.Size([2, 6])
torch.Size([4, 2, 10])
"""