- 1、导入所需要的包
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import spacy
import numpy as np
import random
import math
import time
- 2、设置可重复性的随机种子
SEED = 1234
# Seed every RNG the pipeline touches (Python, NumPy, PyTorch CPU and CUDA)
# so runs are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Force deterministic cuDNN kernels (possibly slower, but reproducible).
torch.backends.cudnn.deterministic = True
一、数据预处理
- 3、加载德语和英语的spaCy模块
下载de_core_news_sm包和en_core_web_sm包
然后在conda中使用 pip install … 导入即可
# Load the spaCy pipelines, used here purely for tokenization.
# Requires: python -m spacy download de_core_news_sm / en_core_web_sm
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
- 4、创建tokenizers
def tokenize_de(text):
    """Tokenize a German string into a list of token strings."""
    return [token.text for token in spacy_de.tokenizer(text)]
def tokenize_en(text):
    """Tokenize an English string into a list of token strings."""
    return [token.text for token in spacy_en.tokenizer(text)]
- 5、
# torchtext Fields: tokenize with spaCy, wrap every sentence in
# <sos>/<eos> markers, and lowercase all tokens.
SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)
TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)
- 6、按规定格式加载数据data
# Multi30k German->English splits; exts order must match (SRC, TRG).
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),fields = (SRC, TRG))
- 7、创建词表vocabulary
# Build vocabularies from the training split only; tokens seen fewer
# than twice map to <unk>.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
- 8、定义device
# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- 9、创建迭代器iterators
BATCH_SIZE = 128
# BucketIterator batches examples of similar length to minimize padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)
二、搭建Seq2Seq模型
(一)Encoder模块
在Encoder模块,我们使用单层的GRU,采用双向RNN模式。因为使用的双向RNN,所以在每一层都会有两个RNN。前向的RNN从左到右遍历嵌入的句子中的单词(如下图中绿色方框显示),而后向的RNN从右到左遍历嵌入的句子的单词(如下图中深青色方框显示)。使用双向RNN需要在代码中将 bidirectional 设置为 True 即 bidirectional = True。
将嵌入后的句子传入RNN中后我们将得到:
其中前向第一个时间步输入的为《sos》,第二个时间步输入的为《guten》;后向的第一个时间步输入的为《eos》,第二个时间步输入的为《morgen》。
我们将input(embedded)传递给RNN,并且告诉RNN前向和后向的初始隐藏状态(分别为h->0和h<-0)初始化为全为零的张量tensor,然后我们将获得两个上下文向量,一个是来自前向RNN的在看到句子中最后一个单词产生的z->=h->T,另一个是来自后向RNN的在看到句子中第一个单词后产生的z<-=h<-T。
RNN执行完最后时间步后返回 outputs 和 hidden
outputs的大小为[src_len, batch_size, hid_dim * num_directions],其中上图中顶层(向上第三层)的第一个hid_dim元素(<-h0)是顶层RNN反向第一个时间步开始之前的隐藏状态,上图中向上第二层的最后一个hid_dim元素(z->)是正向最后一个时间步之后的隐藏状态,我们可以将它们看为前向和后向串联在一起的隐藏状态,比如:
并且我们可以将Encoder中所有RNN单元的隐藏状态(前向和后向串联在一起)表示为:
hidden的大小为[num_layers * num_directions, batch_size, hid_dim],其中[-2, :, :]为顶层正向RNN中最后一个时间步后输出的隐藏状态(即看到句子中的最后一个单词之后),[-1, :, :]为顶层反向RNN中最后一个时间步后输出的隐藏状态(即看到句子中第一个单词之后)。
由于Decoder不是双向的,因此只需要一个上下文向量z即可用作初始隐藏状态,并且我们在经过Encoder之后有两个(前向和后向的最后一个隐藏状态)。我们通过将这两个上下文向量连接在一起,然后通过线性层g,并使用tanh函数,这样来转换hid_dim的大小。
当我们希望我们的模型能回顾整个源句子时,我们通过返回 outputs 来回顾,outputs中堆叠了每个时间步的前向和后向的隐藏状态。我们也返回 hidden,这是我们用在Decoder中的隐藏状态。
- 10、Encoder模块
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Returns every per-timestep hidden state (both directions concatenated)
    plus an initial decoder hidden state built from the final forward and
    backward states passed through a linear layer and tanh.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        src = [src_len, batch_size]

        Returns:
            enc_output = [src_len, batch_size, enc_hid_dim * 2]
            s          = [batch_size, dec_hid_dim]
        """
        # nn.Embedding indexes elementwise, so it consumes the
        # [src_len, batch_size] tensor directly -- the original
        # transpose/embed/transpose round-trip was redundant.
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, emb_dim]
        # h_0 defaults to zeros when not supplied.
        # enc_hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]:
        #   enc_hidden[-2, :, :] is the final state of the forward RNN,
        #   enc_hidden[-1, :, :] is the final state of the backward RNN.
        enc_output, enc_hidden = self.rnn(embedded)
        # Initial decoder hidden state: the two final states concatenated,
        # fed through a linear layer and squashed by tanh.
        s = torch.tanh(self.fc(torch.cat((enc_hidden[-2, :, :], enc_hidden[-1, :, :]), dim=1)))
        return enc_output, s
(二)Attention模块
接下来是注意力层。这里将采用Decoder的第一个时间步之前的隐藏状态以及来自Encoder的所有堆叠的前向和后向的隐藏状态H。然后产生注意力向量
,向量的大小为源语句的长度,其中每个元素都在0和1之间并且它们的和为1。
直观地讲,该层采用我们到目前时间步为止已经解码的隐藏状态
和所有Encoder中已编码的H,来生成一个向量,这个向量表示我们为了预测下一个要解码的单词应该对源语句中哪些单词给予最多的注意力。
首先,我们计算前一个时间步Decoder的隐藏状态和Encoder产生的所有隐藏状态H之间的能量energy。由于Encoder产生的隐藏状态是由T个张量组成的序列,而Decoder在前一时间步产生的隐藏状态是单个张量,所以我们要做的第一件事就是将之前的Decoder隐藏状态重复T次,然后再计算它们之间的energy:做法是将它们连接在一起,通过一个线性层(attn)和一个tanh激活函数来产生。
这里可以看作是计算每个Encoder所有隐藏状态与前一个Decoder隐藏状态的“匹配”的程度。
我们目前对批次中的每个样本都有一个[dec_hid_dim, src_len]的张量,而我们希望每个样本得到的是大小为[src_len]的向量,因为注意力权重的个数应当等于源句子的长度。这是通过将energy乘以一个[1, dec_hid_dim]的张量v来实现的。
我们可以把v看作是所有Encoder隐藏状态的能量加权和的权重。这些权重告诉我们应该处理源语句序列中的每个标记的程度。v的参数是随机初始化的,但通过反向传播与模型的其余部分学习。注意v是如何不依赖时间的,并且相同的v用于Decoder的每个时间步长。我们实现v作为一个没有偏差的线性层。
最后,我们确保注意力向量符合所有元素都在0到1之间的约束条件,并通过softmax层使它的向量和为1。
如上便给了我们源语句注意力。
如下图所示,这是为了计算第一个注意力向量,其中。绿色/青色方块表示前向和后向的RNN隐藏状态,注意力的计算全部在粉色方块中完成。
- 11、Attention模块
class Attention(nn.Module):
    """Additive attention over the encoder's bidirectional hidden states."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim, bias=False)
        self.V = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        """
        s          = [batch_size, dec_hid_dim]       previous decoder hidden state
        enc_output = [src_len, batch_size, enc_hid_dim * 2]

        Returns: attention weights [batch_size, src_len], softmax-normalized
        so each row sums to 1.
        """
        src_len = enc_output.shape[0]
        # Repeat the decoder state once per source position so it can be
        # concatenated with every encoder hidden state.
        s = s.unsqueeze(1).repeat(1, src_len, 1)  # [batch, src_len, dec_hid_dim]
        enc_output = enc_output.transpose(0, 1)   # [batch, src_len, enc_hid_dim * 2]
        # energy = [batch, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((s, enc_output), dim=2)))
        # Collapse the dec_hid_dim axis with the learned vector v.
        attention = self.V(energy).squeeze(2)     # [batch, src_len]
        return F.softmax(attention, dim=1)
(三) Decoder模块
Decoder包含注意力层attention,attention层用Decoder上一时间步的隐藏状态st-1和Encoder中所有隐藏状态H,计算产生注意力向量at并返回。
然后我们使用这个注意力向量来创建一个加权的源向量wt,它是Encoder隐藏状态H的加权和,以at作为权重。
然后,将嵌入的字向量d(yt),加权源向量wt和先前的Decoder隐藏状态st-1都传递到Decoder的RNN,同时将d(yt)和wt串联在一起。
然后,我们将d(yt)、wt和st传递给线性层f,来预测目标句子中下一个单词。这是通过将它们连接在一起来实现的。
下图中显示了翻译示例中第一个单词的解码过程。
绿色块表示输出H的前向/后向Encoder的rnn,红色z块表示上下文向量,
蓝色块表示Decoder中RNN输出st,紫色块表示线性层,f输出,橙色块表示at和输出wt对H的加权和的计算。
- 12、Decoder模块
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder states."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        """
        dec_input  = [batch_size]                     current target token ids
        s          = [batch_size, dec_hid_dim]        previous decoder hidden state
        enc_output = [src_len, batch_size, enc_hid_dim * 2]

        Returns:
            pred       = [batch_size, output_dim]
            dec_hidden = [batch_size, dec_hid_dim]
        """
        # [1, batch_size, emb_dim] -- unsqueeze(0) replaces the original
        # unsqueeze(1) + transpose round-trip; the result is identical.
        embedded = self.dropout(self.embedding(dec_input)).unsqueeze(0)
        # a = [batch_size, 1, src_len]
        a = self.attention(s, enc_output).unsqueeze(1)
        enc_output = enc_output.transpose(0, 1)  # [batch, src_len, enc_hid_dim * 2]
        # Weighted sum of encoder states: c = [1, batch_size, enc_hid_dim * 2]
        c = torch.bmm(a, enc_output).transpose(0, 1)
        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        rnn_input = torch.cat((embedded, c), dim=2)
        # dec_output = [1, batch, dec_hid_dim]; dec_hidden = [1, batch, dec_hid_dim]
        dec_output, dec_hidden = self.rnn(rnn_input, s.unsqueeze(0))
        embedded = embedded.squeeze(0)     # [batch, emb_dim]
        dec_output = dec_output.squeeze(0) # [batch, dec_hid_dim]
        c = c.squeeze(0)                   # [batch, enc_hid_dim * 2]
        # Predict from the concatenation of RNN output, context and embedding.
        pred = self.fc_out(torch.cat((dec_output, c, embedded), dim=1))
        return pred, dec_hidden.squeeze(0)
(四)Seq2Seq模型
Seq2Seq模型不需要编码器RNN和解码器RNN有相同的隐藏维度,但是编码器必须是双向的。
简要回顾所有的步骤:
-
outputs 张量用来保留所有的预测
-
源序列X被送入编码器,并且产生最后一个时间步的隐藏状态z和所有时间步堆叠起来的隐藏状态H
-
初始解码器隐藏状态被设置为编码器最后一个时间步的隐藏状态
-
我们使用一批《sos》令牌作为第一输入y1
-
然后我们在一个循环中解码:
- 将输入令牌yt、先前隐藏状态st−1和编码器的所有outputs,即H 输入到解码器中
- 接收一个预测,y^t+1,和一个新的隐藏状态st
- 然后我们决定是否使用teacher force,设置下一个时间步的合适的输入
-
13、Seq2Seq模型
class Seq2Seq(nn.Module):
    """Wires an encoder and an attention-based decoder together."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_radio=0.5):
        """
        src = [src_len, batch_size]
        trg = [trg_len, batch_size]
        teacher_forcing_radio: probability of feeding the ground-truth token
        at each step (parameter keeps its original spelling of "ratio" for
        backward compatibility with existing callers).

        Returns predictions of shape [trg_len, batch_size, output_dim];
        slot 0 is left as zeros (it corresponds to <sos>).
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        vocab_size = self.decoder.output_dim
        # Buffer accumulating one prediction per target position.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)
        # enc_output: all encoder hidden states; s: initial decoder state.
        enc_output, s = self.encoder(src)
        # Decoding starts from the batch of <sos> tokens.
        dec_input = trg[0, :]
        for step in range(1, trg_len):
            dec_output, s = self.decoder(dec_input, s, enc_output)
            outputs[step] = dec_output
            # Coin flip: feed the gold token (teacher forcing) or the
            # model's own best guess into the next step.
            use_teacher = random.random() < teacher_forcing_radio
            best_guess = dec_output.argmax(1)
            dec_input = trg[step] if use_teacher else best_guess
        return outputs
三、训练及评估模型
我们初始化我们的参数,编码器,解码器和seq2seq模型(如果我们有GPU的话把它放在GPU上)。
- 14、初始化我们的参数,编码器,解码器和seq2seq模型
# Model hyperparameters.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
# NOTE(review): the encoder receives DEC_DROPOUT rather than ENC_DROPOUT.
# Both are 0.5 here so behavior is unchanged, but this looks like a
# copy-paste slip worth confirming.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)
# Ignore padding positions when computing the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
- 15、训练函数
def train(model, iterator, optimizer, criterion, clip=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: seq2seq model mapping (src, trg) -> [trg_len, batch, output_dim].
        iterator: yields batches exposing .src and .trg attributes.
        optimizer: optimizer stepping model.parameters().
        criterion: loss over flattened predictions/targets.
        clip: optional max gradient norm; when given, gradients are clipped
            with torch.nn.utils.clip_grad_norm_ before the optimizer step
            (standard practice for RNN seq2seq training). The default None
            preserves the original unclipped behavior.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src = batch.src
        trg = batch.trg                # trg = [trg_len, batch_size]
        pred = model(src, trg)         # pred = [trg_len, batch_size, pred_dim]
        pred_dim = pred.shape[-1]
        # Drop the <sos> position, then flatten for the criterion:
        # trg  -> [(trg_len - 1) * batch_size]
        # pred -> [(trg_len - 1) * batch_size, pred_dim]
        trg = trg[1:].view(-1)
        pred = pred[1:].view(-1, pred_dim)
        loss = criterion(pred, trg)
        optimizer.zero_grad()
        loss.backward()
        if clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
- 16、评估函数
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without weight updates.

    Teacher forcing is turned off so the model must feed its own
    predictions back in, matching inference conditions.
    """
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg      # trg: [trg_len, batch_size]
            output = model(src, trg, 0)          # 0 disables teacher forcing
            vocab = output.shape[-1]
            # Skip the <sos> slot and flatten for the criterion.
            flat_out = output[1:].view(-1, vocab)  # [(trg_len-1)*batch, vocab]
            flat_trg = trg[1:].view(-1)            # [(trg_len-1)*batch]
            total += criterion(flat_out, flat_trg).item()
    return total / len(iterator)
- 17、计时函数
def epoch_time(start_time, end_time):
    """Split the span between two time.time() stamps into (minutes, seconds)."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds
- 18、训练我们的模型,保存最好的验证损失的参数。
best_valid_loss = float('inf')
# Main loop: train, validate, and checkpoint the weights whenever the
# validation loss improves.
for epoch in range(10):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
- 训练结果
Epoch: 01 | Time: 2m 40s
Train Loss: 4.346 | Train PPL: 77.152
Val. Loss: 3.636 | Val. PPL: 37.924
Epoch: 02 | Time: 2m 41s
Train Loss: 3.114 | Train PPL: 22.511
Val. Loss: 3.281 | Val. PPL: 26.606
Epoch: 03 | Time: 2m 40s
Train Loss: 2.653 | Train PPL: 14.192
Val. Loss: 3.304 | Val. PPL: 27.221
Epoch: 04 | Time: 2m 41s
Train Loss: 2.355 | Train PPL: 10.536
Val. Loss: 3.269 | Val. PPL: 26.282
Epoch: 05 | Time: 2m 40s
Train Loss: 2.136 | Train PPL: 8.464
Val. Loss: 3.279 | Val. PPL: 26.548
Epoch: 06 | Time: 2m 42s
Train Loss: 1.995 | Train PPL: 7.350
Val. Loss: 3.292 | Val. PPL: 26.895
Epoch: 07 | Time: 2m 40s
Train Loss: 1.881 | Train PPL: 6.563
Val. Loss: 3.261 | Val. PPL: 26.070
Epoch: 08 | Time: 2m 40s
Train Loss: 1.780 | Train PPL: 5.928
Val. Loss: 3.332 | Val. PPL: 27.984
Epoch: 09 | Time: 2m 41s
Train Loss: 1.693 | Train PPL: 5.434
Val. Loss: 3.360 | Val. PPL: 28.796
Epoch: 10 | Time: 2m 40s
Train Loss: 1.656 | Train PPL: 5.236
Val. Loss: 3.441 | Val. PPL: 31.209
- 19、最后,我们使用这些“最佳”参数在测试集中测试模型。
# Restore the best checkpoint and report test-set loss/perplexity.
model.load_state_dict(torch.load('tut3-model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
| Test Loss: 3.337 | Test PPL: 28.140 |
在下一个笔记中,我们将使用相同的架构,但使用一些技巧,适用于所有RNN架构-packed padded sequences和masking,我们还将实现允许我们查看当解码输出时RNN正在关注输入中的哪些单词的代码。
四、完整代码
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import spacy
import numpy as np
import random
import math
import time
SEED = 1234
# Seed every RNG the pipeline touches (Python, NumPy, PyTorch CPU and CUDA)
# so runs are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Force deterministic cuDNN kernels (possibly slower, but reproducible).
torch.backends.cudnn.deterministic = True
# Load the spaCy pipelines, used here purely for tokenization.
# Requires: python -m spacy download de_core_news_sm / en_core_web_sm
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens).

    NOTE(review): the previous docstring claimed the token list is
    reversed; the code does not reverse it.
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]
def tokenize_en(text):
    """Tokenize an English string into a list of token strings."""
    return [token.text for token in spacy_en.tokenizer(text)]
# torchtext Fields: tokenize with spaCy, wrap every sentence in
# <sos>/<eos> markers, and lowercase all tokens.
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
# Multi30k German->English splits; exts order must match (SRC, TRG).
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
# Build vocabularies from the training split only; tokens seen fewer
# than twice map to <unk>.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
# BucketIterator batches examples of similar length to minimize padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Returns every per-timestep hidden state (both directions concatenated)
    plus an initial decoder hidden state built from the final forward and
    backward states passed through a linear layer and tanh.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        src = [src_len, batch_size]

        Returns:
            enc_output = [src_len, batch_size, enc_hid_dim * 2]
            s          = [batch_size, dec_hid_dim]
        """
        # nn.Embedding indexes elementwise, so it consumes the
        # [src_len, batch_size] tensor directly -- the original
        # transpose/embed/transpose round-trip was redundant.
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, emb_dim]
        # h_0 defaults to zeros when not supplied.
        # enc_hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]:
        #   enc_hidden[-2, :, :] is the final state of the forward RNN,
        #   enc_hidden[-1, :, :] is the final state of the backward RNN.
        enc_output, enc_hidden = self.rnn(embedded)
        # Initial decoder hidden state: the two final states concatenated,
        # fed through a linear layer and squashed by tanh.
        s = torch.tanh(self.fc(torch.cat((enc_hidden[-2, :, :], enc_hidden[-1, :, :]), dim=1)))
        return enc_output, s
class Attention(nn.Module):
    """Additive attention over the encoder's bidirectional hidden states."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim, bias=False)
        self.V = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        """
        s          = [batch_size, dec_hid_dim]       previous decoder hidden state
        enc_output = [src_len, batch_size, enc_hid_dim * 2]

        Returns: attention weights [batch_size, src_len], softmax-normalized
        so each row sums to 1.
        """
        src_len = enc_output.shape[0]
        # Repeat the decoder state once per source position so it can be
        # concatenated with every encoder hidden state.
        s = s.unsqueeze(1).repeat(1, src_len, 1)  # [batch, src_len, dec_hid_dim]
        enc_output = enc_output.transpose(0, 1)   # [batch, src_len, enc_hid_dim * 2]
        # energy = [batch, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((s, enc_output), dim=2)))
        # Collapse the dec_hid_dim axis with the learned vector v.
        attention = self.V(energy).squeeze(2)     # [batch, src_len]
        return F.softmax(attention, dim=1)
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder states."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        """
        dec_input  = [batch_size]                     current target token ids
        s          = [batch_size, dec_hid_dim]        previous decoder hidden state
        enc_output = [src_len, batch_size, enc_hid_dim * 2]

        Returns:
            pred       = [batch_size, output_dim]
            dec_hidden = [batch_size, dec_hid_dim]
        """
        # [1, batch_size, emb_dim] -- unsqueeze(0) replaces the original
        # unsqueeze(1) + transpose round-trip; the result is identical.
        embedded = self.dropout(self.embedding(dec_input)).unsqueeze(0)
        # a = [batch_size, 1, src_len]
        a = self.attention(s, enc_output).unsqueeze(1)
        enc_output = enc_output.transpose(0, 1)  # [batch, src_len, enc_hid_dim * 2]
        # Weighted sum of encoder states: c = [1, batch_size, enc_hid_dim * 2]
        c = torch.bmm(a, enc_output).transpose(0, 1)
        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        rnn_input = torch.cat((embedded, c), dim=2)
        # dec_output = [1, batch, dec_hid_dim]; dec_hidden = [1, batch, dec_hid_dim]
        dec_output, dec_hidden = self.rnn(rnn_input, s.unsqueeze(0))
        embedded = embedded.squeeze(0)     # [batch, emb_dim]
        dec_output = dec_output.squeeze(0) # [batch, dec_hid_dim]
        c = c.squeeze(0)                   # [batch, enc_hid_dim * 2]
        # Predict from the concatenation of RNN output, context and embedding.
        pred = self.fc_out(torch.cat((dec_output, c, embedded), dim=1))
        return pred, dec_hidden.squeeze(0)
class Seq2Seq(nn.Module):
    """Wires an encoder and an attention-based decoder together."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_radio=0.5):
        """
        src = [src_len, batch_size]
        trg = [trg_len, batch_size]
        teacher_forcing_radio: probability of feeding the ground-truth token
        at each step (parameter keeps its original spelling of "ratio" for
        backward compatibility with existing callers).

        Returns predictions of shape [trg_len, batch_size, output_dim];
        slot 0 is left as zeros (it corresponds to <sos>).
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        vocab_size = self.decoder.output_dim
        # Buffer accumulating one prediction per target position.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)
        # enc_output: all encoder hidden states; s: initial decoder state.
        enc_output, s = self.encoder(src)
        # Decoding starts from the batch of <sos> tokens.
        dec_input = trg[0, :]
        for step in range(1, trg_len):
            dec_output, s = self.decoder(dec_input, s, enc_output)
            outputs[step] = dec_output
            # Coin flip: feed the gold token (teacher forcing) or the
            # model's own best guess into the next step.
            use_teacher = random.random() < teacher_forcing_radio
            best_guess = dec_output.argmax(1)
            dec_input = trg[step] if use_teacher else best_guess
        return outputs
# Train the Seq2Seq Model
# Model hyperparameters.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
# NOTE(review): the encoder receives DEC_DROPOUT rather than ENC_DROPOUT.
# Both are 0.5 here so behavior is unchanged, but this looks like a
# copy-paste slip worth confirming.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)
# Ignore padding positions when computing the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
def train(model, iterator, optimizer, criterion, clip=None):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: seq2seq model mapping (src, trg) -> [trg_len, batch, output_dim].
        iterator: yields batches exposing .src and .trg attributes.
        optimizer: optimizer stepping model.parameters().
        criterion: loss over flattened predictions/targets.
        clip: optional max gradient norm; when given, gradients are clipped
            with torch.nn.utils.clip_grad_norm_ before the optimizer step
            (standard practice for RNN seq2seq training). The default None
            preserves the original unclipped behavior.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src = batch.src
        trg = batch.trg                # trg = [trg_len, batch_size]
        pred = model(src, trg)         # pred = [trg_len, batch_size, pred_dim]
        pred_dim = pred.shape[-1]
        # Drop the <sos> position, then flatten for the criterion:
        # trg  -> [(trg_len - 1) * batch_size]
        # pred -> [(trg_len - 1) * batch_size, pred_dim]
        trg = trg[1:].view(-1)
        pred = pred[1:].view(-1, pred_dim)
        loss = criterion(pred, trg)
        optimizer.zero_grad()
        loss.backward()
        if clip is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without weight updates.

    Teacher forcing is turned off so the model must feed its own
    predictions back in, matching inference conditions.
    """
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg      # trg: [trg_len, batch_size]
            output = model(src, trg, 0)          # 0 disables teacher forcing
            vocab = output.shape[-1]
            # Skip the <sos> slot and flatten for the criterion.
            flat_out = output[1:].view(-1, vocab)  # [(trg_len-1)*batch, vocab]
            flat_trg = trg[1:].view(-1)            # [(trg_len-1)*batch]
            total += criterion(flat_out, flat_trg).item()
    return total / len(iterator)
def epoch_time(start_time, end_time):
    """Split the span between two time.time() stamps into (minutes, seconds)."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds
best_valid_loss = float('inf')
# Main loop: train, validate, and checkpoint the weights whenever the
# validation loss improves.
for epoch in range(10):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
# Restore the best checkpoint and report test-set loss/perplexity.
model.load_state_dict(torch.load('tut3-model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')