Hung-Yi Lee homework[8]: Sequence to Sequence
一、Sequence to Sequence介绍
常见的seq2seq模型都是encoder-decoder模型,主要由Encoder和Decoder两部分组成,这两部分大多数情况下均由RNN来实现,作用是解决输入和输出的长度不一致的问题。Encoder是将一连串的输入编码为单个向量,Decoder是将Encoder输出的单个向量逐步解码,一次输出一个结果,每次的输出会影响到下一次的输出,一般会在Decoder的开头加入“<BOS>”来表示开始解码,在Decoder的结尾加入“<EOS>”来表示输出结束。
二、作业内容介绍
输入一句英文,输出一句中文翻译。
三、作业实现
3.1 总体代码
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data
import torch.utils.data.sampler as sampler
import torchvision
from torchvision import datasets, transforms
import numpy as np
import sys
import os
import random
import json
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import matplotlib.pyplot as plt
# Pad variable-length token sequences to one fixed length for batching.
class LabelTransform(object):
    """Right-pad an integer-token sequence to a fixed length with <PAD>."""

    def __init__(self, size, pad):
        # size: target sequence length; pad: index of the <PAD> token
        self.size = size
        self.pad = pad

    def __call__(self, label):
        pad_width = (0, self.size - label.shape[0])
        return np.pad(label, pad_width, mode='constant', constant_values=self.pad)
# Dataset definition
class EN2CNDataset(data.Dataset):
    """English->Chinese translation pairs.

    Reads `{set_name}.txt` (one "english<TAB>chinese" line per pair) plus the
    pre-built word<->int JSON dictionaries under `root`, and yields
    (en, cn) LongTensor pairs padded to `max_output_len`.
    """

    def __init__(self, root, max_output_len, set_name):
        self.root = root
        self.word2int_cn, self.int2word_cn = self.get_dictionary('cn')
        self.word2int_en, self.int2word_en = self.get_dictionary('en')
        self.data = []
        with open(os.path.join(self.root, f'{set_name}.txt'), "r", encoding='UTF-8') as f:
            for line in f:
                self.data.append(line)
        print (f'{set_name} dataset size: {len(self.data)}')
        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)
        # NOTE(review): pads both languages with the *English* <PAD> index —
        # assumes the Chinese dictionary uses the same index for <PAD>; confirm.
        self.transform = LabelTransform(max_output_len, self.word2int_en['<PAD>'])

    def get_dictionary(self, language):
        # Load the word->int and int->word JSON dictionaries for `language`.
        with open(os.path.join(self.root, f'word2int_{language}.json'), "r", encoding='UTF-8') as f:
            word2int = json.load(f)
        with open(os.path.join(self.root, f'int2word_{language}.json'), "r", encoding='UTF-8') as f:
            int2word = json.load(f)
        return word2int, int2word

    def __len__(self):
        return len(self.data)

    def __getitem__(self, Index):
        # Split the line into its English and Chinese halves (TAB-separated).
        sentences = self.data[Index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        #print (sentences)
        assert len(sentences) == 2
        # Special-token indices, taken from the English dictionary.
        # NOTE(review): also used for the Chinese side below — assumes both
        # dictionaries assign the same indices to <BOS>/<EOS>/<UNK>; confirm.
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']
        # Prepend <BOS>, append <EOS>; out-of-vocabulary words become <UNK>.
        en, cn = [BOS], [BOS]
        # Split the English sentence into subwords and map them to integers.
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        #print (f'en: {sentence}')
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)
        # Split the Chinese sentence into words and map them to integers.
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        #print (f'cn: {sentence}')
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)
        en, cn = np.asarray(en), np.asarray(cn)
        # Pad both sequences to the same fixed length with <PAD>.
        en, cn = self.transform(en), self.transform(cn)
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)
        return en, cn
class Attention(nn.Module):
    """Placeholder attention module (not yet implemented).

    `forward` always returns None; the Decoder also ignores the result, so
    enabling attention currently has no effect on the model's output.
    """

    def __init__(self, hid_dim):
        super(Attention, self).__init__()
        self.hid_dim = hid_dim

    def forward(self, encoder_outputs, decoder_hidden):
        return None
# Model
class Encoder(nn.Module):
    """Bidirectional GRU encoder over embedded English token ids.

    forward(input):
        input:   LongTensor [batch, seq_len] of token indices
        returns: (outputs, hidden)
          outputs: [batch, seq_len, hid_dim * 2] — top-layer states, both directions
          hidden:  [n_layers * 2, batch, hid_dim] — final state per layer/direction
    """

    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout,
                          batch_first=True, bidirectional=True)

    def forward(self, input):
        # Embed, apply dropout, then run the stacked bidirectional GRU.
        embedded = self.dropout(self.embedding(input))
        return self.rnn(embedded)
class Decoder(nn.Module):
    """Unidirectional GRU decoder producing per-step vocabulary logits.

    The encoder is bidirectional, so this decoder's hidden size is
    `hid_dim * 2` to match the concatenated forward/backward encoder state.

    forward(input, hidden, encoder_outputs):
        input:   LongTensor [batch] — current target-token indices
        hidden:  [n_layers, batch, hid_dim * 2]
        returns: (prediction [batch, cn_vocab_size], hidden)
    """

    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2
        self.n_layers = n_layers
        # BUGFIX: was `nn.Embedding(cn_vocab_size, config.emb_dim)` — depended
        # on a module-level `config` global instead of the constructor's
        # `emb_dim` parameter. Use the parameter.
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.isatt = isatt
        self.attention = Attention(hid_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers,
                          dropout=dropout, batch_first=True)
        # Expand the RNN output through two hidden layers into vocab logits.
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)                      # [batch, 1]
        embedded = self.dropout(self.embedding(input))  # [batch, 1, emb_dim]
        if self.isatt:
            # Attention is a stub (returns None) and its result is unused,
            # so enabling isatt currently has no effect.
            attn = self.attention(encoder_outputs, hidden)
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch, 1, hid_dim*2] -> logits over the Chinese vocabulary.
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)      # [batch, cn_vocab_size]
        return prediction, hidden
class Seq2Seq(nn.Module):
    """Wires an Encoder and a Decoder together for training and inference."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def _merge_directions(self, hidden, batch_size):
        # Encoder hidden: [n_layers * 2, batch, hid_dim]. Concatenate each
        # layer's forward/backward final states -> [n_layers, batch, hid_dim*2]
        # so the state can seed the unidirectional decoder.
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        return torch.cat((hidden[:, 0, :, :], hidden[:, 1, :, :]), dim=2)

    def forward(self, input, target, teacher_forcing_ratio):
        """Decode the whole target sequence with scheduled teacher forcing.

        input:  [batch, input len]  source token indices
        target: [batch, target len] reference token indices (starts with <BOS>)
        teacher_forcing_ratio: probability of feeding the ground-truth token.
        Returns (outputs [batch, target len, vocab], preds [batch, target len - 1]).
        """
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # outputs[:, 0] is left as zeros: step 0 is <BOS> and is skipped
        # when the loss is computed.
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        # encoder_outputs is only needed by attention.
        hidden = self._merge_directions(hidden, batch_size)
        input = target[:, 0]  # start decoding from <BOS>
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            # With probability teacher_forcing_ratio feed the ground-truth
            # token next; otherwise feed the model's own best guess.
            teacher_force = random.random() <= teacher_forcing_ratio
            top1 = output.argmax(1)
            # BUGFIX(minor): dropped the redundant `and t < target_len` —
            # t ranges over 1..target_len-1, so it was always true.
            input = target[:, t] if teacher_force else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        """Greedy decoding; only target[:, 0] (<BOS>) is consumed.

        Decodes for input_len steps. Returns (outputs, preds) shaped like
        `forward` but with input_len in place of target_len.
        """
        batch_size = input.shape[0]
        input_len = input.shape[1]  # maximum number of decoding steps
        vocab_size = self.decoder.cn_vocab_size
        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        hidden = self._merge_directions(hidden, batch_size)
        input = target[:, 0]
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            # Always feed back the argmax prediction (greedy decoding).
            top1 = output.argmax(1)
            input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
def save_model(model, optimizer, store_model_path, step):
    """Write the model's state dict to `<store_model_path>/model_<step>.ckpt`.

    `optimizer` is accepted for interface symmetry with load_model but its
    state is not saved.
    """
    ckpt_path = f'{store_model_path}/model_{step}.ckpt'
    torch.save(model.state_dict(), ckpt_path)
def load_model(model, load_model_path):
    """Load weights from `<load_model_path>.ckpt` into `model` and return it."""
    print(f'Load model from {load_model_path}')
    state = torch.load(f'{load_model_path}.ckpt')
    model.load_state_dict(state)
    return model
def build_model(config, en_vocab_size, cn_vocab_size):
    """Construct the Seq2Seq model and its Adam optimizer from `config`.

    NOTE(review): relies on the module-level `device`. The optimizer state
    is always fresh, even when weights are loaded from a checkpoint.
    """
    encoder = Encoder(en_vocab_size, config.emb_dim, config.hid_dim,
                      config.n_layers, config.dropout)
    decoder = Decoder(cn_vocab_size, config.emb_dim, config.hid_dim,
                      config.n_layers, config.dropout, config.attention)
    model = Seq2Seq(encoder, decoder, device)
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(optimizer)
    if config.load_model:
        model = load_model(model, config.load_model_path)
    return model.to(device), optimizer
# Convert token indices back into sentences.
def tokens2sentence(outputs, int2word):
    """Map batches of token indices to word lists, truncating at <EOS>.

    `int2word` keys are string-encoded integers (as loaded from JSON).
    """
    sentences = []
    for tokens in outputs:
        words = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            words.append(word)
        sentences.append(words)
    return sentences
# Compute BLEU.
def computebleu(sentences, targets):
    """Sum of unigram BLEU scores over aligned (prediction, reference) pairs."""
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        # Split multi-character CJK tokens into single characters for
        # character-level BLEU; keep <UNK>, digit strings, and tokens whose
        # first character is single-byte UTF-8 (ASCII) intact.
        result = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                result.append(token)
            else:
                result.extend(token)
        return result

    score = 0
    for sentence, target in zip(sentences, targets):
        hypothesis = cut_token(sentence)
        reference = cut_token(target)
        score += sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0))
    return score
def infinite_iter(data_loader):
    """Yield batches forever, restarting the loader whenever it is exhausted."""
    it = iter(data_loader)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(data_loader)
def schedule_sampling():
    """Teacher-forcing ratio per training step.

    Fixed at 1 (always teacher-force); a decaying schedule could go here.
    """
    return 1
# Training: run `summary_steps` optimization steps on batches from train_iter.
def train(model, optimizer, train_iter, loss_function, total_steps, summary_steps, train_dataset):
    """Train for `summary_steps` batches drawn from the infinite `train_iter`.

    Relies on the module-level `device`. `train_dataset` is currently unused.
    Returns (model, optimizer, losses), where `losses` holds the training
    loss averaged over each window of 5 steps.
    """
    model.train()
    model.zero_grad()
    losses = []
    loss_sum = 0.0
    for step in range(summary_steps):
        sources, targets = next(train_iter)
        sources, targets = sources.to(device), targets.to(device)
        outputs, preds = model(sources, targets, schedule_sampling())
        # Position 0 of targets is <BOS>, so it is skipped in the loss.
        outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
        targets = targets[:, 1:].reshape(-1)
        loss = loss_function(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        # Clip gradient norm to 1 to stabilize RNN training.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        loss_sum += loss.item()
        # Report and record the averaged loss every 5 steps.
        if (step + 1) % 5 == 0:
            loss_sum = loss_sum / 5
            print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f} ".format(total_steps + step + 1, loss_sum,
                                                                              np.exp(loss_sum)), end=" ")
            losses.append(loss_sum)
            loss_sum = 0.0
    return model, optimizer, losses
def test(model, dataloader, loss_function):
    """Evaluate `model` on `dataloader`.

    Relies on the module-level `device`. Returns (average loss per batch,
    average BLEU per sentence, result), where `result` is a list of
    (source, prediction, target) word-list triples.
    """
    model.eval()
    loss_sum, bleu_score= 0.0, 0.0
    n = 0
    result = []
    for sources, targets in dataloader:
        sources, targets = sources.to(device), targets.to(device)
        batch_size = sources.size(0)
        outputs, preds = model.inference(sources, targets)
        # Position 0 of targets is <BOS>, so it is skipped in the loss.
        outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
        targets = targets[:, 1:].reshape(-1)
        loss = loss_function(outputs, targets)
        loss_sum += loss.item()
        # Un-flatten targets back to per-sentence rows, then convert the
        # predictions/sources/targets from indices to words.
        targets = targets.view(sources.size(0), -1)
        preds = tokens2sentence(preds, dataloader.dataset.int2word_cn)
        sources = tokens2sentence(sources, dataloader.dataset.int2word_en)
        targets = tokens2sentence(targets, dataloader.dataset.int2word_cn)
        for source, pred, target in zip(sources, preds, targets):
            result.append((source, pred, target))
        # Accumulate BLEU over this batch.
        bleu_score += computebleu(preds, targets)
        n += batch_size
    return loss_sum / len(dataloader), bleu_score / n, result
# Training pipeline: alternate summary_steps of training with a validation pass.
def train_process(config):
    """Run the full training loop described by `config`.

    Every `summary_steps` batches: validate, record metrics, and (on
    `store_steps` boundaries or at the end) checkpoint the model and dump
    the validation (source, prediction, target) triples to a text file.
    Returns (train_losses, val_losses, bleu_scores) for plotting.
    """
    train_dataset = EN2CNDataset(config.data_path, config.max_output_len, 'training')
    train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = infinite_iter(train_loader)
    val_dataset = EN2CNDataset(config.data_path, config.max_output_len, 'validation')
    val_loader = data.DataLoader(val_dataset, batch_size=1)
    model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
    # ignore_index=0 assumes <PAD> has index 0 in the dictionaries — TODO confirm.
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    train_losses, val_losses, bleu_scores = [], [], []
    total_steps = 0
    while total_steps < config.num_steps:
        model, optimizer, loss = train(model, optimizer, train_iter, loss_function,
                                       total_steps, config.summary_steps, train_dataset)
        train_losses += loss
        val_loss, bleu_score, result = test(model, val_loader, loss_function)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)
        total_steps += config.summary_steps
        # BUGFIX: log message said "blue score"; corrected to "bleu score".
        print("\r", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f} ".format(
            total_steps, val_loss, np.exp(val_loss), bleu_score))
        if total_steps % config.store_steps == 0 or total_steps >= config.num_steps:
            save_model(model, optimizer, config.store_model_path, total_steps)
            # Dump the validation triples for this checkpoint.
            with open(f'{config.store_model_path}/output_{total_steps}.txt', 'w') as f:
                for line in result:
                    print(line, file=f)
    return train_losses, val_losses, bleu_scores
def test_process(config):
    """Evaluate on the 'testing' split; writes triples to ./test_output.txt.

    NOTE(review): build_model only loads checkpoint weights when
    config.load_model is True — otherwise this evaluates a freshly
    initialized (untrained) model; verify intent before trusting scores.
    Returns (test_loss, bleu_score).
    """
    test_dataset = EN2CNDataset(config.data_path, config.max_output_len, 'testing')
    test_loader = data.DataLoader(test_dataset, batch_size=1)
    model, optimizer = build_model(config, test_dataset.en_vocab_size, test_dataset.cn_vocab_size)
    print ("Finish build model")
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    model.eval()
    test_loss, bleu_score, result = test(model, test_loader, loss_function)
    # Write (source, prediction, target) triples, one per line.
    with open(f'./test_output.txt', 'w') as f:
        for line in result:
            print (line, file=f)
    return test_loss, bleu_score
class configurations(object):
    """Hyper-parameters and filesystem paths for training/testing."""

    def __init__(self):
        self.batch_size = 60
        self.emb_dim = 256
        self.hid_dim = 512
        self.n_layers = 3
        self.dropout = 0.5
        self.learning_rate = 0.00005
        self.max_output_len = 50  # maximum length of the output sentence
        self.num_steps = 12000  # total number of training steps
        self.store_steps = 300  # save the model every this many steps
        self.summary_steps = 300  # validate (check overfitting) every this many steps
        self.load_model = False  # whether to load a saved model
        self.store_model_path = "./ckpt"
        self.load_model_path = None
        self.data_path = "./cmn-eng"
        self.attention = False  # whether the decoder uses attention
if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = configurations()
    print('config:\n', vars(config))
    # Train, then plot the training-loss curve.
    train_losses, val_losses, bleu_scores = train_process(config)
    plt.figure()
    plt.plot(train_losses)
    plt.xlabel('次数')
    plt.ylabel('loss')
    plt.title('train loss')
    plt.show()
    # NOTE(review): a fresh config (load_model=False) is used here, so
    # test_process rebuilds an untrained model rather than reusing the
    # trained weights — verify this is intended.
    config = configurations()
    print('config:\n', vars(config))
    test_loss, bleu_score = test_process(config)
    print(f'test loss: {test_loss}, bleu_score: {bleu_score}')
    # Plot the validation-loss and BLEU curves recorded during training.
    plt.figure()
    plt.plot(val_losses)
    plt.xlabel('次数')
    plt.ylabel('loss')
    plt.title('validation loss')
    plt.show()
    plt.figure()
    plt.plot(bleu_scores)
    plt.xlabel('次数')
    plt.ylabel('BLEU score')
    plt.title('BLEU score')
    plt.show()
3.2 关键代码作用说明
3.2.1 class LabelTransform()
将不同长度的答案扩展到相同长度,以便训练模型。
# 将不同长度的答案扩展到相同长度,以便训练模型
class LabelTransform(object):
def __init__(self, size, pad):
self.size = size
self.pad = pad
def __call__(self, label):
label = np.pad(label, (0, (self.size - label.shape[0])), mode='constant', constant_values=self.pad)
return label
3.2.2 class EN2CNDataset()
Data
训练数据——18000句,检验数据——500句,测试数据——2636句。
数据预处理
- 英文数据【用subword-nmt将word转为subword,再取出标签中出现频率高于定值的subword建立字典】
- 中文数据【用jieba将中文句子断词,再取出标签中出现频率高于定值的词建立字典】
- 特殊字元<PAD>,<BOS>,<EOS>,<UNK>
- 将字典中每个subword/词用整数表示,后续转换为one-hot编码的向量。
- 训练数据中不同语言的句子用TAB分开,字与字之间用空白分开。
# 定义dataset
class EN2CNDataset(data.Dataset):
def __init__(self, root, max_output_len, set_name):
self.root = root
self.word2int_cn, self.int2word_cn = self.get_dictionary('cn')
self.word2int_en, self.int2word_en = self.get_dictionary('en')
self.data = []
with open(os.path.join(self.root, f'{set_name}.txt'), "r", encoding='UTF-8') as f:
for line in f:
self.data.append(line)
print (f'{set_name} dataset size: {len(self.data)}')
self.cn_vocab_size = len(self.word2int_cn)
self.en_vocab_size = len(self.word2int_en)
self.transform = LabelTransform(max_output_len, self.word2int_en['<PAD>'])
def get_dictionary(self, language):
with open(os.path.join(self.root, f'word2int_{language}.json'), "r", encoding='UTF-8') as f:
word2int = json.load(f)
with open(os.path.join(self.root, f'int2word_{language}.json'), "r", encoding='UTF-8') as f:
int2word = json.load(f)
return word2int, int2word
def __len__(self):
return len(self.data)
def __getitem__(self, Index):
# 先将中英文分开
sentences = self.data[Index]
sentences = re.split('[\t\n]', sentences)
sentences = list(filter(None, sentences))
#print (sentences)
assert len(sentences) == 2
# 准备特殊字元
BOS = self.word2int_en['<BOS>']
EOS = self.word2int_en['<EOS>']
UNK = self.word2int_en['<UNK>']
# 在开头添加 <BOS>,在结尾添加 <EOS> ,不在字典的词用 <UNK> 取代
en, cn = [BOS], [BOS]
# 将句子拆解为subword并转为整数
sentence = re.split(' ', sentences[0])
sentence = list(filter(None, sentence))
#print (f'en: {sentence}')
for word in sentence:
en.append(self.word2int_en.get(word, UNK))
en.append(EOS)
# 将句子拆解为单词并转为整数
sentence = re.split(' ', sentences[1])
sentence = list(filter(None, sentence))
#print (f'cn: {sentence}')
for word in sentence:
cn.append(self.word2int_cn.get(word, UNK))
cn.append(EOS)
en, cn = np.asarray(en), np.asarray(cn)
# 用 <PAD> 将句子补到相同长度
en, cn = self.transform(en), self.transform(cn)
en, cn = torch.LongTensor(en), torch.LongTensor(cn)
return en, cn
3.2.3 class Encoder()
对于每个输入,Encoder会输出一个向量和一个隐藏状态,并将隐藏状态用于下一个输入。
参数 | 含义 |
---|---|
en_vocab_size | 英文字典的大小,即英文的subword的个数 |
emb_dim | embedding的维度,用于将one-hot vector的单词向量压缩到指定维度 |
hid_dim | 输出和隐藏状态的维度 |
n_layers | RNN的层数 |
Encoder的输出是outputs和hidden,outputs是最上层RNN全部的输出,可以再用Attention进行处理,hidden是每层最后的隐藏状态,会被传送到Decoder进行解码。
class Encoder(nn.Module):
def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
super().__init__()
self.embedding = nn.Embedding(en_vocab_size, emb_dim)
self.hid_dim = hid_dim
self.n_layers = n_layers
self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
self.dropout = nn.Dropout(dropout)
def forward(self, input):
# input = [batch size, sequence len, vocab size]
# outputs = [batch size, sequence len, hid dim * directions]
# hidden = [num_layers * directions, batch size , hid dim]
embedding = self.embedding(input)
# outputs 是最上层RNN的輸出
outputs, hidden = self.rnn(self.dropout(embedding))
return outputs, hidden
3.2.4 class Decoder()
Decoder也是一个RNN,使用Encoder每一层最后的隐藏状态来进行解码。
参数 | 含义 |
---|---|
en_vocab_size | 英文字典的大小,即英文的subword的个数 |
emb_dim | embedding的维度,用于将one-hot vector的单词向量压缩到指定维度 |
hid_dim | 输出和隐藏状态的维度 |
output_dim | 最终输出的维度 |
n_layers | RNN的层数 |
isatt | 标志是否使用Attention Mechanism |
Decoder的输出是hidden和output,hidden是根据输入和前一次的隐藏状态得到现在的隐藏状态更新的结果,output是每个字有多少概率是这次解码的结果。
class Decoder(nn.Module):
def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt):
super().__init__()
self.cn_vocab_size = cn_vocab_size
self.hid_dim = hid_dim * 2
self.n_layers = n_layers
self.embedding = nn.Embedding(cn_vocab_size, config.emb_dim)
self.isatt = isatt
self.attention = Attention(hid_dim)
self.input_dim = emb_dim
self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout = dropout, batch_first=True)
self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
self.dropout = nn.Dropout(dropout)
def forward(self, input, hidden, encoder_outputs):
# input = [batch size, vocab size]
# hidden = [batch size, n layers * directions, hid dim]
# Decoder是单向的,所以 directions=1
input = input.unsqueeze(1)
embedded = self.dropout(self.embedding(input))
# embedded = [batch size, 1, emb dim]
if self.isatt:
attn = self.attention(encoder_outputs, hidden)
output, hidden = self.rnn(embedded, hidden)
# output = [batch size, 1, hid dim]
# hidden = [num_layers, batch size, hid dim]
# 将RNN的输出转为每个词出现的概率
output = self.embedding2vocab1(output.squeeze(1))
output = self.embedding2vocab2(output)
prediction = self.embedding2vocab3(output)
# prediction = [batch size, vocab size]
return prediction, hidden
3.2.5 class Attention()
当输入过长时,用Attention Mechanism来为Decoder提供更多的信息。
class Attention(nn.Module):
def __init__(self, hid_dim):
super(Attention, self).__init__()
self.hid_dim = hid_dim
def forward(self, encoder_outputs, decoder_hidden):
attention = None
return attention
3.2.6 class Seq2Seq()
将Encoder和Decoder进行合并。
class Seq2Seq(nn.Module):
def __init__(self, encoder, decoder, device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.device = device
assert encoder.n_layers == decoder.n_layers, \
"Encoder and decoder must have equal number of layers!"
def forward(self, input, target, teacher_forcing_ratio):
# input = [batch size, input len, vocab size]
# target = [batch size, target len, vocab size]
# teacher_forcing_ratio 是有多少概率使用正确答案来训练
batch_size = target.shape[0]
target_len = target.shape[1]
vocab_size = self.decoder.cn_vocab_size
# 准备一个用来存储输出的空间
outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
# 将输入放入Encoder
encoder_outputs, hidden = self.encoder(input)
# encoder_outputs 主要是使用在 Attention
# 因为 Encoder 是双向的RNN,所以需要将同一层两个方向的 hidden state 接在一起
# hidden = [num_layers * directions, batch size , hid dim] --> [num_layers, directions, batch size , hid dim]
hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
input = target[:, 0]
preds = []
for t in range(1, target_len):
output, hidden = self.decoder(input, hidden, encoder_outputs)
outputs[:, t] = output
# 决定是否用正确答案来进行训练
teacher_force = random.random() <= teacher_forcing_ratio
# 取出概率最大的单词
top1 = output.argmax(1)
# 如果是 teacher force 就用正确答案来进行训练,如果不是就用自己预测的单词进行训练
input = target[:, t] if teacher_force and t < target_len else top1
preds.append(top1.unsqueeze(1))
preds = torch.cat(preds, 1)
return outputs, preds
def inference(self, input, target):
# input = [batch size, input len, vocab size]
# target = [batch size, target len, vocab size]
batch_size = input.shape[0]
input_len = input.shape[1] # 取得最大字数
vocab_size = self.decoder.cn_vocab_size
outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
encoder_outputs, hidden = self.encoder(input)
# hidden = [num_layers * directions, batch size , hid dim] --> [num_layers, directions, batch size , hid dim]
hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
input = target[:, 0]
preds = []
for t in range(1, input_len):
output, hidden = self.decoder(input, hidden, encoder_outputs)
# 将预测结果存起来
outputs[:, t] = output
# 取出概率最大的单词
top1 = output.argmax(1)
input = top1
preds.append(top1.unsqueeze(1))
preds = torch.cat(preds, 1)
return outputs, preds
3.3 实验结果