Notes for ML-LHY: Attention-based Model / Conditional Generation by RNN & Attention (experiment). The code in this post is lightly modified from the TA's code.
Dataset
Data examples (train, validation, test):
it 's none of your concern . 這不關 你 的 事 。
she has a habit of bi@@ ting her na@@ ils . 她 有 咬 指甲 的 習慣 。
he is a teacher . 他 是 老師 。
japan re@@ lies on ar@@ a@@ b countries for oil . 日本 靠 阿拉伯 國家 提供 石油 。
i 'll dream about you . 我會 夢到 你 的 。
i borrowed tom 's phone this morning . 我 今天 早上 跟 Tom 藉手機 。
she does n't know how to play golf . 她 不 知道 怎麼 打 高爾夫球 。
tears ro@@ lled down my chee@@ ks . 淚水 沿着 我 的 面頰 流 了 下來 。
that 's soft . 這 很 軟 。
do n't open this door , please . 請 不要 打開 此門 。
what exactly does tom do ? 湯姆 究竟 在 做 什麼 ?
there 's a book here . 這裡 有 一本 書 。
tom was the first to rea@@ ct . 湯姆 是 最先 作出 反應 的 。
i 'm going to check out at eight . 我會 在 八點鐘 的 時候 登出 。
is this your notebook ? 這是 你 的 書 嗎 ?
how could things get worse ? 事情 怎麼 變糟 的 ?
i hope this is the right one . 我 希望 這是 對 的 。
i play tennis an hour a day . 我 每天 打一 小時 網球 。
...
Each line of the training data is split into an English and a Chinese sequence, every word in each sequence is converted to an index, and LabelTransform pads the index sequences to the same length.
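For intuition, this is roughly what one line turns into after `__getitem__` (the index values below are hypothetical; only the <BOS>/<EOS>/<UNK>/<PAD> structure matters):

# One tab-separated line of training data:
#   "he is a teacher .\t他 是 老師 。\n"
# After splitting, mapping words to (hypothetical) indices with <BOS>=1, <EOS>=2, <PAD>=0,
# and padding both sides to max_output_len = 10:
#   en = [1, 45, 13, 9, 872, 4, 2, 0, 0, 0]    # <BOS> he is a teacher . <EOS> <PAD> <PAD> <PAD>
#   cn = [1, 57, 12, 1043, 6, 2, 0, 0, 0, 0]   # <BOS> 他 是 老師 。 <EOS> <PAD> <PAD> <PAD> <PAD>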
# data.py
import re
import json
import torch.utils.data as data
import numpy as np
import os
import torch


class LabelTransform(object):
    def __init__(self, size, pad):
        self.size = size
        self.pad = pad

    def __call__(self, label):
        label = np.pad(label, (0, (self.size - label.shape[0])), mode='constant', constant_values=self.pad)
        return label


class EN2CNDataset(data.Dataset):
    def __init__(self, root, max_output_len, set_name):
        self.root = root
        self.word2int_cn, self.int2word_cn = self.get_dictionary('cn')
        self.word2int_en, self.int2word_en = self.get_dictionary('en')

        # Load the data
        self.data = []
        with open(os.path.join(self.root, f'{set_name}.txt'), "r") as f:
            for line in f:
                self.data.append(line)
        print(f'{set_name} dataset size: {len(self.data)}')

        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)
        self.transform = LabelTransform(max_output_len, self.word2int_en['<PAD>'])

    def get_dictionary(self, language):
        # Load the dictionaries
        with open(os.path.join(self.root, f'word2int_{language}.json'), "r") as f:
            word2int = json.load(f)
        with open(os.path.join(self.root, f'int2word_{language}.json'), "r") as f:
            int2word = json.load(f)
        return word2int, int2word

    def __len__(self):
        return len(self.data)

    def __getitem__(self, Index):
        # Separate the English and Chinese sentences
        sentences = self.data[Index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        # print (sentences)
        assert len(sentences) == 2

        # Prepare the special tokens
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # Add <BOS> at the beginning and <EOS> at the end; subwords (words) not in the dictionary become <UNK>
        en, cn = [BOS], [BOS]

        # Split the sentence into subwords and convert them to integers
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        # print (f'en: {sentence}')
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)

        # Split the sentence into words and convert them to integers
        # e.g. <BOS>, we, are, friends, <EOS> --> 1, 28, 29, 205, 2
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        # print (f'cn: {sentence}')
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)

        en, cn = np.asarray(en), np.asarray(cn)

        # Pad both sentences to the same length with <PAD>
        en, cn = self.transform(en), self.transform(cn)
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)
        return en, cn
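A minimal usage sketch (the data directory, the set name, and max_output_len below are assumptions, not values from the original post; adjust them to wherever the .txt and word2int/int2word .json files live):

from torch.utils.data import DataLoader

train_set = EN2CNDataset('./cmn-eng', max_output_len=60, set_name='training')  # hypothetical path and file name
train_loader = DataLoader(train_set, batch_size=50, shuffle=True)

en, cn = next(iter(train_loader))
print(en.shape, cn.shape)   # torch.Size([50, 60]) torch.Size([50, 60])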
Model
Encoder
The Encoder is a multi-layer bidirectional GRU (see the note ML-LHY-21: Recurrent Neural Network (RNN)):
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout, device):
        super().__init__()
        # The embedding layer is not pre-trained here; see H4 for how it could be pre-trained
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len]; each entry is a word index (conceptually a one-hot vector of length vocab size)
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid dim * directions]
        # hidden = [num_layers * directions, batch size, hid dim]
        # outputs is the output of the top RNN layer
        return outputs, hidden
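A quick shape check of the bidirectional GRU outputs (the vocabulary size and hyperparameters here are illustrative, not the ones used in training):

import torch

encoder = Encoder(en_vocab_size=4000, emb_dim=256, hid_dim=512, n_layers=3, dropout=0.5, device='cpu')
dummy = torch.randint(0, 4000, (50, 60))   # [batch size, sequence len] of word indices
outputs, hidden = encoder(dummy)
print(outputs.shape)   # torch.Size([50, 60, 1024]) = [batch, seq len, hid dim * 2]
print(hidden.shape)    # torch.Size([6, 50, 512])   = [n_layers * 2, batch, hid dim]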
Decoder
class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt, device):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.isatt = isatt
        self.attention = Attention(self.hid_dim, device)
        # If the attention mechanism changes the input dimension, adjust it here.
        # e.g. concatenating the context to the input changes the dimension to emb_dim + hid_dim * 2;
        # since the context is added to the hidden state here instead, the input dimension stays the same.
        # self.input_dim = emb_dim + hid_dim * 2 if isatt else emb_dim
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size] (word indices)
        # hidden = [n layers * directions, batch size, hid dim]
        # The Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]
        if self.isatt:
            attn = self.attention(encoder_outputs, hidden)
            # Decide here how to use the attention context, e.g. add it (done here) or concatenate it;
            # mind the resulting dimensions
            for i in range(hidden.shape[0]):
                hidden[i] += attn  # add the context to each layer's hidden state
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [num_layers, batch size, hid dim]
        # Turn the RNN output into a distribution over the vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden
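For comparison, here is a sketch of the concatenation variant mentioned in the comments above: the attention context is fed into the GRU input instead of being added to the hidden state. This is only an illustration of the alternative (ConcatDecoder is a hypothetical name), not what the rest of this post uses:

import torch
import torch.nn as nn

class ConcatDecoder(Decoder):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt, device):
        super().__init__(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt, device)
        # The GRU input now also carries the attention context, so its input size grows
        self.input_dim = emb_dim + hid_dim * 2 if isatt else emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout=dropout, batch_first=True)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))                  # [batch size, 1, emb dim]
        if self.isatt:
            attn = self.attention(encoder_outputs, hidden)              # [batch size, hid dim * 2]
            embedded = torch.cat((embedded, attn.unsqueeze(1)), dim=2)  # [batch size, 1, emb dim + hid dim * 2]
        output, hidden = self.rnn(embedded, hidden)
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        return prediction, hidden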
Attention:
The attention part is my own implementation and is for reference only!
The match function is a neural network with two hidden layers.
class Attention(nn.Module):
    def __init__(self, hid_dim, device):
        super(Attention, self).__init__()
        self.hid_dim = hid_dim
        self.device = device
        self.match_nn = nn.Sequential(
            nn.Linear(self.hid_dim * 2, self.hid_dim * 4),
            nn.Linear(self.hid_dim * 4, self.hid_dim * 2),
            nn.Linear(self.hid_dim * 2, 1),
        )

    def match(self, h, z):
        alpha = self.match_nn(torch.cat((h, z), dim=1))
        return alpha

    def forward(self, encoder_outputs, decoder_hidden):
        # encoder_outputs = [batch size, sequence len, hid dim * directions]
        # decoder_hidden = [num_layers, batch size, hid dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        sl = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        alphas = torch.zeros(sl, batch_size).to(self.device)
        for i in range(sl):
            h = encoder_outputs[i]
            # Use the Decoder's last-layer hidden state as the query z
            z = decoder_hidden[-1]
            alpha = self.match(h, z).squeeze()
            alphas[i] = alpha
        alphas = alphas.softmax(dim=0)
        attention = torch.zeros(batch_size, self.hid_dim).to(self.device)
        for i in range(sl):
            alpha = alphas[i].unsqueeze(1)
            h = encoder_outputs[i]
            attention = attention + h * alpha  # weighted sum
        return attention
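The two loops in Attention.forward can also be written without Python loops. A loop-free equivalent sketch, assuming the same shapes and reusing the same match_nn:

import torch

def attention_vectorized(match_nn, encoder_outputs, decoder_hidden):
    # encoder_outputs = [batch size, sequence len, hid dim], decoder_hidden = [num_layers, batch size, hid dim]
    z = decoder_hidden[-1].unsqueeze(1).expand_as(encoder_outputs)           # broadcast the query over time
    scores = match_nn(torch.cat((encoder_outputs, z), dim=-1)).squeeze(-1)   # [batch size, sequence len]
    alphas = scores.softmax(dim=1)                                           # normalize over source positions
    return torch.bmm(alphas.unsqueeze(1), encoder_outputs).squeeze(1)        # weighted sum: [batch size, hid dim]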
This part is not drawn in the diagram; see the note ML-LHY: Attention-based Model / Conditional Generation by RNN & Attention.
Seq2Seq
This is the overall model. The complete model code:
# model.py
import torch.nn as nn
import torch
import random


class Attention(nn.Module):
    def __init__(self, hid_dim, device):
        super(Attention, self).__init__()
        self.hid_dim = hid_dim
        self.device = device
        self.match_nn = nn.Sequential(
            nn.Linear(self.hid_dim * 2, self.hid_dim * 4),
            nn.Linear(self.hid_dim * 4, self.hid_dim * 2),
            nn.Linear(self.hid_dim * 2, 1),
        )

    def match(self, h, z):
        alpha = self.match_nn(torch.cat((h, z), dim=1))
        return alpha

    def forward(self, encoder_outputs, decoder_hidden):
        # encoder_outputs = [batch size, sequence len, hid dim * directions]
        # decoder_hidden = [num_layers, batch size, hid dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        sl = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        alphas = torch.zeros(sl, batch_size).to(self.device)
        for i in range(sl):
            h = encoder_outputs[i]
            # Use the Decoder's last-layer hidden state as the query z
            z = decoder_hidden[-1]
            alpha = self.match(h, z).squeeze()
            alphas[i] = alpha
        alphas = alphas.softmax(dim=0)
        attention = torch.zeros(batch_size, self.hid_dim).to(self.device)
        for i in range(sl):
            alpha = alphas[i].unsqueeze(1)
            h = encoder_outputs[i]
            attention = attention + h * alpha  # weighted sum
        return attention
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout, device):
        super().__init__()
        # The embedding layer is not pre-trained here; see H4 for how it could be pre-trained
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len]; each entry is a word index (conceptually a one-hot vector of length vocab size)
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid dim * directions]
        # hidden = [num_layers * directions, batch size, hid dim]
        # outputs is the output of the top RNN layer
        return outputs, hidden
class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, isatt, device):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.isatt = isatt
        self.attention = Attention(self.hid_dim, device)
        # If the attention mechanism changes the input dimension, adjust it here.
        # e.g. concatenating the context to the input changes the dimension to emb_dim + hid_dim * 2;
        # since the context is added to the hidden state here instead, the input dimension stays the same.
        # self.input_dim = emb_dim + hid_dim * 2 if isatt else emb_dim
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, self.n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size] (word indices)
        # hidden = [n layers * directions, batch size, hid dim]
        # The Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]
        if self.isatt:
            attn = self.attention(encoder_outputs, hidden)
            # Decide here how to use the attention context, e.g. add it (done here) or concatenate it;
            # mind the resulting dimensions
            for i in range(hidden.shape[0]):
                hidden[i] += attn  # add the context to each layer's hidden state
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [num_layers, batch size, hid dim]
        # Turn the RNN output into a distribution over the vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, input, target, teacher_forcing_ratio):
        # input = [batch size (e.g. 50), input len (e.g. 60)]
        # target = [batch size (e.g. 50), target len (e.g. 60)]
        # teacher_forcing_ratio is the probability of feeding the ground truth during training
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # Prepare a tensor to store the outputs (the vocab_size dimension holds the score of every word)
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        # Feed the input into the Encoder
        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialize the Decoder
        # encoder_outputs is mainly used for attention
        # Because the Encoder is bidirectional, the hidden states of both directions of each layer are concatenated
        # hidden = [num_layers * directions, batch size, hid dim] --> [num_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # Take the <BOS> token
        input = target[:, 0]
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            # Decide whether to train with the ground-truth token
            teacher_force = random.random() <= teacher_forcing_ratio
            # Take the word with the highest score
            top1 = output.argmax(1)
            # With teacher forcing, feed the ground truth as the next input; otherwise feed the model's own prediction
            input = target[:, t] if teacher_force and t < target_len else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
    def inference(self, input, target):
        ########
        # TODO #
        ########
        # Implement beam search here
        # This function is called with batch size = 1
        # input = [batch size, input len]
        # target = [batch size, target len]
        batch_size = input.shape[0]
        input_len = input.shape[1]  # maximum number of tokens to generate
        vocab_size = self.decoder.cn_vocab_size
        # Prepare a tensor to store the outputs
        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        # Feed the input into the Encoder
        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialize the Decoder
        # encoder_outputs is mainly used for attention
        # Because the Encoder is bidirectional, the hidden states of both directions of each layer are concatenated
        # hidden = [num_layers * directions, batch size, hid dim] --> [num_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # Take the <BOS> token
        input = target[:, 0]
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            # Store the prediction
            outputs[:, t] = output
            # Take the word with the highest score
            top1 = output.argmax(1)
            input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), f'{store_model_path}/model_{step}.ckpt')
    return


def load_model(model, load_model_path):
    print(f'Load model from {load_model_path}')
    model.load_state_dict(torch.load(f'{load_model_path}.ckpt'))
    return model


def build_model(config, en_vocab_size, cn_vocab_size, device):
    # Build the model
    encoder = Encoder(en_vocab_size, config.emb_dim, config.hid_dim, config.n_layers, config.dropout, device)
    decoder = Decoder(cn_vocab_size, config.emb_dim, config.hid_dim, config.n_layers, config.dropout,
                      config.attention, device)
    model = Seq2Seq(encoder, decoder, device)
    print(model)
    # Build the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(optimizer)
    if config.load_model:
        model = load_model(model, config.load_model_path)
    model = model.to(device)
    return model, optimizer
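To tie the pieces together, here is a minimal training-step sketch. The config values, data directory, set name, and the cross-entropy setup (ignoring <PAD>) are assumptions for illustration; the original training script is not shown in this post:

from types import SimpleNamespace
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from data import EN2CNDataset
from model import build_model

config = SimpleNamespace(emb_dim=256, hid_dim=512, n_layers=3, dropout=0.5, attention=True,
                         learning_rate=1e-4, load_model=False, load_model_path=None)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_set = EN2CNDataset('./cmn-eng', max_output_len=60, set_name='training')  # hypothetical path and file name
train_loader = DataLoader(train_set, batch_size=50, shuffle=True)
model, optimizer = build_model(config, train_set.en_vocab_size, train_set.cn_vocab_size, device)
loss_fn = nn.CrossEntropyLoss(ignore_index=train_set.word2int_cn['<PAD>'])

model.train()
for en, cn in train_loader:
    en, cn = en.to(device), cn.to(device)
    outputs, preds = model(en, cn, teacher_forcing_ratio=0.5)
    # Position 0 is <BOS> and is never predicted, so it is skipped in the loss
    loss = loss_fn(outputs[:, 1:].reshape(-1, outputs.shape[-1]), cn[:, 1:].reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    break  # one step only; a real loop would iterate over many batches and epochs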
Overall model diagram: