# Datawhale AI Summer Camp
This note covers using a Transformer to tackle the machine-translation task. Starting from the Task2 baseline, only the model-architecture part of the code is changed, replacing the original Seq2Seq architecture with a Transformer-based one. The overall workflow is the same as in Task2, so this note only records the modified code.
The full modified model code is as follows:
import math
import torch
import torch.nn as nn

# Positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch_size, d_model)
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Transformer
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.src_embedding = nn.Embedding(len(src_vocab), d_model)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.fc_out = nn.Linear(d_model, len(tgt_vocab))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_model = d_model

    def forward(self, src, tgt):
        # Transpose src and tgt to the (seq_len, batch_size) layout expected by nn.Transformer
        src = src.transpose(0, 1)  # (seq_len, batch_size)
        tgt = tgt.transpose(0, 1)  # (seq_len, batch_size)
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)  # (batch_size, src_len)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)  # (batch_size, tgt_len)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        output = self.transformer(src_embedded, tgt_embedded,
                                  src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
        return self.fc_out(output).transpose(0, 1)  # (batch_size, tgt_len, len(tgt_vocab))
The specific changes are as follows:
I. Positional Encoding
The encoder-side code of the original model (the Encoder, Attention, and Decoder classes) was:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [batch size, src len]
        embedded = self.dropout(self.embedding(src))
        # embedded = [batch size, src len, emb dim]
        outputs, hidden = self.gru(embedded)
        # outputs = [batch size, src len, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [1, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.repeat(src_len, 1, 1).transpose(0, 1)
        # hidden = [batch size, src len, hid dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src len, hid dim]
        attention = self.v(energy).squeeze(2)
        # attention = [batch size, src len]
        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(hid_dim + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]
        # hidden = [n layers, batch size, hid dim]
        # encoder_outputs = [batch size, src len, hid dim]
        input = input.unsqueeze(1)
        # input = [batch size, 1]
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb dim]
        a = self.attention(hidden[-1:], encoder_outputs)
        # a = [batch size, src len]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, hid dim]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [batch size, 1, emb dim + hid dim]
        output, hidden = self.gru(rnn_input, hidden)
        # output = [batch size, 1, hid dim]
        # hidden = [n layers, batch size, hid dim]
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]
        return prediction, hidden
These classes are removed, and the following PositionalEncoding class is added in their place:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
Positional encoding is a key component of the Transformer: unlike RNNs or CNNs, the Transformer carries no inherent notion of token order, so positional encodings are used to give the model information about each token's position in the sequence.
`pe` is the positional-encoding matrix of size (max_len, d_model), where `max_len` is the maximum sequence length and `d_model` is the embedding dimension. `position` holds the position indices, and `div_term` holds the frequency terms used to compute the encoding. In `forward`, the positional encoding is added to the input embeddings.
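As a quick sanity check, here is a minimal sketch (assuming `torch`, `math`, and the `PositionalEncoding` class above are in scope; the sizes are made up for illustration) showing that the encoding is simply added to a batch of embeddings in the (seq_len, batch_size, d_model) layout used by `nn.Transformer`:

import math
import torch

d_model, seq_len, batch_size = 512, 10, 32  # illustrative sizes only

pos_enc = PositionalEncoding(d_model, dropout=0.1)

# Dummy token embeddings in (seq_len, batch_size, d_model) layout,
# already scaled by sqrt(d_model) as done in TransformerModel.forward.
embedded = torch.randn(seq_len, batch_size, d_model) * math.sqrt(d_model)

out = pos_enc(embedded)
print(out.shape)         # torch.Size([10, 32, 512])
print(pos_enc.pe.shape)  # torch.Size([5000, 1, 512]) -- the registered (max_len, 1, d_model) buffer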
II. The Transformer Model
In the model-construction code, the original Seq2Seq model class (which wired the Encoder, Attention, and Decoder together) is replaced with the following TransformerModel:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.src_embedding = nn.Embedding(len(src_vocab), d_model)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.fc_out = nn.Linear(d_model, len(tgt_vocab))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_model = d_model

    def forward(self, src, tgt):
        # Transpose src and tgt to the layout expected by nn.Transformer
        src = src.transpose(0, 1)  # (seq_len, batch_size)
        tgt = tgt.transpose(0, 1)  # (seq_len, batch_size)
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(0)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        src_padding_mask = (src == self.src_vocab['<pad>']).transpose(0, 1)
        tgt_padding_mask = (tgt == self.tgt_vocab['<pad>']).transpose(0, 1)
        src_embedded = self.positional_encoding(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        output = self.transformer(src_embedded, tgt_embedded,
                                  src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
        return self.fc_out(output).transpose(0, 1)
This is a standard Transformer model for sequence-to-sequence tasks such as machine translation.
1. The `__init__` method defines the Transformer's hyperparameters and the layers the model needs (embedding layers, a positional-encoding layer, and an output projection):
- `nn.Transformer`: the Transformer model itself.
- `src_embedding` and `tgt_embedding`: embedding layers for the source and target languages.
- `positional_encoding`: the positional-encoding layer.
- `fc_out`: a linear layer that maps the Transformer output to the size of the target vocabulary.
2. The `forward` method carries out the following steps (a toy forward pass illustrating the resulting shapes is shown after this list):
- Transpose the source and target sequences to match the input layout expected by the Transformer.
- Generate the attention masks for the source and target sequences.
- Compute the padding masks for the source and target sequences.
- Embed the source and target sequences and add positional encodings.
- Feed the embedded sequences through the Transformer.
- Map the Transformer output to vocabulary-sized logits with the final linear layer and transpose back to batch-first.
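The following minimal sketch runs a forward pass on dummy data. The tiny vocabularies are stand-ins made up for illustration (real runs use the vocabularies built in Task2); the only property the model relies on is that a vocab supports `len()` and a `['<pad>']` lookup returning the padding index.

import torch

# Stand-in vocabularies for illustration only.
src_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2, 'hello': 3, 'world': 4}
tgt_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2, '你': 3, '好': 4, '世界': 5}

model = TransformerModel(src_vocab, tgt_vocab,
                         d_model=512, nhead=8,
                         num_encoder_layers=6, num_decoder_layers=6,
                         dim_feedforward=2048, dropout=0.1)

batch_size, src_len, tgt_len = 2, 7, 5
# Batch-first integer tensors, padded with index 0 ('<pad>').
src = torch.randint(1, len(src_vocab), (batch_size, src_len))
tgt = torch.randint(1, len(tgt_vocab), (batch_size, tgt_len))
src[:, -2:] = src_vocab['<pad>']  # pretend the last two source positions are padding

logits = model(src, tgt)
print(logits.shape)  # torch.Size([2, 5, 6]) -- (batch_size, tgt_len, len(tgt_vocab))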
III. Model Initialization
In the model-definition part, simply replace the Seq2Seq model with the Transformer model. That is, the original model-initialization function:
# Initialize the model
def initialize_model(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, device):
    attn = Attention(hid_dim)
    enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
    dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout, attn)
    model = Seq2Seq(enc, dec, device).to(device)
    return model
is replaced with:
def initialize_model(src_vocab, tgt_vocab, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
    model = TransformerModel(src_vocab, tgt_vocab, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
    return model
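Note that unlike the original helper, this version no longer takes a `device` argument or calls `.to(device)`, so the caller should move the model to the device explicitly. A minimal usage sketch (the stand-in vocabularies are again placeholders for the ones built during preprocessing):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Placeholder vocabularies; real runs pass the vocabs built in the data-preparation step.
src_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2, 'hello': 3}
tgt_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2, '你好': 3}

model = initialize_model(src_vocab, tgt_vocab).to(device)
print(sum(p.numel() for p in model.parameters()))  # rough parameter count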
Reference link: Docs
Review of previous learning notes: