III. Understanding the Transformer Code
1. PyTorch Basics
- import torch.nn.functional as F
- torch.matmul(t1, t2): matrix multiplication; also works on higher-dimensional (batched) tensors.
- torch.triu(input, diagonal=0, out=None): returns the upper-triangular part of a matrix, with the remaining elements set to 0.
- tensor.transpose(d1, d2): swaps two dimensions of a tensor.
- tensor.view(): reshapes a tensor; the total number of elements must stay the same before and after.
- tensor.unsqueeze(d): inserts a new dimension at position d, which is convenient for batching and broadcasting.
- tensor.clone(): acts as an intermediate variable; the clone stays in the computation graph and participates in gradient computation (gradients are accumulated back to the source), but it generally does not retain a gradient of its own.
- tensor.detach(): shares the underlying data memory with the original tensor but is cut off from the computation graph, so it takes no part in gradient computation.
- nn.Module implements __call__ so that it dispatches to the module's forward method; any object of a class that inherits from nn.Module can therefore be called directly like a function to run forward.
- nn.Linear(in_features, out_features): a fully connected layer. Note that its input and output are typically 2-D tensors of shape [batch_size, size]. It implements $y = xA^{T} + b$.
- nn.Dropout(): adding dropout to a network helps prevent overfitting.
- nn.LayerNorm(): layer normalization. BatchNorm does not suit dynamic networks such as RNNs and performs poorly when the batch size is small, which is why the Transformer normalizes with LayerNorm instead.
- nn.Embedding(): a lookup table that stores the embedding vectors of a fixed-size vocabulary: given an index, the embedding layer returns the corresponding embedding vector, and these vectors capture the semantic relationships between the symbols they represent.
- nn.init.xavier_uniform_(): initializes weights so that the variance of each layer's output stays roughly equal, which lets information flow better through the network.
Several of these calls are exercised in the short sketch below.
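This is a minimal sketch, assuming only a standard PyTorch install; the shapes and values are purely illustrative.
```python
import torch

t1 = torch.rand(2, 3, 4)           # batch of 2 matrices, 3 x 4
t2 = torch.rand(2, 4, 5)           # batch of 2 matrices, 4 x 5
print(torch.matmul(t1, t2).shape)  # batched matmul -> torch.Size([2, 3, 5])

m = torch.ones(4, 4)
print(torch.triu(m, diagonal=1))   # strictly upper-triangular part, rest set to 0

x = torch.arange(6).view(2, 3)     # reshape: element count must stay the same
print(x.transpose(0, 1).shape)     # swap the two dims -> torch.Size([3, 2])
print(x.unsqueeze(0).shape)        # add a leading dim  -> torch.Size([1, 2, 3])

w = torch.rand(3, requires_grad=True)
y = w.clone()     # stays in the computation graph
z = w.detach()    # shares memory but is cut off from the graph
print(y.requires_grad, z.requires_grad)  # True False
```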
2. Reading the PyTorch Implementation
The overall logic has already been laid out in the previous sections, so the hardest part of reading the code is understanding what each parameter means; below I annotate them as best I can, based on my own understanding, to smooth the way.
- ScaledDotProductAttention
```python
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        ''' temperature: the scaling factor, sqrt(d_k) '''
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        # 1. Compute Q * K^T / sqrt(d_k)
        attn = torch.matmul(q / self.temperature, k.transpose(2, 3))

        # Apply the mask if one is given
        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)

        # 2. Normalize the last dimension of attn with softmax
        # Reference: https://blog.csdn.net/sunyueqinghit/article/details/101113251
        attn = self.dropout(F.softmax(attn, dim=-1))

        # 3. Compute A * V
        output = torch.matmul(attn, v)

        return output, attn
```
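A minimal shape check for the module above (the sizes here are illustrative assumptions, and the class plus its torch/nn/F imports are assumed to be in scope):
```python
import torch

# 2 sentences, 8 heads, sequence length 10, per-head dimension 64
q = torch.rand(2, 8, 10, 64)
k = torch.rand(2, 8, 10, 64)
v = torch.rand(2, 8, 10, 64)

sdpa = ScaledDotProductAttention(temperature=64 ** 0.5)
output, attn = sdpa(q, k, v)
print(output.shape)  # torch.Size([2, 8, 10, 64])
print(attn.shape)    # torch.Size([2, 8, 10, 10])
```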
- PositionalEncoding
```python
class PositionalEncoding(nn.Module):
    ''' Positional encoding '''

    def __init__(self, d_hid, n_position=200):
        '''
        d_hid: dimension of the embedding vector, i.e. how many dimensions represent one symbol
        '''
        super(PositionalEncoding, self).__init__()
        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy

        # Compute the angle for each (position, dimension) pair
        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        # Even dimensions use sine
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        # Odd dimensions use cosine
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        # Add a leading batch dimension
        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()
```
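To see what the buffer does, a small sketch (assuming the class above and `import numpy as np` are in scope; the sizes are illustrative): adding the table leaves the shape of the embedded batch unchanged.
```python
import torch

pos_enc = PositionalEncoding(d_hid=512, n_position=200)
x = torch.zeros(2, 10, 512)        # batch of 2 sentences, 10 tokens, 512-dim embeddings
out = pos_enc(x)
print(out.shape)                   # torch.Size([2, 10, 512])
print(pos_enc.pos_table.shape)     # torch.Size([1, 200, 512])
```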
- MultiHeadAttention
```python
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        '''
        n_head: number of attention heads
        d_model: dimension of the original word embedding
        d_k: per-head dimension of the Q and K projections
        d_v: per-head dimension of the V projection
        '''
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        # Fully connected layers, y = x * A^T + b
        # nn.Linear(): first argument is the input dimension, second is the output dimension
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        # Scaled dot-product attention
        # temperature: the scaling factor sqrt(d_k)
        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        # Dropout: helps prevent overfitting
        self.dropout = nn.Dropout(dropout)
        # LayerNorm: normalization
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        '''
        sz_b: batch_size
        len_q, len_k, len_v: sequence lengths of Q, K and V
        '''
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        residual = q

        # Pass through the pre-attention projection: b x lq x (n*dv)
        # Separate different heads: b x lq x n x dv
        # The third dimension of q, k, v is the number of heads
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Transpose for attention dot product: b x n x lq x dv
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        if mask is not None:
            # Add a dimension in the second position
            mask = mask.unsqueeze(1)   # For head axis broadcasting.

        q, attn = self.attention(q, k, v, mask=mask)

        # Transpose to move the head dimension back: b x lq x n x dv
        # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
        q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        q = self.dropout(self.fc(q))
        # Residual connection
        q += residual
        # LayerNorm
        q = self.layer_norm(q)

        return q, attn
```
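A quick shape check, with illustrative sizes chosen to match the defaults used later (d_model=512, 8 heads of 64 dimensions each); the class above is assumed to be in scope:
```python
import torch

mha = MultiHeadAttention(n_head=8, d_model=512, d_k=64, d_v=64)
x = torch.rand(2, 10, 512)   # batch of 2 sentences, 10 tokens, d_model=512
out, attn = mha(x, x, x)     # self-attention: q = k = v
print(out.shape)             # torch.Size([2, 10, 512])
print(attn.shape)            # torch.Size([2, 8, 10, 10]) -- one attention map per head
```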
- Feed Forward Network
```python
class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
        self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x

        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        # Residual connection
        x += residual
        # LayerNorm
        x = self.layer_norm(x)

        return x
```
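Since the two linear layers only act on the last dimension, the block is applied to every position independently and preserves the input shape; a quick check with illustrative sizes (class above assumed in scope):
```python
import torch

ffn = PositionwiseFeedForward(d_in=512, d_hid=2048)
x = torch.rand(2, 10, 512)
print(ffn(x).shape)   # torch.Size([2, 10, 512]) -- shape is preserved
```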
- EncoderLayer
```python
class EncoderLayer(nn.Module):
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        '''
        d_model: dimension of the original word embedding
        d_inner: hidden dimension of the position-wise feed-forward network
        '''
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output = self.pos_ffn(enc_output)
        return enc_output, enc_slf_attn
```
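One encoder layer maps a (batch, length, d_model) tensor to another of the same shape; the sizes below are illustrative assumptions and the classes above are assumed in scope:
```python
import torch

enc_layer = EncoderLayer(d_model=512, d_inner=2048, n_head=8, d_k=64, d_v=64)
x = torch.rand(2, 10, 512)
mask = torch.ones(2, 1, 10).bool()    # every position is valid (no padding)
out, slf_attn = enc_layer(x, slf_attn_mask=mask)
print(out.shape, slf_attn.shape)      # torch.Size([2, 10, 512]) torch.Size([2, 8, 10, 10])
```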
- DecoderLayer
```python
class DecoderLayer(nn.Module):
    ''' Compose with three layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None):
        dec_output, dec_slf_attn = self.slf_attn(dec_input, dec_input, dec_input, mask=slf_attn_mask)
        dec_output, dec_enc_attn = self.enc_attn(dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
        dec_output = self.pos_ffn(dec_output)
        return dec_output, dec_slf_attn, dec_enc_attn
```
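The decoder layer takes two masks: a mask for self-attention that hides future positions, and a padding mask for encoder-decoder attention. An illustrative call (sizes assumed, classes above in scope):
```python
import torch

dec_layer = DecoderLayer(d_model=512, d_inner=2048, n_head=8, d_k=64, d_v=64)
dec_in  = torch.rand(2, 7, 512)                      # target side: 7 tokens
enc_out = torch.rand(2, 10, 512)                     # source side: 10 tokens
slf_mask = torch.tril(torch.ones(1, 7, 7)).bool()    # no peeking at future tokens
cross_mask = torch.ones(2, 1, 10).bool()             # no source padding
out, slf_attn, cross_attn = dec_layer(dec_in, enc_out,
                                      slf_attn_mask=slf_mask,
                                      dec_enc_attn_mask=cross_mask)
print(out.shape)         # torch.Size([2, 7, 512])
print(slf_attn.shape)    # torch.Size([2, 8, 7, 7])
print(cross_attn.shape)  # torch.Size([2, 8, 7, 10])
```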
- Encoder
```python
class Encoder(nn.Module):
    ''' A encoder model with self attention mechanism. '''

    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
            d_model, d_inner, pad_idx, dropout=0.1, n_position=200, scale_emb=False):
        '''
        n_src_vocab: size of the source vocabulary
        d_word_vec: dimension of the embedding vector, i.e. how many dimensions represent one symbol
        n_layers: number of stacked layers N
        n_head: number of attention heads
        d_k: per-head dimension of the Q and K projections
        d_v: per-head dimension of the V projection
        d_model: dimension of the original word embedding
        d_inner: hidden dimension of the position-wise feed-forward network
        pad_idx: padding id; sentences have different lengths, so the empty slots are filled with this id,
                 and the network skips computing attention between padding positions and other symbols
        '''
        super().__init__()
        # Word embedding
        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx)
        # Positional encoding
        self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        self.dropout = nn.Dropout(p=dropout)
        # Stack of encoder layers
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=False):
        enc_slf_attn_list = []

        # -- Forward
        enc_output = self.src_word_emb(src_seq)
        if self.scale_emb:
            enc_output *= self.d_model ** 0.5
        enc_output = self.dropout(self.position_enc(enc_output))
        enc_output = self.layer_norm(enc_output)

        # Run through the stacked layers, collecting attention maps if requested
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            enc_slf_attn_list += [enc_slf_attn] if return_attns else []

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,
```
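Unlike the layers above, the full encoder takes integer token ids rather than embeddings. A small sketch with an assumed toy vocabulary and a reduced stack of 2 layers, purely for illustration:
```python
import torch

encoder = Encoder(n_src_vocab=100, d_word_vec=512, n_layers=2, n_head=8,
                  d_k=64, d_v=64, d_model=512, d_inner=2048, pad_idx=0)
src_seq = torch.tensor([[5, 23, 7, 0, 0],      # two sentences padded to length 5
                        [9, 41, 8, 3, 0]])     # with padding id 0
src_mask = (src_seq != 0).unsqueeze(-2)        # same as get_pad_mask below: (2, 1, 5)
enc_output, *_ = encoder(src_seq, src_mask)
print(enc_output.shape)                        # torch.Size([2, 5, 512])
```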
- Decoder
```python
class Decoder(nn.Module):
    ''' A decoder model with self attention mechanism. '''

    def __init__(self, n_trg_vocab, d_word_vec, n_layers, n_head, d_k, d_v, d_model, d_inner, pad_idx, n_position=200, dropout=0.1, scale_emb=False):
        '''
        n_trg_vocab: size of the target vocabulary
        d_word_vec: dimension of the embedding vector, i.e. how many dimensions represent one symbol
        n_layers: number of stacked layers N
        n_head: number of attention heads
        d_k: per-head dimension of the Q and K projections
        d_v: per-head dimension of the V projection
        d_model: dimension of the original word embedding
        d_inner: hidden dimension of the position-wise feed-forward network
        pad_idx: padding id; sentences have different lengths, so the empty slots are filled with this id,
                 and the network skips computing attention between padding positions and other symbols
        '''
        super().__init__()
        # Word embedding
        self.trg_word_emb = nn.Embedding(n_trg_vocab, d_word_vec, padding_idx=pad_idx)
        # Positional encoding
        self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        self.dropout = nn.Dropout(p=dropout)
        # Stack of decoder layers
        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, trg_seq, trg_mask, enc_output, src_mask, return_attns=False):
        dec_slf_attn_list, dec_enc_attn_list = [], []

        # -- Forward
        dec_output = self.trg_word_emb(trg_seq)
        if self.scale_emb:
            dec_output *= self.d_model ** 0.5
        dec_output = self.dropout(self.position_enc(dec_output))
        dec_output = self.layer_norm(dec_output)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output, slf_attn_mask=trg_mask, dec_enc_attn_mask=src_mask)
            dec_slf_attn_list += [dec_slf_attn] if return_attns else []
            dec_enc_attn_list += [dec_enc_attn] if return_attns else []

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output,
```
- Transformer
```python
class Transformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism. '''

    def __init__(
            self, n_src_vocab, n_trg_vocab, src_pad_idx, trg_pad_idx,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1, n_position=200,
            trg_emb_prj_weight_sharing=True, emb_src_trg_weight_sharing=True,
            scale_emb_or_prj='prj'):
        super().__init__()

        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx

        assert scale_emb_or_prj in ['emb', 'prj', 'none']
        scale_emb = (scale_emb_or_prj == 'emb') if trg_emb_prj_weight_sharing else False
        self.scale_prj = (scale_emb_or_prj == 'prj') if trg_emb_prj_weight_sharing else False
        self.d_model = d_model

        # Instantiate the Encoder
        self.encoder = Encoder(
            n_src_vocab=n_src_vocab, n_position=n_position,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            pad_idx=src_pad_idx, dropout=dropout, scale_emb=scale_emb)

        # Instantiate the Decoder
        self.decoder = Decoder(
            n_trg_vocab=n_trg_vocab, n_position=n_position,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            pad_idx=trg_pad_idx, dropout=dropout, scale_emb=scale_emb)

        self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)

        for p in self.parameters():
            if p.dim() > 1:
                # Initialize the weights
                nn.init.xavier_uniform_(p)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
            the dimensions of all module outputs shall be the same.'

        # Weight sharing
        if trg_emb_prj_weight_sharing:
            # Share the weight between target word embedding & last dense layer
            self.trg_word_prj.weight = self.decoder.trg_word_emb.weight

        if emb_src_trg_weight_sharing:
            self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight

    def forward(self, src_seq, trg_seq):
        # Encoder mask: a row of booleans that masks out the padding positions.
        src_mask = get_pad_mask(src_seq, self.src_pad_idx)
        # Decoder mask: a matrix that additionally prevents attending to future positions.
        trg_mask = get_pad_mask(trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)

        enc_output, *_ = self.encoder(src_seq, src_mask)
        dec_output, *_ = self.decoder(trg_seq, trg_mask, enc_output, src_mask)
        seq_logit = self.trg_word_prj(dec_output)
        if self.scale_prj:
            seq_logit *= self.d_model ** -0.5

        return seq_logit.view(-1, seq_logit.size(2))
```
- Mask
```python
def get_pad_mask(seq, pad_idx):
    return (seq != pad_idx).unsqueeze(-2)


def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask
```
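To tie the pieces together, here is a minimal end-to-end sketch; the toy vocabulary size, the token ids, and the reduced n_layers=2 are assumptions made purely for the demo. It also prints get_subsequent_mask for a length-4 sequence to show the lower-triangular structure that blocks attention to future positions.
```python
import torch

print(get_subsequent_mask(torch.zeros(1, 4)))
# tensor([[[ True, False, False, False],
#          [ True,  True, False, False],
#          [ True,  True,  True, False],
#          [ True,  True,  True,  True]]])

model = Transformer(n_src_vocab=100, n_trg_vocab=100, src_pad_idx=0, trg_pad_idx=0,
                    n_layers=2)                 # small stack just for the demo
src_seq = torch.tensor([[5, 23, 7, 0, 0]])      # 1 source sentence, padded to length 5
trg_seq = torch.tensor([[2, 9, 41, 0]])         # 1 target sentence, padded to length 4
seq_logit = model(src_seq, trg_seq)
print(seq_logit.shape)                          # torch.Size([4, 100]):
                                                # one row of vocabulary logits per target position
```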