1. Transformer model
Transformer is a classic NLP model proposed by a Google team in 2017, and BERT, which is currently very popular, is also built on the Transformer. The Transformer relies on the Self-Attention mechanism rather than the sequential structure of an RNN, so the model can be trained in parallel, and every position can attend to global information about the sequence.
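To make the Self-Attention computation concrete, here is a minimal sketch of scaled dot-product attention, the core operation inside each attention head. The helper name scaled_dot_product_attention is ours, not from the original post:

import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [batch, n_head, seq_len, d_head]
    d_head = q.size(-1)
    # similarity of every query with every key, scaled by sqrt(d_head)
    scores = torch.matmul(q, k.transpose(-2, -1)) / d_head ** 0.5
    if mask is not None:
        # masked positions get -inf so softmax gives them ~0 weight
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = torch.softmax(scores, dim=-1)
    # weighted sum of values: each position attends to all positions
    return torch.matmul(attn, v)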
2. Encoder implementation (PyTorch)
import torch
import torch.nn as nn

class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, pre_lnorm, device, dropout):
        super(EncoderLayer, self).__init__()
        # self-attention part
        self.self_attn = MultiHeadAttention(hidden_size, n_head, device)
        self.self_attn_norm = nn.LayerNorm(hidden_size)
        # feed forward network part
        self.pff = PositionwiseFeedForward(hidden_size, filter_size, dropout)
        self.pff_norm = nn.LayerNorm(hidden_size)
        self.pre_lnorm = pre_lnorm

    def forward(self, src, src_mask):
        if self.pre_lnorm:
            # pre-LN: normalize first, then sublayer, then residual connection
            pre = self.self_attn_norm(src)
            src = src + self.self_attn(pre, pre, pre, src_mask)
            pre = self.pff_norm(src)
            src = src + self.pff(pre)  # residual connection
        else:
            # post-LN (as in the original paper): residual connection + layerNorm
            src = self.self_attn_norm(
                src + self.self_attn(src, src, src, src_mask))
            # residual connection + layerNorm
            src = self.pff_norm(src + self.pff(src))
        return src
# the original snippet is truncated here; below is a standard sketch that
# stacks n_layer copies of EncoderLayer (n_layer and the final pre-LN
# LayerNorm are our assumptions, and input is assumed already embedded)
class Encoder(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, n_layer,
                 pre_lnorm, device, dropout):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(hidden_size, filter_size, n_head,
                         pre_lnorm, device, dropout)
            for _ in range(n_layer)
        ])
        # with pre-LN, one extra LayerNorm is usually applied after the stack
        self.last_norm = nn.LayerNorm(hidden_size) if pre_lnorm else None

    def forward(self, src, src_mask):
        for layer in self.layers:
            src = layer(src, src_mask)
        if self.last_norm is not None:
            src = self.last_norm(src)
        return src
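The EncoderLayer above depends on MultiHeadAttention and PositionwiseFeedForward, which are not shown in the original snippet. The sketches below are our assumption of the standard definitions, matching the constructor and call signatures used above and reusing the scaled_dot_product_attention helper from section 1:

class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, n_head, device):
        super(MultiHeadAttention, self).__init__()
        assert hidden_size % n_head == 0
        self.n_head = n_head
        self.d_head = hidden_size // n_head
        # one projection each for queries, keys, values, plus the output
        self.w_q = nn.Linear(hidden_size, hidden_size)
        self.w_k = nn.Linear(hidden_size, hidden_size)
        self.w_v = nn.Linear(hidden_size, hidden_size)
        self.w_o = nn.Linear(hidden_size, hidden_size)
        self.to(device)

    def forward(self, q, k, v, mask=None):
        batch = q.size(0)
        # project, then split into n_head heads: [batch, n_head, seq_len, d_head]
        q = self.w_q(q).view(batch, -1, self.n_head, self.d_head).transpose(1, 2)
        k = self.w_k(k).view(batch, -1, self.n_head, self.d_head).transpose(1, 2)
        v = self.w_v(v).view(batch, -1, self.n_head, self.d_head).transpose(1, 2)
        out = scaled_dot_product_attention(q, k, v, mask)
        # merge the heads back: [batch, seq_len, hidden_size]
        out = out.transpose(1, 2).contiguous().view(
            batch, -1, self.n_head * self.d_head)
        return self.w_o(out)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, hidden_size, filter_size, dropout):
        super(PositionwiseFeedForward, self).__init__()
        # two linear layers applied to every position independently
        self.net = nn.Sequential(
            nn.Linear(hidden_size, filter_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(filter_size, hidden_size),
        )

    def forward(self, x):
        return self.net(x)

With all of the pieces in place, a quick shape check of the full encoder stack (all hyperparameter values below are illustrative):

encoder = Encoder(hidden_size=512, filter_size=2048, n_head=8,
                  n_layer=6, pre_lnorm=True, device='cpu', dropout=0.1)
src = torch.randn(2, 10, 512)        # [batch, seq_len, hidden_size]
src_mask = torch.ones(2, 1, 1, 10)   # 1 = attend, 0 = masked out
out = encoder(src, src_mask)         # -> [2, 10, 512]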