class LayerNorm(nn.Module):
    """Construct a layernorm module (see citation for details).
    Layer normalization.
    """
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learnable gain (gamma)
        self.b_2 = nn.Parameter(torch.zeros(features))  # learnable bias (beta)
        self.eps = eps

    def forward(self, x):
        # normalize over the last (feature) dimension
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
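A quick sanity check, as an illustrative sketch that is not part of the original code: with the initial parameters (gain of ones, bias of zeros) each position is normalized over the feature dimension, so the per-position mean of the output starts out close to zero. Note that this version adds eps to the standard deviation rather than the variance, so it is only numerically close to torch.nn.LayerNorm, not identical.

import torch

ln = LayerNorm(features=768)
x = torch.randn(2, 16, 768)        # [batch_size, seq_len, hidden], illustrative shape
y = ln(x)
print(y.shape)                     # torch.Size([2, 16, 768])
print(y.mean(-1).abs().max())      # close to 0 before the gain/bias are trained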
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))


class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        :param d_model: dimension of the token embeddings
        :param d_ff: hidden size of the feed-forward layer, usually 4 * d_model
        :param dropout: dropout rate
        """
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))
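As an illustrative shape check (not from the original post, and assuming the GELU module defined later is already in scope), the feed-forward network is applied position-wise: each position is expanded from d_model to d_ff and projected back, so the tensor shape is unchanged.

import torch

ffn = PositionwiseFeedForward(d_model=768, d_ff=3072, dropout=0.1)
x = torch.randn(2, 16, 768)   # [batch_size, seq_len, d_model], illustrative shape
out = ffn(x)
print(out.shape)              # torch.Size([2, 16, 768])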
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """
    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4 * hidden
        :param dropout: dropout rate
        """
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        # self-attention sublayer, then feed-forward sublayer,
        # each wrapped in a pre-norm residual connection
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
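A minimal usage sketch, assuming the MultiHeadedAttention module used above is in scope and accepts the [batch_size, 1, seq_len, seq_len] mask that BERT.forward below constructs; shapes and values here are illustrative, not from the original post:

import torch

block = TransformerBlock(hidden=768, attn_heads=12, feed_forward_hidden=3072, dropout=0.1)
x = torch.randn(2, 16, 768)       # [batch_size, seq_len, hidden]
mask = torch.ones(2, 1, 16, 16)   # nothing masked: every position may attend to every other
out = block(x, mask)
print(out.shape)                  # torch.Size([2, 16, 768])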
Activation function

BERT uses the GELU activation function instead of ReLU.
class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph: note that BERT uses GELU instead of ReLU.
    """
    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
BERT network code
class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """
    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: number of Transformer blocks (layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """
        super(BERT, self).__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, d_model=hidden)

        # multi-layer transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info, position_ids):
        # attention mask for padded tokens, shape [batch_size, 1, seq_len, seq_len]
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embed the indexed sequence into a sequence of vectors
        x = self.embedding(x, segment_info, position_ids)

        # run over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)
        return x
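Finally, a minimal end-to-end sketch, assuming BERTEmbedding and MultiHeadedAttention are defined elsewhere in this implementation with the signatures used above; the vocabulary size, sequence length, and token ids are illustrative, and token id 0 is treated as padding by the mask:

import torch

vocab_size = 30000
model = BERT(vocab_size=vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1)

batch_size, seq_len = 2, 16
tokens = torch.randint(1, vocab_size, (batch_size, seq_len))         # [batch_size, seq_len], ids > 0 so nothing is masked
segment_info = torch.zeros(batch_size, seq_len, dtype=torch.long)    # single-segment input
position_ids = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1)

hidden_states = model(tokens, segment_info, position_ids)
print(hidden_states.shape)   # torch.Size([2, 16, 768])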