Configuration file
The bert-base configuration:
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
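In the library itself the BertConfig class wraps this file. As a minimal sketch (the file name "bert_config.json" is an assumption, not the library API), the JSON can be loaded into an attribute-style object:

import json
from types import SimpleNamespace

# hypothetical path to the JSON shown above
with open("bert_config.json") as f:
    config = SimpleNamespace(**json.load(f))

print(config.hidden_size)          # 768
print(config.num_attention_heads)  # 12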
Input processing
The input is the sum of three embeddings: word_embedding, position_embedding, and token_type_embedding.
import torch
import torch.nn as nn

# BertLayerNorm is defined earlier in the source file; it behaves like nn.LayerNorm.
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        # word embedding (V, H)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        # position embedding (P, H)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # segment (token type) embedding (T, H), where T = type_vocab_size = 2
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        # position indices (0, 1, 2, ..., seq_length - 1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        # broadcast position_ids to the shape of input_ids, (B, seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        words_embeddings = self.word_embeddings(input_ids)                   # (B, seq_length, H)
        position_embeddings = self.position_embeddings(position_ids)        # (B, seq_length, H)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)  # (B, seq_length, H)
        # sum the three embeddings, then apply LayerNorm and dropout
        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
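A quick shape check for the embedding layer (a sketch, not from the source; BertLayerNorm is assumed to behave like nn.LayerNorm, and layer_norm_eps uses BERT's usual 1e-12):

import torch
import torch.nn as nn
from types import SimpleNamespace

BertLayerNorm = nn.LayerNorm  # assumed stand-in for the source's BertLayerNorm

config = SimpleNamespace(vocab_size=30522, hidden_size=768,
                         max_position_embeddings=512, type_vocab_size=2,
                         hidden_dropout_prob=0.1, layer_norm_eps=1e-12)
emb = BertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (B=2, seq_length=16)
print(emb(input_ids).shape)  # torch.Size([2, 16, 768])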
Self-attention
In BERT the hidden size equals the number of attention heads times the size of each head (for bert-base, 768 = 12 × 64). The input and output of the self-attention layer have the same shape, (B, seq_len, H).
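Before the source code, a standalone shape check (a sketch, not part of the source) showing that splitting H into heads and merging back preserves (B, seq_len, H):

import torch

B, seq_len, H, num_heads = 2, 16, 768, 12
head_size = H // num_heads  # 64 for bert-base

x = torch.randn(B, seq_len, H)
# split: (B, seq_len, H) -> (B, num_heads, seq_len, head_size)
heads = x.view(B, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
# merge: (B, num_heads, seq_len, head_size) -> (B, seq_len, H)
merged = heads.permute(0, 2, 1, 3).contiguous().view(B, seq_len, H)
assert merged.shape == x.shape  # self-attention preserves the input shape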
class BertSelfAttention(nn.Module):
    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
        super(BertSelfAttention, self).__init__()
        # the hidden size must be a multiple of the number of attention heads
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = output_attentions
        self.keep_multihead_output = keep_multihead_output
        self.multihead_output = None
        self.num_attention_heads = config.num_attention_heads  # number of attention heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # size of each head, H / num_heads