Bert Pretrain
预训练过程使用了Google基于TensorFlow发布的BERT源代码。首先需要从原始文件中创建训练数据,为此下面建立了一个基于空格的分词器。
class WhitespaceTokenizer(object):
    """Whitespace-based tokenizer backed by a fixed vocabulary.

    Tokens that are not present in the vocabulary are replaced by ``"[UNK]"``.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary from ``vocab_file`` and build its inverse mapping."""
        self.vocab = load_vocab(vocab_file)
        # Inverse lookup: vocabulary values back to their keys.
        self.inv_vocab = dict(
            (index, token) for token, index in self.vocab.items()
        )

    def tokenize(self, text):
        """Split ``text`` with ``whitespace_tokenize`` and map unknown tokens to ``"[UNK]"``."""
        return [
            piece if piece in self.vocab else "[UNK]"
            for piece in whitespace_tokenize(text)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Look up ``tokens`` in the vocabulary via ``convert_by_vocab``."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Look up ``ids`` in the inverse vocabulary via ``convert_by_vocab``."""
        return convert_by_vocab(self.inv_vocab, ids)
由于预训练去除了NSP(下一句预测)任务,因此将每篇文档切分成多个最大长度为256的段;如果最后一个段的长度不超过256/2,则将其丢弃。每一个段按照BERT原文的方式执行掩码语言模型的掩码处理,然后保存为TFRecord格式。
def create_segments_from_document(document, max_segment_length):
    """Split a single document into segments of at most ``max_segment_length`` items.

    Args:
        document: A list holding exactly one sequence (the document's tokens).
        max_segment_length: Maximum number of items per segment.

    Returns:
        A list of consecutive slices of the token sequence. Every full-length
        segment is kept. A shorter trailing segment is kept only when it is
        strictly longer than half of ``max_segment_length``; otherwise it is
        dropped.

    Raises:
        AssertionError: If ``document`` does not contain exactly one sequence.
    """
    assert len(document) == 1
    tokens = document[0]
    total_len = len(tokens)

    # Start offset of every segment; range() never yields total_len itself.
    boundaries = list(range(0, total_len, max_segment_length))

    tail_len = total_len % max_segment_length
    # Close the last segment when it is a full one (tail_len == 0) or a
    # partial one that is long enough. Without the tail_len == 0 case the
    # final full segment of an exactly divisible document would be lost.
    if tail_len == 0 or tail_len > max_segment_length / 2:
        boundaries.append(total_len)

    return [tokens[start:end] for start, end in zip(boundaries, boundaries[1:])]
在训练过程中,也只执行掩码语言模型任务,因此不再计算下一句预测任务的loss。
# Only the masked-LM objective is trained, so its loss is used directly as
# the total loss (no next-sentence-prediction term is added).
masked_lm_outputs = get_masked_lm_output(
    bert_config,
    model.get_sequence_output(),
    model.get_embedding_table(),
    masked_lm_positions,
    masked_lm_ids,
    masked_lm_weights,
)
masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = masked_lm_outputs
total_loss = masked_lm_loss
为了适配句子的长度,以及减小模型的训练时间,采用了BERT-mini模型,配置如下。
{
"hidden_size": 256,
"hidden_act": "gelu",
"initializer_range": 0.02,
"vocab_size": 5981,
"hidden_dropout_prob": 0.1,
"num_attention_heads": 4,
"type_vocab_size": 2,
"max_position_embeddings": 256,
"num_hidden_layers": 4,
"intermediate_size": 1024,
"attention_probs_dropout_prob": 0.1
}
{'hidden_size': 256,
'hidden_act': 'gelu',
'initializer_range': 0.02,
'vocab_size': 5981,
'hidden_dropout_prob': 0.1,
'num_attention_heads': 4,
'type_vocab_size': 2,
'max_position_embeddings': 256,
'num_hidden_layers': 4,
'intermediate_size': 1024,
'attention_probs_dropout_prob': 0.1}
最后将TensorFlow的检查点转换为PyTorch的权重。
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.
        bert_config_file: JSON file passed to ``BertConfig.from_json_file``.
        pytorch_dump_path: Destination passed to ``torch.save``.
    """
    # Build an empty PyTorch model that matches the configuration.
    config = BertConfig.from_json_file(bert_config_file)
    build_message = "Building PyTorch model from configuration: {}".format(str(config))
    print(build_message)
    model = BertForPreTraining(config)

    # Copy the TensorFlow checkpoint weights into the PyTorch model.
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Persist only the parameters (state dict), not the whole module object.
    save_message = "Save PyTorch model to {}".format(pytorch_dump_path)
    print(save_message)
    state = model.state_dict()
    torch.save(state, pytorch_dump_path)
微调时,将最后一层第一个token(即[CLS])的隐藏向量作为句子的表示,然后输入到softmax层进行分类。
# Run the encoder; it returns the per-token outputs and the pooled output.
sequence_output, pooled_output = self.bert(
    input_ids=input_ids,
    token_type_ids=token_type_ids,
)

# Sentence representation: the pooled output when self.pooled is set,
# otherwise the hidden vector at position 0 of the sequence output.
reps = pooled_output if self.pooled else sequence_output[:, 0, :]  # sen_num x 256

# Dropout is applied only while the module is in training mode.
if self.training:
    reps = self.dropout(reps)