How to Train Your BERT

Model Details

1. Residual connections
2. Layer Normalization
3. Masked multi-head attention
4. Output


BN: normalizes the same channel across different samples

LN: normalizes across the different channels within a single sample
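
The difference is easiest to see on a [batch, seq_len, hidden] activation tensor. A minimal NumPy sketch (toy shapes, learnable scale/shift parameters omitted):

import numpy as np

x = np.random.randn(4, 16, 8)  # [batch, seq_len, hidden]

# BN: statistics pooled over samples (and positions) for each channel
bn_mean = x.mean(axis=(0, 1), keepdims=True)  # shape [1, 1, 8]
bn_std = x.std(axis=(0, 1), keepdims=True)
x_bn = (x - bn_mean) / (bn_std + 1e-5)

# LN: statistics pooled over the channels of each individual position
ln_mean = x.mean(axis=-1, keepdims=True)      # shape [4, 16, 1]
ln_std = x.std(axis=-1, keepdims=True)
x_ln = (x - ln_mean) / (ln_std + 1e-5)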

Masked Language Model (Masked LM, MLM)

• 15% of the tokens in each sentence are replaced with [MASK]

• The Transformer encoder does not know which tokens it will be asked to predict, or which have been replaced with random tokens

This forces the model to maintain a distributed contextual representation of every input token.
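
As a toy sketch of the masking procedure (the 80%/10%/10% split follows the original BERT paper; the function name and structure here are illustrative, not code from this post):

import random

def mask_tokens(tokens, vocab, mask_prob=0.15):
    """Select ~15% of positions; replace 80% of them with [MASK],
    10% with a random token, and leave 10% unchanged."""
    masked, labels = list(tokens), [None] * len(tokens)
    for i in range(len(tokens)):
        if random.random() < mask_prob:
            labels[i] = tokens[i]            # target the model must predict
            r = random.random()
            if r < 0.8:
                masked[i] = "[MASK]"
            elif r < 0.9:
                masked[i] = random.choice(vocab)
            # else: keep the original token unchanged
    return masked, labels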

Sentence-pair relationship (Next Sentence Prediction, NSP)

• Complements Masked LM's token-level learning by helping the model understand the relationship between two sentences

• Underpins downstream tasks such as question answering and natural language inference
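
A toy sketch of how NSP training pairs are built (illustrative only; 50% true next sentence, 50% random, as in the BERT paper):

import random

def make_nsp_example(doc, corpus):
    """Return (sentence_a, sentence_b, label): label 1 if sentence_b
    really follows sentence_a in the document, else 0."""
    i = random.randrange(len(doc) - 1)
    if random.random() < 0.5:
        return doc[i], doc[i + 1], 1             # IsNext
    return doc[i], random.choice(corpus), 0      # NotNext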

Step 1: Build the word-embedding lookup, which returns the word embeddings and the embedding table

# call site (inside construct)
word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids)
# operator definition (inside __init__)
self.bert_embedding_lookup = EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=self.embedding_size,
            embedding_shape=output_embedding_shape,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=config.initializer_range)
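
What the lookup computes, sketched in NumPy with toy sizes; the use_one_hot_embeddings path is mathematically the same gather expressed as a matmul:

import numpy as np

vocab_size, hidden = 10, 4                       # toy sizes
embedding_table = np.random.randn(vocab_size, hidden) * 0.02

input_ids = np.array([[1, 7, 7, 2]])             # [batch, seq_len]
word_embeddings = embedding_table[input_ids]     # gather -> [batch, seq_len, hidden]

# use_one_hot_embeddings=True computes the same result as a matmul
one_hot = np.eye(vocab_size)[input_ids]          # [batch, seq_len, vocab]
assert np.allclose(one_hot @ embedding_table, word_embeddings)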

Step 2: Fuse position and token-type (segment) information into the word embeddings

# call site (inside construct)
embedding_output = self.bert_embedding_postprocessor(token_type_ids, word_embeddings)
# operator definition (inside __init__)
self.bert_embedding_postprocessor = EmbeddingPostprocessor(
            embedding_size=self.embedding_size,
            embedding_shape=output_embedding_shape,
            use_token_type=True,
            token_type_vocab_size=config.type_vocab_size,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=0.02,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)
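
Conceptually the postprocessor just adds two learned tables to the word embeddings (dropout omitted); a NumPy sketch with toy sizes:

import numpy as np

seq_len, hidden = 4, 8
word_embeddings = np.random.randn(1, seq_len, hidden)
token_type_ids = np.array([[0, 0, 1, 1]])               # sentence A = 0, sentence B = 1

token_type_table = np.random.randn(2, hidden) * 0.02    # type_vocab_size = 2
position_table = np.random.randn(512, hidden) * 0.02    # max_position_embeddings = 512

embedding_output = (word_embeddings
                    + token_type_table[token_type_ids]  # segment embeddings
                    + position_table[:seq_len])         # learned positions, broadcast over batch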

Step 3: Build the attention mask

# call site (inside construct)
attention_mask = self._create_attention_mask_from_input_mask(input_mask)
# operator definition (inside __init__)
self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config)
# CreateAttentionMaskFromInputMask.construct
def construct(self, input_mask):
        if not self.input_mask_from_dataset:
            input_mask = self.input_mask
        # [batch, seq_length] -> [batch, 1, seq_length]
        input_mask = self.cast(self.reshape(input_mask, self.shape), mstype.float32)
        # [batch, seq_length, 1] x [batch, 1, seq_length] -> [batch, seq_length, seq_length]
        attention_mask = self.batch_matmul(self.broadcast_ones, input_mask)
        return attention_mask
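
In NumPy terms, this turns a per-token padding mask into a [batch, seq_length, seq_length] matrix in which every query row repeats the same key mask:

import numpy as np

input_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)  # 0 marks padding
batch, seq_len = input_mask.shape

ones = np.ones((batch, seq_len, 1), dtype=np.float32)    # broadcast_ones
attention_mask = ones @ input_mask.reshape(batch, 1, seq_len)
# attention_mask[b, i, j] == 1 iff key token j is real, for every query position i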

Step 4: Build the encoder

# bert encoder: call site (inside construct)
encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), attention_mask)
sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)
# operator definition (inside __init__)
self.bert_encoder = BertTransformer(
            batch_size=self.batch_size,
            hidden_size=self.hidden_size,
            seq_length=self.seq_length,
            num_attention_heads=config.num_attention_heads,
            num_hidden_layers=self.num_hidden_layers,
            intermediate_size=config.intermediate_size,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=config.initializer_range,
            hidden_dropout_prob=config.hidden_dropout_prob,
            use_relative_positions=config.use_relative_positions,
            hidden_act=config.hidden_act,
            compute_type=config.compute_type,
            return_all_encoders=True)
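
Inside each encoder layer the mask enters the attention scores additively; a single-head NumPy sketch of the idea (not the BertTransformer code itself):

import numpy as np

def masked_attention(q, k, v, attention_mask):
    # q, k, v: [batch, seq_len, head_dim]; attention_mask: [batch, seq_len, seq_len]
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(q.shape[-1])
    scores += (1.0 - attention_mask) * -10000.0   # masked keys get a large negative bias
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights /= weights.sum(-1, keepdims=True)     # softmax over the key axis
    return weights @ v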

Step 5: Build the pooler

# pooler: call site (inside construct)
sequence_slice = self.slice(sequence_output,
                            (0, 0, 0),
                            (self.batch_size, 1, self.hidden_size),
                            (1, 1, 1))
first_token = self.squeeze_1(sequence_slice)   # [batch, 1, hidden] -> [batch, hidden]
pooled_output = self.dense(first_token)
# operator definitions (inside __init__)
self.slice = P.StridedSlice()
self.squeeze_1 = P.Squeeze(axis=1)
self.dense = nn.Dense(self.hidden_size, self.hidden_size,
                      activation="tanh",
                      weight_init=TruncatedNormal(config.initializer_range))
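
So the pooler reduces to: take the hidden state of the first ([CLS]) token and push it through a tanh dense layer. A NumPy sketch:

import numpy as np

batch, seq_len, hidden = 2, 4, 8
sequence_output = np.random.randn(batch, seq_len, hidden)
W = np.random.randn(hidden, hidden) * 0.02       # stands in for the TruncatedNormal init
b = np.zeros(hidden)

first_token = sequence_output[:, 0, :]           # the [CLS] position
pooled_output = np.tanh(first_token @ W + b)     # [batch, hidden]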

Overview: Constructing BERT with MindSpore

def construct(self, input_ids, token_type_ids, input_mask):
        # embedding
        if not self.token_type_ids_from_dataset:
            token_type_ids = self.token_type_ids
        word_embeddings, embedding_tables = self.bert_embedding_lookup(input_ids)
        embedding_output = self.bert_embedding_postprocessor(token_type_ids,
                                                             word_embeddings)
        # attention mask [batch_size, seq_length, seq_length]
        attention_mask = self._create_attention_mask_from_input_mask(input_mask)
        # bert encoder
        encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output),
                                           attention_mask)
        sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)
        # pooler
        sequence_slice = self.slice(sequence_output,
                                    (0, 0, 0),
                                    (self.batch_size, 1, self.hidden_size),
                                    (1, 1, 1))
        first_token = self.squeeze_1(sequence_slice)
        pooled_output = self.dense(first_token)
        return sequence_output, pooled_output, embedding_tables
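
A hypothetical usage sketch, assuming the BertConfig/BertModel classes these snippets are drawn from (a MindSpore model_zoo-style interface; the exact constructor arguments are assumptions):

import numpy as np
import mindspore as ms
from mindspore import Tensor

# hypothetical: BertConfig/BertModel as defined alongside the snippets above
config = BertConfig(batch_size=2, seq_length=8)   # other fields left at their defaults
model = BertModel(config, is_training=True)

input_ids = Tensor(np.ones((2, 8)), ms.int32)
token_type_ids = Tensor(np.zeros((2, 8)), ms.int32)
input_mask = Tensor(np.ones((2, 8)), ms.int32)

sequence_output, pooled_output, embedding_tables = model(input_ids, token_type_ids, input_mask)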