# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from six.moves import xrange
def mask(batch_tokens,
seg_labels,
mask_word_tags,
total_token_num,
vocab_size,
CLS=1,
SEP=2,
MASK=3):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded;
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
mask_word = mask_word_tags[sent_index]
prob_index += pre_sent_len
if mask_word:
beg = 0
for token_index, token in enumerate(sent):
seg_label = seg_labels[sent_index][token_index]
if seg_label == 1:
continue
if beg == 0:
if seg_label != -1:
beg = token_index
continue
prob = prob_mask[prob_index + beg]
if prob > 0.15:
pass
else:
ernie_mask部分代码阅读
最新推荐文章于 2022-11-05 10:12:25 发布
本文深入探讨了Ernie模型中mask部分的源代码,解析关键实现细节,帮助理解预训练模型在自然语言处理任务中的工作原理。
摘要由CSDN通过智能技术生成