We take a simple approach: truncate each sample to its first 512 characters, then randomly mask some tokens. Of the masked tokens, 80% are replaced with a special symbol such as [MASK], 10% are replaced with a random token, and the remaining 10% keep the original token. The reference implementation from the transformers open-source code is as follows:
from typing import Tuple

import torch


def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability self.mlm_probability, which defaults to 0.15 in BERT/RoBERTa)
    probability_matrix = torch.full(labels.shape, self.mlm_probability)
    special_tokens_mask = [
        self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if self.tokenizer._pad_token is not None:
        padding_mask = labels.eq(self.tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
The code above processes one batch at a time: inputs is assumed to have shape (m, 512), where m is the batch size and 512 is the sequence length.
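Note how the two constants produce the 80/10/10 split: the p = 0.8 draw picks the [MASK] replacements among masked positions, and the p = 0.5 draw acts only on the remaining 20%, so random replacements cover 0.2 × 0.5 = 10% of masked positions and the final 10% fall through unchanged. A quick standalone check (a minimal sketch with dummy sizes, independent of the code above):

import torch

m, seq_len = 8, 512                      # dummy batch size and sequence length
masked = torch.bernoulli(torch.full((m, seq_len), 0.15)).bool()
replaced = torch.bernoulli(torch.full((m, seq_len), 0.8)).bool() & masked
random_ = torch.bernoulli(torch.full((m, seq_len), 0.5)).bool() & masked & ~replaced
total = masked.sum().item()
print(replaced.sum().item() / total)                         # ~0.8
print(random_.sum().item() / total)                          # ~0.1, i.e. 0.2 * 0.5
print((masked & ~replaced & ~random_).sum().item() / total)  # ~0.1, kept unchanged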
Key functions:
torch.full: fills a tensor with a specified value.
masked_fill_: replaces, in place, the positions where the mask is True with the specified value.
torch.bernoulli: draws 0/1 samples from a Bernoulli distribution with the given per-element probabilities.
torch.randint: generates random integers.
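A minimal sketch of masked_fill_ and torch.randint on a toy tensor (illustrative only; the vocabulary size 30000 is made up):

import torch

probs = torch.full((2, 4), 0.15)                 # tensor filled with a single value
mask = torch.tensor([[True, False, False, True],
                     [False, True, False, False]])
probs.masked_fill_(mask, value=0.0)              # in place: 0.0 wherever mask is True
print(probs)
ids = torch.randint(30000, (2, 4), dtype=torch.long)  # random ids drawn from [0, 30000)
print(ids)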
Rewriting this method in TensorFlow runs into several problems:
1. Tensors do not allow in-place modification. An assignment such as labels[~masked_indices] = -100 raises an error (see the sketch after this list).
2. With non-eager tensors, code like the following fails:
    special_tokens_mask = [
        self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
because labels.tolist() raises an error outside eager mode.
3. PyTorch performs this on a whole batch, dynamically at training time. To do the same, TensorFlow has to go through tf.data's map function, i.e. process one sample at a time, and the mapped dataset should not be cached during training, since the random values must differ on every pass.
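A minimal sketch of problem 1 and its functional workaround with tf.where (TF 2.x eager; the values are made up):

import tensorflow as tf

labels = tf.constant([[5, 17, 99, 3]])
masked_indices = tf.constant([[False, True, True, False]])
# labels[~masked_indices] = -100   # TypeError: EagerTensor does not support item assignment
labels = tf.where(masked_indices, labels, -100)  # keep masked positions, set the rest to -100
print(labels)  # tf.Tensor([[-100   17   99 -100]], ...)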
Solutions
1. tf.py_function
With tf.py_function, tensors can be manipulated as if they were NumPy arrays. This is very flexible but slows processing down. For example:
import numpy as np
import tensorflow as tf
from transformers import PreTrainedTokenizer


def mask_tokens(input_ids, tokenizer: PreTrainedTokenizer = tokenizer, mlm: bool = True,
                mlm_probability: float = 0.15):
    input_ids = input_ids.numpy()
    labels = input_ids.copy()
    probability_matrix = np.full(labels.shape, mlm_probability, dtype=np.float32)
    ############################################################################################
    # Approach 1
    special_tokens_mask = tokenizer.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True)
    probability_matrix = np.where(special_tokens_mask, 0.0, probability_matrix)
    if tokenizer._pad_token is not None:
        probability_matrix = np.where(labels == tokenizer.pad_token_id, 0.0, probability_matrix)
    ############################################################################################
    ##################################################################################################
    # Approach 2: too slow, discarded
    # mask_ignore = list(input_ids[i] in tokenizer.all_special_ids for i in range(data_args.block_size))
    # probability_matrix[mask_ignore] = 0.0
    ##################################################################################################
    masked_indices = np.random.binomial(n=1, p=probability_matrix).astype(bool)  # True marks positions to predict
    labels[~masked_indices] = -1  # We only compute loss on masked tokens
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = np.random.binomial(n=1, p=np.full(labels.shape, 0.8)).astype(bool) & masked_indices
    input_ids[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    # 10% of the time, we replace masked input tokens with a random word
    indices_random = np.random.binomial(n=1, p=np.full(labels.shape, 0.5)).astype(bool) & masked_indices \
                     & ~indices_replaced
    # np.random.randint: when `high` is None (the default), results are drawn from [0, `low`)
    random_words = np.random.randint(len(tokenizer), size=labels.shape, dtype=np.int32)
    input_ids[indices_random] = random_words[indices_random]
    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return input_ids, labels
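A quick eager-mode smoke test of this function (a sketch; it assumes a loaded `tokenizer` and a recent transformers version, and works only because `ids` is an eager tensor, so `.numpy()` is available):

import tensorflow as tf

enc = tokenizer("一段用于测试的文本", max_length=16, padding="max_length", truncation=True)
ids = tf.constant(enc["input_ids"], dtype=tf.int32)
masked_ids, labels = mask_tokens(ids)
print(masked_ids)  # some positions replaced by the [MASK] id or a random id
print(labels)      # -1 everywhere except the sampled positions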
def file_based_input_fn_builder(input_file, seq_length):
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
        "attention_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
    }

    def _decode_record(record, tokenizer: PreTrainedTokenizer = tokenizer, mlm: bool = True,
                       mlm_probability: float = 0.15):
        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        example = tf.io.parse_single_example(serialized=record, features=name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, dtype=tf.int32)
            example[name] = t
        input_ids = example.pop("input_ids")
        input_ids, label_ids = tf.py_function(mask_tokens, inp=[input_ids], Tout=(tf.int32, tf.int32))
        input_ids.set_shape([data_args.block_size])
        label_ids.set_shape([data_args.block_size])
        example["input_ids"] = input_ids
        return example, label_ids

    d = tf.data.TFRecordDataset(input_file)
    d = d.map(_decode_record, num_parallel_calls=-1)
    return d
import time

# data_args.block_size is the sequence length
train_dataset = file_based_input_fn_builder('./train.tfrecord', data_args.block_size)

# timing test
aa = time.time()
len__data = train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
print(len__data)  # 20000
print(time.time() - aa)
train_dataset = (
    train_dataset.cache()
    .shuffle(len__data)
    .batch(4)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
for _ in train_dataset:
    pass
print(time.time() - aa)
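One caveat with this timing setup: because .cache() sits after the masking map, the random masks are materialized once and replayed on every epoch, which contradicts point 3 above. For real training, a safer layout caches only the deterministic parsing step (a sketch; `parse_example` and `apply_masking` are hypothetical names standing for the parsing and masking halves of _decode_record):

d = tf.data.TFRecordDataset(input_file)
d = d.map(parse_example).cache()  # deterministic parsing: safe to cache
d = d.map(apply_masking,          # random masking: must run fresh every epoch
          num_parallel_calls=tf.data.experimental.AUTOTUNE)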
Among the positions eligible for masking, some should be ignored, such as padding characters and the start/end markers. The following form used in the code is rather slow:

    # Approach 2: too slow, discarded
    mask_ignore = list(input_ids[i] in tokenizer.all_special_ids for i in range(data_args.block_size))
    probability_matrix[mask_ignore] = 0.0

With it, the timing test prints 408.34114956855774 and 819.2113552093506.
The following form saves time:

    # Approach 1
    special_tokens_mask = tokenizer.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True)
    probability_matrix = np.where(special_tokens_mask, 0.0, probability_matrix)
    if tokenizer._pad_token is not None:
        probability_matrix = np.where(labels == tokenizer.pad_token_id, 0.0, probability_matrix)

With it, the timing test prints 139.19419836997986 and 279.870977640152, and one training epoch takes 987.9169714450836 seconds.
2. Pure TensorFlow ops (tf.function-style)
def mask_tokens(input_ids, tokenizer: PreTrainedTokenizer = tokenizer, mlm: bool = True,
                mlm_probability: float = 0.15):
    # probability_matrix = tf.ones_like(input_ids, dtype=tf.float32) * mlm_probability
    probability_matrix = tf.fill(input_ids.shape, mlm_probability)
    # ################################################################################################
    # # Raises an error: .numpy() is unavailable on non-eager tensors
    # special_tokens_mask = tf.cast(tokenizer.get_special_tokens_mask(input_ids.numpy(),
    #                                                                 already_has_special_tokens=True), tf.bool)
    # probability_matrix = tf.where(special_tokens_mask, 0.0, probability_matrix)
    #
    # if tokenizer._pad_token is not None:
    #     probability_matrix = tf.where(input_ids == tokenizer.pad_token_id, 0.0, probability_matrix)
    # ################################################################################################
    mask_ignores = tf.cast(tf.reduce_max(tf.cast(tf.stack(list(input_ids == tokenizer.all_special_ids[i]
                                                               for i in range(len(tokenizer.all_special_ids))),
                                                          axis=0),
                                                 dtype=tf.int32), axis=0),
                           tf.bool)
    probability_matrix = tf.where(mask_ignores, 0.0, probability_matrix)
    # masked_indices = np.random.binomial(n=1, p=probability_matrix).astype(bool)  # True marks positions to predict
    masked_indices = tf.cast(tf.keras.backend.random_binomial(input_ids.shape, probability_matrix), tf.bool)
    # labels[~masked_indices] = -1  # We only compute loss on masked tokens
    labels = tf.where(masked_indices, input_ids, -1)
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = tf.cast(tf.keras.backend.random_binomial(input_ids.shape, tf.fill(input_ids.shape, 0.8)),
                               tf.bool) & masked_indices
    input_ids = tf.where(indices_replaced, tokenizer.convert_tokens_to_ids(tokenizer.mask_token), input_ids)
    # 10% of the time, we replace masked input tokens with a random word
    indices_random = tf.cast(tf.keras.backend.random_binomial(input_ids.shape, tf.fill(input_ids.shape, 0.5)),
                             tf.bool) & masked_indices & ~indices_replaced
    # random_words = np.random.randint(len(tokenizer), size=labels.shape, dtype=np.int32)
    random_words = tf.random.uniform(shape=input_ids.shape, maxval=len(tokenizer), dtype=tf.int32)
    input_ids = tf.where(indices_random, random_words, input_ids)
    return input_ids, labels
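tf.keras.backend.random_binomial has been deprecated in newer TensorFlow releases; if it is unavailable, the same Bernoulli draws can be written with tf.random.uniform (a sketch under that assumption, reusing input_ids and probability_matrix from the function above; bernoulli_mask is a hypothetical helper):

def bernoulli_mask(shape, p):
    # True with probability p, elementwise; p may be a scalar or a tensor of `shape`
    return tf.random.uniform(shape) < p

masked_indices = bernoulli_mask(tf.shape(input_ids), probability_matrix)
indices_replaced = bernoulli_mask(tf.shape(input_ids), 0.8) & masked_indices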
def file_based_input_fn_builder(input_file, seq_length):
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
        "attention_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
    }

    def _decode_record(record, tokenizer: PreTrainedTokenizer = tokenizer, mlm: bool = True,
                       mlm_probability: float = 0.15):
        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        example = tf.io.parse_single_example(serialized=record, features=name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, dtype=tf.int32)
            example[name] = t
        input_ids = example.pop("input_ids")
        input_ids, label_ids = mask_tokens(input_ids)
        input_ids.set_shape([data_args.block_size])
        label_ids.set_shape([data_args.block_size])
        example["input_ids"] = input_ids
        return example, label_ids

    # _decode_record(next(iter(train_dataset)), tokenizer)
    d = tf.data.TFRecordDataset(input_file)
    d = d.map(_decode_record, num_parallel_calls=-1)
    return d
train_dataset = file_based_input_fn_builder('./train.tfrecord', data_args.block_size)

aa = time.time()
len__data = train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
print(len__data)  # 20000
print(time.time() - aa)  # 0.6985440254211426
train_dataset = (
    train_dataset.cache()
    .shuffle(len__data)
    .batch(4)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
for _ in train_dataset:
    pass
print(time.time() - aa)  # 1.6970477104187012
With non-eager tensors, input_ids.numpy() raises an error, which is why the corresponding block above is commented out.
Key functions:
tf.where: reassigns tensor values elementwise, similar to np.where.
tf.fill: fills a tensor with a specified value.
tf.random.uniform: with dtype set to tf.int32, generates random integers.
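A toy illustration of the three calls (eager TF 2.x; shapes and the vocabulary size are made up):

import tensorflow as tf

probs = tf.fill([2, 4], 0.15)                    # tensor filled with a single value
cond = tf.constant([[True, False, True, False],
                    [False, False, True, True]])
probs = tf.where(cond, 0.0, probs)               # 0.0 wherever cond is True
ids = tf.random.uniform([2, 4], maxval=30000, dtype=tf.int32)  # random integer ids
print(probs)
print(ids)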
The timing test now prints 0.6985440254211426 and 1.6970477104187012: data processing is noticeably faster. One training epoch now takes 850.8861315250397 seconds.