Transformer模型实现与解析
简介
Transformer是一个利用注意力机制来提高模型训练速度的模型,其的核心思想是自注意力机制(self-attention)——能注意输入序列的不同位置以计算该序列的表示的能力。
参考链接:https://www.tensorflow.org/tutorials/text/transformer?hl=zh-cn
模型结构
模型结构图如下:
这是一个典型的seq2seq模型,结构分为encoder和decoder两部分。左边为encoder,主要由一个多头的注意力层和一个前馈网络组成;右边的为decoder。主要由两个多头注意力层和一个前馈网络组成。transformer相较于其他的seq2seq模型最大的特点在于其使用了多头注意力来替换了rnn。
多头注意力的详细结构如下:
图中右边的是多头注意力结构,这里会首先将q、k、v三个输入连接一个线性层(全连接层)然后再传入一个缩放点积注意力层,这个层的结构画在了左边,最后将注意力的输出连接起来传入线性层中便得到了注意力的结果。
图中左边的缩放点积注意力层也很简单,首先会将q和k相乘,然后将其缩放,然后再对其进行mask处理,最后对其进行softmax,这样便得到了注意力权重,最后将权重与v相乘便可。
代码实现
代码主要分为4部分:数据、模型、训练、预测。
数据处理
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import os
import io
import time
import time
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
(en.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)
tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
(pt.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)
# print("==============")
# sample_string = 'Transformer is awesome.'
#
# tokenized_string = tokenizer_en.encode(sample_string)
# print('Tokenized string is {}'.format(tokenized_string))
#
# original_string = tokenizer_en.decode(tokenized_string)
# print('The original string: {}'.format(original_string))
#
# assert original_string == sample_string
BUFFER_SIZE = 200
BATCH_SIZE = 4
MAX_LENGTH = 40
def encode(lang1, lang2):
lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
lang1.numpy()) + [tokenizer_pt.vocab_size + 1]
lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
lang2.numpy()) + [tokenizer_en.vocab_size + 1]
return lang1, lang2
def filter_max_length(x, y, max_length=MAX_LENGTH):
return tf.logical_and(tf.size(x) <= max_length,
tf.size(y) <= max_length)
def tf_encode(pt, en):
result_pt, result_en = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
result_pt.set_shape([None])
result_en.set_shape([None])
return result_pt, result_en
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# 将数据集缓存到内存中以加快读取速度。
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(BATCH_SIZE)
# pt_batch, en_batch = next(iter(val_dataset))
# print(pt_batch, en_batch)
def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
return pos * angle_rates
def positional_encoding(position, d_model):
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
# 将 sin 应用于数组中的偶数索引(indices);2i
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
# 将 cos 应用于数组中的奇数索引;2i+1
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)
# pos_encoding = positional_encoding(50, 512)
# print(pos_encoding.shape)
#
# plt.pcolormesh(pos_encoding[0], cmap='RdBu')
# plt.xlabel('Depth')
# plt.xlim((0, 512))
# plt.ylabel('Position')
# plt.colorbar()
# plt.show()
def create_padding_mask(seq):
seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
# 添加额外的维度来将填充加到
# 注意力对数(logits)。
return seq[:, tf.newaxis, tf.newaxis, :] # (batch_size, 1, 1, seq_len)
def create_look_ahead_mask(size):
mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
return mask # (seq_len, seq_len)
这里的数据处理是通常的翻译数据处理方式,通过tensorflow_datasets的接口可以方便的下载数据和处理数据。
除了通常的词表处理外,还有一个位置编码和mask操作。位置编码是由于transformer用注意力代替了rnn,而注意力没法像rnn一样获取位置的信息,所以这里需要将单独提取出位置信息,然后添加到数据中去。另一个不一样的地方在于mask,由于注意力机制可以读取到所有的数据,但rnn中是先读取到第一个,这时他是没有下一个位置的信息的。transformer中使用mask来解决这个问题,在第一个位置的时候会遮挡住后面的位置来实现与rnn类似的效果。
模型结构
def scaled_dot_product_attention(q, k, v, mask):
"""计算注意力权重。
q, k, v 必须具有匹配的前置维度。
k, v 必须有匹配的倒数第二个维度,例如:seq_len_k = seq_len_v。
虽然 mask 根据其类型(填充或前瞻)有不同的形状,
但是 mask 必须能进行广播转换以便求和。
参数:
q: 请求的形状 == (..., seq_len_q, depth)
k: 主键的形状 == (..., seq_len_k, depth)
v: 数值的形状 == (..., seq_len_v, depth_v)
mask: Float 张量,其形状能转换成
(..., seq_len_q, seq_len_k)。默认为None。
返回值:
输出,注意力权重
"""
matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k)
# 缩放 matmul_qk
dk = tf.cast(tf.shape(k)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
# 将 mask 加入到缩放的张量上。
if mask is not None:
scaled_attention_logits += (mask * -1e9)
# softmax 在最后一个轴(seq_len_k)上归一化,因此分数
# 相加等于1。
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k)
output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v)
return output, attention_weights
#
# def print_out(q, k, v):
# temp_out, temp_attn = scaled_dot_product_attention(
# q, k, v, None)
# print('Attention weights are:')
# print(temp_attn)
# print('Output is:')
# print(temp_out)
#
#
# np.set_printoptions(suppress=True)
#
# temp_k = tf.constant([[10, 0, 0],
# [0, 10, 0],
# [0, 0, 10],
# [0, 0, 10]], dtype=tf.float32) # (4, 3)
#
# temp_v = tf.constant([[1, 0],
# [10, 0],
# [100, 5],
# [1000, 6]], dtype=tf.float32) # (4, 2)
#
# # 这条 `请求(query)符合第二个`主键(key)`,
# # 因此返回了第二个`数值(value)`。
# temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32) # (1, 3)
# print_out(temp_q, temp_k, temp_v)
#
#
# # 这条请求符合重复出现的主键(第三第四个),
# # 因此,对所有的相关数值取了平均。
# temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32) # (1, 3)
# print_out(temp_q, temp_k, temp_v)
#
# # 这条请求符合第一和第二条主键,
# # 因此,对它们的数值去了平均。
# temp_q = tf.constant([[10, 10, 0]], dtype=tf.float32) # (1, 3)
# print_out(temp_q, temp_k, temp_v)
#
#
# temp_q = tf.constant([[0, 0, 10], [0, 10, 0], [10, 10, 0]], dtype=tf.float32) # (3, 3)
# print_out(temp_q, temp_k, temp_v)
class MultiHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
assert d_model % self.num_heads == 0
self.depth = d_model // self.num_heads
self.wq = tf.keras.layers.Dense(d_model)
self.wk = tf.keras.layers.Dense(d_model)
self.wv = tf.keras.layers.Dense(d_model)
self.dense = tf.keras.layers.Dense(d_model)
def split_heads(self, x, batch_size):
"""分拆最后一个维度到 (num_heads, depth).
转置结果使得形状为 (batch_size, num_heads, seq_len, depth)
"""
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask):
batch_size = tf.shape(q)[0]
q = self.wq(q) # (batch_size, seq_len, d_model)
k = self.wk(k) # (batch_size, seq_len, d_model)
v = self.wv(v) # (batch_size, seq_len, d_model)
q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
# scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
# print(q.shape, k.shape, v.shape, mask.shape)
scaled_attention, attention_weights = scaled_dot_product_attention(
q, k, v, mask)
# print(scaled_attention.shape)
scaled_attention = tf.transpose(scaled_attention,
perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
concat_attention = tf.reshape(scaled_attention,
(batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model)
output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model)
return output, attention_weights
# temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
# y = tf.random.uniform((1, 60, 512)) # (batch_size, encoder_sequence, d_model)
# out, attn = temp_mha(y, k=y, q=y, mask=None)
# print(out.shape, attn.shape)
def point_wise_feed_forward_network(d_model, dff):
return tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu'), # (batch_size, seq_len, dff)
tf.keras.layers.Dense(d_model) # (batch_size, seq_len, d_model)
])
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1):
super(EncoderLayer, self).__init__()
self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
def call(self, x, training, mask):
attn_output, _ = self.mha(x, x, x, mask) # (batch_size, input_seq_len, d_model)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)
ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)
return out2
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1):
super(DecoderLayer, self).__init__()
self.mha1 = MultiHeadAttention(d_model, num_heads)
self.mha2 = MultiHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
self.dropout3 = tf.keras.layers.Dropout(rate)
def call(self, x, enc_output, training,
look_ahead_mask, padding_mask):
# enc_output.shape == (batch_size, input_seq_len, d_model)
attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask) # (batch_size, target_seq_len, d_model)
attn1 = self.dropout1(attn1, training=training)
out1 = self.layernorm1(attn1 + x)
attn2, attn_weights_block2 = self.mha2(
enc_output, enc_output, out1, padding_mask) # (batch_size, target_seq_len, d_model)
attn2 = self.dropout2(attn2, training=training)
out2 = self.layernorm2(attn2 + out1) # (batch_size, target_seq_len, d_model)
ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model)
ffn_output = self.dropout3(ffn_output, training=training)
out3 = self.layernorm3(ffn_output + out2) # (batch_size, target_seq_len, d_model)
return out3, attn_weights_block1, attn_weights_block2
#
# sample_encoder_layer = EncoderLayer(512, 8, 2048)
#
# sample_encoder_layer_output = sample_encoder_layer(
# tf.random.uniform((64, 43, 512)), False, None)
#
# print(sample_encoder_layer_output.shape) # (batch_size, input_seq_len, d_model)
#
#
# sample_decoder_layer = DecoderLayer(512, 8, 2048)
#
# sample_decoder_layer_output, _, _ = sample_decoder_layer(
# tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,
# False, None, None)
#
# print(sample_decoder_layer_output.shape) # (batch_size, target_seq_len, d_model)
class Encoder(tf.keras.layers.Layer):
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
maximum_position_encoding, rate=0.1):
super(Encoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
self.pos_encoding = positional_encoding(maximum_position_encoding,
self.d_model)
self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(rate)
def call(self, x, training, mask):
seq_len = tf.shape(x)[1]
# 将嵌入和位置编码相加。
x = self.embedding(x) # (batch_size, input_seq_len, d_model)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.enc_layers[i](x, training, mask)
return x # (batch_size, input_seq_len, d_model)
class Decoder(tf.keras.layers.Layer):
def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
maximum_position_encoding, rate=0.1):
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(rate)
def call(self, x, enc_output, training,
look_ahead_mask, padding_mask):
seq_len = tf.shape(x)[1]
attention_weights = {}
x = self.embedding(x) # (batch_size, target_seq_len, d_model)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x, block1, block2 = self.dec_layers[i](x, enc_output, training,
look_ahead_mask, padding_mask)
attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
# x.shape == (batch_size, target_seq_len, d_model)
return x, attention_weights
# sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8,
# dff=2048, input_vocab_size=8500,
# maximum_position_encoding=10000)
#
# sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)),
# training=False, mask=None)
#
# print(sample_encoder_output.shape) # (batch_size, input_seq_len, d_model)
#
# sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8,
# dff=2048, target_vocab_size=8000,
# maximum_position_encoding=5000)
#
# output, attn = sample_decoder(tf.random.uniform((64, 26)),
# enc_output=sample_encoder_output,
# training=False, look_ahead_mask=None,
# padding_mask=None)
#
# print(output.shape, attn['decoder_layer2_block2'].shape)
class Transformer(tf.keras.Model):
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
target_vocab_size, pe_input, pe_target, rate=0.1):
super(Transformer, self).__init__()
self.encoder = Encoder(num_layers, d_model, num_heads, dff,
input_vocab_size, pe_input, rate)
self.decoder = Decoder(num_layers, d_model, num_heads, dff,
target_vocab_size, pe_target, rate)
self.final_layer = tf.keras.layers.Dense(target_vocab_size)
def call(self, inp, tar, training, enc_padding_mask,
look_ahead_mask, dec_padding_mask):
enc_output = self.encoder(inp, training, enc_padding_mask) # (batch_size, inp_seq_len, d_model)
# dec_output.shape == (batch_size, tar_seq_len, d_model)
dec_output, attention_weights = self.decoder(
tar, enc_output, training, look_ahead_mask, dec_padding_mask)
final_output = self.final_layer(dec_output) # (batch_size, tar_seq_len, target_vocab_size)
return final_output, attention_weight
这里首先是scaled_dot_product_attention方法,这个方法如同上面图中所画,会先将q和k相乘,然后除以一个缩放值,这个值是其最后一个维度的平方根。然后是mask的处理,在创建mask的时候会将有值的位置设置为0,填充的或需要遮挡的位置设置为1。在处理mask的时候会将mask乘以一个非常大的负数(-1乘以10的9次方),然后将他与缩放后的点积相加,这样mask了的位置便会变成一个很大的负数,这样在随后的softmax操作中其值便会变成0,即mask了的位置的权重为0.最后便是将权重和v相乘得到输出。
然后是MultiHeadAttention类,这个即多头注意力层。如上图所示,他会先将q、k、v分别传入一个线性层(全连接层),然后按照传入的多头的数量拆分出一个新的维度来,然利用拆分后的数据来计算其注意力。计算完注意力后再还原回原来的维度,最后再通过一个线性层(全连接层)得到输出。
然后是point_wise_feed_forward_network方法,这个方法很简单就是两个全连接层。
然后是EncoderLayer类,这里对应的是第一张图中左边框中的内容,主要是一个多头注意力层和一个前馈网络。在call方法中首先会将传入的x作为多头注意力的q、k、v,来获取注意力输出。然后是一个dropout和layerNorm层,这里需要注意的是在做layerNorm前实际是使用残差的思路的,将输入的x和处理后的attn_output相加,对其结果再做layerNorm操作。最后是一个前馈网络和dropout+layerNorm。
接着是DecoderLayer类,他对应的是第一张图中右边框中的内容,这里是两个多头注意力层和一个前馈网络。在call方法中首先是第一个多头注意力层,这里是将输入的x作为q、k、v,然后是与编码层相同的dropout+layerNorm。然后是第二个多头注意力层,这里需要注意的是他输入的q与k、v不再一样,q是上一层的多头的输出,而k、v是编码器的输出。最后再接一个前馈网络。
然后是Encoder和Decoder类,这两个类很相似,都是先将输入的词编码成词向量,然后加上其位置编码,得到的值作为输入,循环指定次数的编码器层或解码器层。
最后的Transformer类也很简单,先执行编码器,然后再执行解码器,最后将解码器的结果传入一个全连接层,获取最后的输出。
模型训练
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
input_vocab_size = tokenizer_pt.vocab_size + 2
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, d_model, warmup_steps=4000):
super(CustomSchedule, self).__init__()
self.d_model = d_model
self.d_model = tf.cast(self.d_model, tf.float32)
self.warmup_steps = warmup_steps
def __call__(self, step):
arg1 = tf.math.rsqrt(step)
arg2 = step * (self.warmup_steps ** -1.5)
return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
epsilon=1e-9)
# temp_learning_rate_schedule = CustomSchedule(d_model)
#
# plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
# plt.ylabel("Learning Rate")
# plt.xlabel("Train Step")
# plt.show()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction='none')
def loss_function(real, pred):
mask = tf.math.logical_not(tf.math.equal(real, 0))
loss_ = loss_object(real, pred)
mask = tf.cast(mask, dtype=loss_.dtype)
loss_ *= mask
return tf.reduce_mean(loss_)
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
name='train_accuracy')
transformer = Transformer(num_layers, d_model, num_heads, dff,
input_vocab_size, target_vocab_size,
pe_input=input_vocab_size,
pe_target=target_vocab_size,
rate=dropout_rate)
def create_masks(inp, tar):
# 编码器填充遮挡
enc_padding_mask = create_padding_mask(inp)
# 在解码器的第二个注意力模块使用。
# 该填充遮挡用于遮挡编码器的输出。
dec_padding_mask = create_padding_mask(inp)
# 在解码器的第一个注意力模块使用。
# 用于填充(pad)和遮挡(mask)解码器获取到的输入的后续标记(future tokens)。
look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
dec_target_padding_mask = create_padding_mask(tar)
# print("look_ahead_mask:", look_ahead_mask.shape)
# print("dec_target_padding_mask:", dec_target_padding_mask.shape)
combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
return enc_padding_mask, combined_mask, dec_padding_mask
checkpoint_path = "../savemodel/transformer"
ckpt = tf.train.Checkpoint(transformer=transformer,
optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
# 如果检查点存在,则恢复最新的检查点。
if ckpt_manager.latest_checkpoint:
ckpt.restore(ckpt_manager.latest_checkpoint)
print('Latest checkpoint restored!!')
EPOCHS = 10
# 该 @tf.function 将追踪-编译 train_step 到 TF 图中,以便更快地
# 执行。该函数专用于参数张量的精确形状。为了避免由于可变序列长度或可变
# 批次大小(最后一批次较小)导致的再追踪,使用 input_signature 指定
# 更多的通用形状。
train_step_signature = [
tf.TensorSpec(shape=(None, None), dtype=tf.int64),
tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]
# @tf.function
@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
tar_inp = tar[:, :-1]
tar_real = tar[:, 1:]
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
# print("enc_padding_mask:", enc_padding_mask.shape)
# print("combined_mask:", combined_mask.shape)
# print("dec_padding_mask:", dec_padding_mask.shape)
with tf.GradientTape() as tape:
predictions, _ = transformer(inp, tar_inp,
True,
enc_padding_mask,
combined_mask,
dec_padding_mask)
loss = loss_function(tar_real, predictions)
gradients = tape.gradient(loss, transformer.trainable_variables)
optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
train_loss(loss)
train_accuracy(tar_real, predictions)
for epoch in range(EPOCHS):
start = time.time()
train_loss.reset_states()
train_accuracy.reset_states()
# inp -> portuguese, tar -> english
for (batch, (inp, tar)) in enumerate(train_dataset):
train_step(inp, tar)
if batch % 50 == 0:
print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
epoch + 1, batch, train_loss.result(), train_accuracy.result()))
if (epoch + 1) % 5 == 0:
ckpt_save_path = ckpt_manager.save()
print('Saving checkpoint for epoch {} at {}'.format(epoch + 1,
ckpt_save_path))
print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1,
train_loss.result(),
train_accuracy.result()))
print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
模型训练与通常的seq2seq模型相同。其中CustomSchedule类是一个用来根据训练批次来动态调整学习率的类,然后是损失函数,其与一般的seq2seq的损失函数相同。然后是创建mask的create_masks方法,这里调用的create_padding_mask方法在前面解析过,他会将输入中的填充数据添加成mask。这里输入的inp会创建两个mask,一个在编码器使用,一个在解码器使用。然后是输出的tar会先做一个逐步向前的mask和一个填充的mask,然后将其合并。
最后需要说明的是train_step方法。这里其实也很简单,首先是根据输入输出创建mask,然后将输入输出传入到transformer中,获取预测值,最后再计算损失更新参数。
模型预测
def evaluate(inp_sentence):
start_token = [tokenizer_pt.vocab_size]
end_token = [tokenizer_pt.vocab_size + 1]
# 输入语句是葡萄牙语,增加开始和结束标记
inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
encoder_input = tf.expand_dims(inp_sentence, 0)
# 因为目标是英语,输入 transformer 的第一个词应该是
# 英语的开始标记。
decoder_input = [tokenizer_en.vocab_size]
output = tf.expand_dims(decoder_input, 0)
for i in range(MAX_LENGTH):
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
encoder_input, output)
# predictions.shape == (batch_size, seq_len, vocab_size)
predictions, attention_weights = transformer(encoder_input,
output,
False,
enc_padding_mask,
combined_mask,
dec_padding_mask)
# 从 seq_len 维度选择最后一个词
predictions = predictions[:, -1:, :] # (batch_size, 1, vocab_size)
predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
# 如果 predicted_id 等于结束标记,就返回结果
if predicted_id == tokenizer_en.vocab_size + 1:
return tf.squeeze(output, axis=0), attention_weights
# 连接 predicted_id 与输出,作为解码器的输入传递到解码器。
output = tf.concat([output, predicted_id], axis=-1)
return tf.squeeze(output, axis=0), attention_weights
def plot_attention_weights(attention, sentence, result, layer):
fig = plt.figure(figsize=(16, 8))
sentence = tokenizer_pt.encode(sentence)
attention = tf.squeeze(attention[layer], axis=0)
for head in range(attention.shape[0]):
ax = fig.add_subplot(2, 4, head + 1)
# 画出注意力权重
ax.matshow(attention[head][:-1, :], cmap='viridis')
fontdict = {'fontsize': 10}
ax.set_xticks(range(len(sentence) + 2))
ax.set_yticks(range(len(result)))
ax.set_ylim(len(result) - 1.5, -0.5)
ax.set_xticklabels(
['<start>'] + [tokenizer_pt.decode([i]) for i in sentence] + ['<end>'],
fontdict=fontdict, rotation=90)
ax.set_yticklabels([tokenizer_en.decode([i]) for i in result
if i < tokenizer_en.vocab_size],
fontdict=fontdict)
ax.set_xlabel('Head {}'.format(head + 1))
plt.tight_layout()
plt.show()
def translate(sentence, plot=''):
result, attention_weights = evaluate(sentence)
predicted_sentence = tokenizer_en.decode([i for i in result
if i < tokenizer_en.vocab_size])
print('Input: {}'.format(sentence))
print('Predicted translation: {}'.format(predicted_sentence))
if plot:
plot_attention_weights(attention_weights, sentence, result, plot)
translate("este é um problema que temos que resolver.")
print ("Real translation: this is a problem we have to solve .")
translate("os meus vizinhos ouviram sobre esta ideia.")
print ("Real translation: and my neighboring homes heard about this idea .")
这里主要是调用训练好的模型,翻译文本。入口方法是translate。