A Chinese-to-English translation model based on the `transformer`
- References
- I. Corpus preprocessing and vectorization
- II. Positional encoding
- III. Masking: masking tokens in a sequence that meet certain conditions
- IV. Scaled dot-product attention
- V. Multi-head attention
- VI. Point-wise feed-forward network
- VII. Encoder and decoder
- VIII. Transformer
- IX. Hyperparameter configuration
- X. Optimizer
- XI. Loss and metrics
- XII. Training and checkpointing
- XIII. Evaluation
- Closing remarks
References
import io
import re
import jieba
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import zhconv
import json
import matplotlib
from matplotlib.font_manager import FontManager
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import tokenizer_from_json
plt.rcParams['font.sans-serif']=['SimHei']
tf.__version__
TensorFlow version used for this experiment: 2.6.0
I. Corpus preprocessing and vectorization
data_dir = '../datasets/cmn-eng/cmn.txt'
Path to the dataset after extraction
Dataset preview
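The preview below can be reproduced by reading the first few raw lines of the tab-separated file; a minimal sketch, using the same `io.open` call as `create_dataset` further down:
# Peek at the first few raw lines of the corpus (English \t Chinese \t attribution)
print(io.open(data_dir, encoding='utf-8').readlines()[:5])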
['Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)\n',
'Hi.\t你好。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4857568 (musclegirlxyp)\n',
'Run.\t你用跑的。\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #3748344 (egg0073)\n',
'Wait!\t等等!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #4970122 (wzhd)\n',
'Wait!\t等一下!\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #5092613 (mirrorvan)\n']
# English preprocessing
def preprocess_eng(w):
    w = w.lower().strip()
    # Insert a space between a word and the punctuation that follows it,
    # e.g.: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r"[' ']+", " ", w)
    # Replace everything except (a-z, A-Z, ".", "?", "!", ",") with a space
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    # Add start and end tokens so the model knows when to start and stop predicting
    w = '<start> ' + w + ' <end>'
    return w
w = re.sub(r"([?.!,¿])", r" \1 ", w)  # separate punctuation from the text
w = re.sub(r"[' ']+", " ", w)  # collapse extra spaces
w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)  # replace unexpected characters
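As a quick check (not part of the original article), the sample sentence from the comment is normalized like this:
print(preprocess_eng("He is a boy."))
# <start> he is a boy . <end>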
# Chinese preprocessing
def preprocess_zh(w):
w = zhconv.convert(w, "zh-cn")
w = ' '.join(jieba.cut(w))
w = w.rstrip().strip()
w = "<start> " + w + " <end>"
return w
The zhconv.convert() method converts the text to simplified Chinese.
The jieba.cut() method performs Chinese word segmentation.
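A quick, hypothetical illustration (the input sentence is made up and the exact segmentation depends on the installed jieba dictionary):
print(preprocess_zh("今天湯姆有點奇怪。"))
# e.g. <start> 今天 汤姆 有点 奇怪 。 <end>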
# Preprocess the raw corpus
def create_dataset(data_dir, num_examples):
lines = io.open(data_dir, encoding='utf-8').read().strip().split('\n')
eng, zh = [], []
for line in lines[:num_examples]:
eng_, zh_, _ = line.split('\t')
word_pairs = [eng_,zh_]
eng.append(preprocess_eng(word_pairs[0]))
zh.append(preprocess_zh(word_pairs[1]))
return [eng, zh]
Split the dataset text into its English and Chinese parts; the Chinese part is also word-segmented.
eng, zh = create_dataset(data_dir, 10)
print(eng[-1])
print(zh[-1])
<start> there s something strange about tom today . <end>
<start> 今天 汤姆 有点 奇怪 。 <end>
# Preview of the processed data (above)
# Text vectorization
def tokenize(lang):
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')
lang_tokenizer.fit_on_texts(lang)
tensor = lang_tokenizer.texts_to_sequences(lang)
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
padding="post")
return tensor, lang_tokenizer
def load_dataset(data_dir, num_examples=None):
eng, zh = create_dataset(data_dir, num_examples)
zh_tensor, zh_tokenizer = tokenize(zh)
eng_tensor, eng_tokenizer = tokenize(eng)
return zh_tensor, zh_tokenizer, eng_tensor, eng_tokenizer
num_examples = None
zh_tensor, zh_tokenizer, eng_tensor, eng_tokenizer = load_dataset(data_dir, num_examples)
# KEY: save the tokenizers so they can be reused later
tokenizer_dir = '../save/zh2eng_transformer_tokenizer/'
zh_tokenizer_dir = tokenizer_dir + 'zh_tokenizer.json'
eng_tokenizer_dir = tokenizer_dir + 'eng_tokenizer.json'
tokenizer_json = zh_tokenizer.to_json()
with io.open(zh_tokenizer_dir, 'w', encoding='utf-8') as f:
f.write(json.dumps(tokenizer_json, ensure_ascii=False))
tokenizer_json = eng_tokenizer.to_json()
with io.open(eng_tokenizer_dir, 'w', encoding='utf-8') as f:
f.write(json.dumps(tokenizer_json, ensure_ascii=False))
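The saved files can later be restored with `tokenizer_from_json` (already imported above); a sketch using the paths defined here:
# Load a tokenizer back from its saved JSON file
with io.open(eng_tokenizer_dir, encoding='utf-8') as f:
    eng_tokenizer_loaded = tokenizer_from_json(json.load(f))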
# Split into training and validation sets
zh_tensor_train, zh_tensor_val, eng_tensor_train, eng_tensor_val = train_test_split(
zh_tensor, eng_tensor, test_size=0.2)
def convert(tokenizer, tensor):
for t in tensor:
if t != 0:
print("%d ---> %s" % (t, tokenizer.index_word[t]))
print("Input Language; index to word mapping")
convert(zh_tokenizer, zh_tensor_train[0])
print()
print("Target Language; index to word mapping")
convert(eng_tokenizer, eng_tensor_train[0])
# The convert() function maps token indices back to words
# Preview
Input Language; index to word mapping
1 ---> <start>
22 ---> 这
834 ---> 条
2534 ---> 小路
1710 ---> 沿着
9002 ---> 陡峭
5 ---> 的
9003 ---> 斜坡
5401 ---> 蜿蜒
360 ---> 而
45 ---> 上
3 ---> 。
2 ---> <end>
Target Language; index to word mapping
1 ---> <start>
5 ---> the
1876 ---> path
3813 ---> zigzagged
61 ---> up
5 ---> the
3814 ---> steep
3815 ---> slope
3 ---> .
2 ---> <end>
BUFFER_SIZE = len(zh_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(zh_tensor_train) // BATCH_SIZE
embedding_size = 256
# units = 1024
zh_vocab_size = len(zh_tokenizer.word_index) + 1
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Build the tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices(
(zh_tensor_train, eng_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True).prefetch(
tf.data.experimental.AUTOTUNE)
example_zh_batch, example_eng_batch = next(iter(dataset))
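A quick sanity check on one batch; both tensors should be (BATCH_SIZE, padded sequence length), with the exact lengths depending on the corpus:
print(example_zh_batch.shape, example_eng_batch.shape)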
II. Positional encoding
Encodes the position of each token in the sequence; every position is encoded into a vector of dimension d_model.
$$PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right)$$
$$PE_{(pos, 2i+1)} = \cos\left(pos / 10000^{2i/d_{model}}\right)$$
The transformer replaces recurrent networks with a self-attention encoder, so the order information that a recurrent network carries implicitly is lost, while in natural language a different word order can change the meaning completely. The transformer's authors therefore designed a sinusoidal positional encoding that generates a distinct position vector for every token position.
def get_angles(pos, i, d_model):
"""
pos: [seq_len, 1]
i: [1, d_model]
d_model: int
return: [seq_len, d_model]
"""
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
return pos * angle_rates
def positional_encoding(position, d_model):
    # `position` positions in total, each encoded as a d_model-dimensional vector
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :], d_model)
    # apply sin to the even indices of the d_model dimension
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # apply cos to the odd indices of the d_model dimension
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    # returns: [1, position, d_model]
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)
pos_encoding = positional_encoding(30, 512)
plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()
The plot shows how the positional-encoding values vary with position (y-axis) and depth (x-axis).
III. Masking: masking tokens in a sequence that meet certain conditions
- Padding mask: masks the tokens whose value is 0
- Look-ahead mask: masks the future tokens in a sequence.
For example, in sequence generation:
when predicting the third word, only the first and second words are used and all later words are masked;
similarly, the fourth word is predicted using only the first, second and third words, and so on.
def create_padding_mask(seq):
    # mark the positions where seq equals 0 with 1, and all other positions with 0
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
# Expanded to 4 dimensions because the attention weights have shape [batch_size, num_heads, seq_len, seq_len]
# Test
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)
# Output
<tf.Tensor: shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],
[[[0., 0., 0., 1., 1.]]],
[[[1., 1., 1., 0., 0.]]]], dtype=float32)>
def create_look_ahead_mask(size):
    # tf.linalg.band_part keeps the specified band of sub/super-diagonals around the main diagonal and zero-fills the rest
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask
# Test
x = tf.random.uniform((1, 3))
print(create_look_ahead_mask(x.shape[1]))
# Output
tf.Tensor(
[[0. 1. 1.]
[0. 0. 1.]
[0. 0. 0.]], shape=(3, 3), dtype=float32)
IV. Scaled dot-product attention
In the Transformer model, scaled dot-product attention is the building block of multi-head attention. Concretely, multi-head attention applies a separate linear transformation to the input for each head, computes scaled dot-product attention for every head, then concatenates the per-head results and passes them through a final linear layer.
(The paragraph above is adapted from the CSDN blogger "chattyfish", CC 4.0 BY-SA; original: https://blog.csdn.net/chattyfish/article/details/130234804)
- The principle is illustrated in the figure below:
The attention function has three inputs: Q (query), K (key) and V (value). The equation used to compute the attention weights is:
$$Attention(Q, K, V) = softmax\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
The dot-product attention is scaled down by the square root of the depth, $\sqrt{d_k}$. This is done because for large depth values the dot products grow large in magnitude, pushing the softmax into regions with very small gradients and producing a very hard softmax.
- In machine translation, dot-product attention is used as shown below.
In the figure, the vector $q_t$ is dotted with each vector in $K = [k_1, k_2, k_3, k_4]$ and the results are turned into weights; a larger weight means that position receives more "attention". Each position's $v$ is then multiplied by its weight and the products are summed, giving the output vector of $q_t$ attending over $K$.
def scaled_dot_product_attention(q, k, v, mask):
    """
    :param q: (batch_size, seq_len_q, depth)
    :param k: (batch_size, seq_len_k, depth)
    :param v: (batch_size, seq_len_v, depth_v)
    :param mask: (batch_size, seq_len_q, seq_len_k)
    :return: output, attention_weights
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    # (..., seq_len_q, seq_len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
        # add a large negative value at the masked positions so that softmax ignores them
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    # (..., seq_len_q, seq_len_k)
    output = tf.matmul(attention_weights, v)
    # (..., seq_len_q, depth_v)
    return output, attention_weights
def print_out(q, k, v):
    # no mask is needed for this small demonstration
    temp_out, temp_attn = scaled_dot_product_attention(q, k, v, None)
    print('Attention weights are:')
    print(temp_attn)
    print('Output is:')
    print(temp_out)
np.set_printoptions(suppress=True)
temp_k = tf.constant([[10, 0, 0], [0, 10, 0], [0, 0, 10], [0, 0, 10]],
dtype=tf.float32) # (4, 3)
temp_v = tf.constant([[1, 0], [10, 0], [100, 5], [1000, 6]],
dtype=tf.float32) # (4, 2)
# This `query` matches the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32) # (1, 3)
print_out(temp_q, temp_k, temp_v)
Attention weights are:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)
V. Multi-head attention
- Multi-head attention is illustrated in the figure below:
Q, K, and V are split into multiple heads, which allows the model to jointly attend to information at different positions from different representation subspaces.
class MultiHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
assert self.d_model % self.num_heads == 0
self.depth = d_model // self.num_heads
self.wq = tf.keras.layers.Dense(self.d_model)
self.wk = tf.keras.layers.Dense(self.d_model)
self.wv = tf.keras.layers.Dense(self.d_model)
self.dense = tf.keras.layers.Dense(d_model)
def split_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=(0, 2, 1, 3))
        # (batch_size, num_heads, seq_len, depth), which makes the per-head attention computation easier
def call(self, v, k, q, mask):
batch_size = tf.shape(q)[0]
q = self.wq(q) # (batch_size, seq_len, d_model)
k = self.wk(k) # (batch_size, seq_len, d_model)
v = self.wv(v)
q = self.split_heads(
q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size)
v = self.split_heads(v, batch_size)
# scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention, attention_weights = scaled_dot_product_attention(
q, k, v, mask)
scaled_attention = tf.transpose(
scaled_attention,
perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
concat_attention = tf.reshape(scaled_attention,
(batch_size, -1, self.d_model))
output = self.dense(
concat_attention) # (batch_size, seq_len_q, d_model)
return output, attention_weights
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512)) # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape
(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))
VI. Point-wise feed-forward network
It consists of two fully-connected (Dense) layers with a ReLU activation in between.
def point_wise_feed_forward_network(d_model, dff):
return tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu'),
# (batch_size, seq_len, dff)
tf.keras.layers.Dense(d_model)
# (batch_size, seq_len, d_model)
])
sample_ffn = point_wise_feed_forward_network(512, 2048)
print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)
(64, 50, 512)
VII. Encoder and decoder
The Transformer model follows the same general pattern as a standard sequence-to-sequence with attention model. Its structure is shown below:
1. Encoder layer
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1):
super(EncoderLayer, self).__init__()
self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
def call(self, x, training, mask):
attn_output, _ = self.mha(x, x, x,mask) # (batch_size, input_seq_len, d_model)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)
ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)
return out2
sample_encoder_layer = EncoderLayer(512, 8, 2048)
sample_encoder_layer_output = sample_encoder_layer(
tf.random.uniform((64, 43, 512)), False, None)
print(sample_encoder_layer_output.shape)
(64, 43, 512)
2. Decoder layer
class DecoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, dff, rate=0.1):
super(DecoderLayer, self).__init__()
self.mha1 = MultiHeadAttention(d_model, num_heads)
self.mha2 = MultiHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
self.dropout3 = tf.keras.layers.Dropout(rate)
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
# enc_output.shape == (batch_size, input_seq_len, d_model)
attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask) # (batch_size, target_seq_len, d_model)
attn1 = self.dropout1(attn1, training=training)
out1 = self.layernorm1(attn1 + x)
attn2, attn_weights_block2 = self.mha2(
enc_output, enc_output, out1, padding_mask) # (batch_size, target_seq_len, d_model)
attn2 = self.dropout2(attn2, training=training)
out2 = self.layernorm2(attn2 + out1) # (batch_size, target_seq_len, d_model)
ffn_output = self.ffn(out2) # (batch_size, target_seq_len, d_model)
ffn_output = self.dropout3(ffn_output, training=training)
out3 = self.layernorm3(ffn_output + out2) # (batch_size, target_seq_len, d_model)
return out3, attn_weights_block1, attn_weights_block2
sample_decoder_layer = DecoderLayer(512, 8, 2048)
sample_decoder_layer_output, _, _ = sample_decoder_layer(
tf.random.uniform((64, 50, 512)), sample_encoder_layer_output, False, None,
None)
print(sample_decoder_layer_output.shape) # (batch_size, target_seq_len, d_model)
(64, 50, 512)
3. Encoder
- Input embedding
- Positional encoding
- N encoder layers
class Encoder(tf.keras.layers.Layer):
def __init__(self,
num_layers,
d_model,
num_heads,
dff,
input_vocab_size,
maximum_position_encoding,
rate=0.1):
super(Encoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
self.pos_encoding = positional_encoding(maximum_position_encoding,
self.d_model)
self.enc_layers = [
EncoderLayer(d_model, num_heads, dff, rate)
for _ in range(num_layers)
]
self.dropout = tf.keras.layers.Dropout(rate)
def call(self, x, training, mask):
seq_len = tf.shape(x)[1]
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.enc_layers[i](x, training, mask)
return x
sample_encoder = Encoder(num_layers=2,
d_model=512,
num_heads=8,
dff=2048,
input_vocab_size=8500,
maximum_position_encoding=10000)
sample_encoder_output = sample_encoder(tf.random.uniform((64, 62)),
training=False,
mask=None)
print(sample_encoder_output.shape) # (batch_size, input_seq_len, d_model)
(64, 62, 512)
4. Decoder
- Output embedding
- Positional encoding
- N decoder layers
class Decoder(tf.keras.layers.Layer):
def __init__(self,
num_layers,
d_model,
num_heads,
dff,
target_vocab_size,
maximum_position_encoding,
rate=0.1):
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
self.pos_encoding = positional_encoding(maximum_position_encoding,
d_model)
self.dec_layers = [
DecoderLayer(d_model, num_heads, dff, rate)
for _ in range(num_layers)
]
self.dropout = tf.keras.layers.Dropout(rate)
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
seq_len = tf.shape(x)[1]
attention_weights = {}
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x, block1, block2 = self.dec_layers[i](x, enc_output, training,
look_ahead_mask,
padding_mask)
attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
return x, attention_weights
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, dff=2048,
target_vocab_size=8000, maximum_position_encoding=5000)
output, attn = sample_decoder(tf.random.uniform((64, 26)),
enc_output=sample_encoder_output, training=False,
look_ahead_mask=None, padding_mask=None)
print(output.shape, attn['decoder_layer2_block2'].shape)
(64, 26, 512) (64, 8, 26, 62)
VIII. Transformer
class Transformer(tf.keras.Model):
def __init__(self,
num_layers,
d_model,
num_heads,
dff,
input_vocab_size,
target_vocab_size,
pe_input,
pe_target,
rate=0.1):
super(Transformer, self).__init__()
self.encoder = Encoder(num_layers, d_model, num_heads, dff,
input_vocab_size, pe_input, rate)
self.decoder = Decoder(num_layers, d_model, num_heads, dff,
target_vocab_size, pe_target, rate)
self.final_layer = tf.keras.layers.Dense(target_vocab_size)
def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask,
dec_padding_mask):
enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training,
                                                      look_ahead_mask,
                                                      dec_padding_mask)
        final_output = self.final_layer(dec_output)
return final_output, attention_weights
sample_transformer = Transformer(num_layers=2,
d_model=512,
num_heads=8,
dff=2048,
input_vocab_size=8500,
target_vocab_size=8000,
pe_input=10000,
pe_target=6000)
temp_input = tf.random.uniform((64, 62))
temp_target = tf.random.uniform((64, 26))
fn_out, _ = sample_transformer(temp_input,
temp_target,
training=False,
enc_padding_mask=None,
look_ahead_mask=None,
dec_padding_mask=None)
print(fn_out.shape)
(64, 26, 8000)
IX. Hyperparameter configuration
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
input_vocab_size = len(zh_tokenizer.word_index) + 1
target_vocab_size = len(eng_tokenizer.word_index) + 1
dropout_rate = 0.1
X. Optimizer
Use the Adam optimizer with a custom learning-rate scheduler:
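The scheduler below follows the learning-rate formula from the original Transformer paper ("Attention Is All You Need"):
$$lrate = d_{model}^{-0.5} \cdot \min\left(step^{-0.5},\ step \cdot warmup\_steps^{-1.5}\right)$$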
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, d_model, warmup_steps=4000):
super(CustomSchedule, self).__init__()
self.d_model = d_model
self.d_model = tf.cast(self.d_model, tf.float32)
self.warmup_steps = warmup_steps
def __call__(self, step):
step = tf.cast(step, tf.float32)
arg1 = tf.math.rsqrt(step)
arg2 = step * (self.warmup_steps**-1.5)
return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate,
beta_1=0.9,
beta_2=0.98,
epsilon=1e-9)
temp_learning_rate_schedule = CustomSchedule(d_model)
plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")
plt.show()
XI. Loss and metrics
Since the target sequences are padded, it is important to apply a padding mask when computing the loss.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
reduction='none')
def loss_function(real, pred):
mask = tf.math.logical_not(tf.math.equal(real, 0))
loss_ = loss_object(real, pred)
mask = tf.cast(mask, dtype=loss_.dtype)
loss_ *= mask
return tf.reduce_mean(loss_)
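Note that `tf.reduce_mean(loss_)` averages over all positions, padded ones included. A common alternative normalizes by the number of non-padded tokens only; a minimal sketch with a hypothetical helper name:
def loss_function_masked_mean(real, pred):
    # same masking as above, but divide by the number of real (non-pad) tokens
    mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), tf.float32)
    loss_ = loss_object(real, pred) * mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)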
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
name='train_accuracy')
XII. Training and checkpointing
transformer = Transformer(num_layers,
d_model,
num_heads,
dff,
input_vocab_size,
target_vocab_size,
pe_input=input_vocab_size,
pe_target=target_vocab_size,
rate=dropout_rate)
# The role of each mask is explained in the comments below.
def create_masks(inp, tar):
    # inp: (batch_size, seq_len_inp)
    # tar: (batch_size, seq_len_tar - 1)
    # 1. Encoder padding mask, masking the padded positions of the input sequence:
    #    (batch_size, 1, 1, seq_len_inp)
    enc_padding_mask = create_padding_mask(inp)
    # 2. Used by the decoder when attending over the encoder output, hence the seq_len_inp dimension:
    #    (batch_size, 1, 1, seq_len_inp)
    dec_padding_mask = create_padding_mask(inp)
    # 3. Mask both the trailing padding zeros of the target and its future tokens:
    #    (seq_len_tar - 1, seq_len_tar - 1)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    # (batch_size, 1, 1, seq_len_tar - 1)
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask
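A quick shape check with toy tensors (made up for illustration):
enc_m, comb_m, dec_m = create_masks(tf.constant([[7, 6, 0, 0, 1]]),
                                    tf.constant([[1, 2, 3, 0]]))
print(enc_m.shape, comb_m.shape, dec_m.shape)
# (1, 1, 1, 5) (1, 1, 4, 4) (1, 1, 1, 5)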
checkpoint_path = "../H/save/zh2eng_transformer"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
# If a checkpoint exists, restore the latest one.
if ckpt_manager.latest_checkpoint:
ckpt.restore(ckpt_manager.latest_checkpoint)
print('Latest checkpoint restored!!')
EPOCHS = 20
# The @tf.function decorator trace-compiles train_step into a TF graph for faster
# execution. The traced function is specialized to the exact shapes of its argument
# tensors. To avoid retracing caused by variable sequence lengths or a variable
# batch size (the last batch is smaller), input_signature could be used to specify
# more generic shapes.
@tf.function()
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]  # predict the next word given the preceding words
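    # Illustration with hypothetical tokens: if tar = [<start>, w1, w2, <end>],
    # then tar_inp = [<start>, w1, w2] is fed to the decoder and
    # tar_real = [w1, w2, <end>] is what it is trained to predict (teacher forcing).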
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
inp, tar_inp)
with tf.GradientTape() as tape:
predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask,
combined_mask, dec_padding_mask)
loss = loss_function(tar_real, predictions)
gradients = tape.gradient(loss, transformer.trainable_variables)
optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
train_loss(loss)
train_accuracy(tar_real, predictions)
for epoch in range(EPOCHS):
start = time.time()
train_loss.reset_states()
train_accuracy.reset_states()
# inp -> zh, tar -> english
for (batch, (inp, tar)) in enumerate(dataset):
train_step(inp, tar)
if batch % 50 == 0:
print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
epoch + 1, batch, train_loss.result(),
train_accuracy.result()))
if (epoch + 1) % 5 == 0:
ckpt_save_path = ckpt_manager.save()
print('Saving checkpoint for epoch {} at {}'.format(
epoch + 1, ckpt_save_path))
print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
epoch + 1, train_loss.result(), train_accuracy.result()))
print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
XIII. Evaluation
max_length_ch = zh_tensor.shape[-1]
max_length_eng = eng_tensor.shape[-1]
def evaluate(inp_sentence):
sentence = preprocess_zh(inp_sentence)
inputs = [zh_tokenizer.word_index[i] for i in sentence.split(' ')]
inputs = tf.keras.preprocessing.sequence.pad_sequences(
[inputs], maxlen=max_length_ch, padding='post')
encoder_input = tf.convert_to_tensor(inputs)
# encoder_input = tf.expand_dims(inp_sentence, 0)
    # Since the target language is English, the first token fed to the
    # transformer's decoder should be the English start token.
decoder_input = [eng_tokenizer.word_index['<start>']]
output = tf.expand_dims(decoder_input, 0)
for i in range(max_length_eng):
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
encoder_input, output)
# predictions.shape == (batch_size, seq_len, vocab_size)
predictions, attention_weights = transformer(encoder_input, output,
False, enc_padding_mask,
combined_mask,
dec_padding_mask)
        # select the last word from the seq_len dimension
predictions = predictions[:, -1:, :] # (batch_size, 1, vocab_size)
predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # if predicted_id equals the end token, return the result
if predicted_id == eng_tokenizer.word_index['<end>']:
return tf.squeeze(output, axis=0), attention_weights
        # concatenate predicted_id to the output and feed it back into the decoder as its input
output = tf.concat([output, predicted_id], axis=-1)
return tf.squeeze(output, axis=0), attention_weights
def plot_attention_weights(attention, sentence, result, layer):
print(result)
fig = plt.figure(figsize=(16, 8))
sentence = preprocess_zh(sentence)
sentence = [zh_tokenizer.word_index[i] for i in sentence.split(' ')]
attention = tf.squeeze(attention[layer], axis=0)
for head in range(attention.shape[0]):
ax = fig.add_subplot(2, 4, head + 1)
        # plot the attention weights
ax.matshow(attention[head][:-1, :], cmap='viridis')
fontdict = {'fontsize': 10}
ax.set_xticks(range(len(sentence) + 2))
        ax.set_yticks(range(len(result) + 1))  # account for the '<end>' label
ax.set_ylim(len(result) + 0.5, -0.5)
        ax.set_xlim(-0.5, len(sentence) + 1)  # account for the '<start>' and '<end>' labels
ax.set_xticklabels([zh_tokenizer.index_word[i] for i in sentence] + ['<start>', '<end>'],
fontdict=fontdict)
ax.set_yticklabels([eng_tokenizer.index_word[i.numpy()] for i in result] + ['<end>'],
fontdict=fontdict)
ax.set_xlabel('Head {}'.format(head + 1))
plt.tight_layout()
plt.show()
def translate(sentence, plot=''):
result, attention_weights = evaluate(sentence)
predicted_sentence = ' '.join(
[eng_tokenizer.index_word[i] for i in result.numpy()[1:]])
print('Input: {}'.format(sentence))
print('Predicted translation: {}'.format(predicted_sentence))
if plot:
plot_attention_weights(attention_weights, sentence, result, plot)
translate("我今天心情有点奇怪。", plot='decoder_layer4_block2')
Input: 我今天心情有点奇怪。
Predicted translation: i feel like a strange mood today .
tf.Tensor([ 1 4 224 37 8 565 1981 137 3], shape=(9,), dtype=int32)
Closing remarks
Source of the code in this article
Author's homepage
I found this repository while studying the transformer; it is very well written, and it also hosts a whole series of natural language processing tutorials, which I strongly recommend.
The tutorial is also quite detailed from the basics onward, which helped me notice many things I had missed in my earlier studies. Thanks to the author.