I. Overall model architecture
II. Implementation steps and code
1. Loading and preprocessing the data
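The excerpt contains no code for this step. As a minimal sketch, assuming the Portuguese-to-English TED Talks dataset from TensorFlow Datasets (an assumption, consistent only with the pt_tokenizer / en_tokenizer names used in the visualization code at the end), loading the data and building subword tokenizers could look like this:

import tensorflow_datasets as tfds

# Hypothetical dataset choice; the excerpt never names the dataset.
examples, info = tfds.load('ted_hrlr_translate/pt_to_en',
                           with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

# Build subword tokenizers from the training corpus.
# (In older tensorflow_datasets versions this class lives under tfds.features.text.)
pt_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)
en_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)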
2. Model structure -- Attention
Scaled dot-product attention
Question: why divide by sqrt(dk)? Answer: when dk is large, the dot products grow large in magnitude and push softmax into regions with extremely small gradients; dividing by sqrt(dk) keeps the logits at a reasonable scale.
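The snippets below assume the usual imports, which are not shown in the excerpt:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras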
def scaled_dot_product_attention(q, k, v, mask):
"""
Args:
- q: shape == (..., seq_len_q, depth_q)
- k: shape == (..., seq_len_k, depth_k)
- v: shape == (..., seq_len_v, depth_v)
- depth_q = depth_k
- seq_len_k = seq_len_v
- mask: shape == (..., seq_len_q, seq_len_k)
Returns:
- output: weighted sum
- attention_weights: weights of attention
"""
# matmul_qk.shape : (..., seq_len_q, seq_len_k)
matmul_qk = tf.matmul(q, k, transpose_b=True)
dk = tf.cast(tf.shape(k)[-1], dtype=tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
        # adding -1e9 drives the masked positions' logits toward -inf, so their softmax weights become ~0
scaled_attention_logits += (mask * -1e9)
# attention_weights.shape: (..., seq_len_q, seq_len_k)
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
# output.shape: (..., seq_len_q, depth_v)
output = tf.matmul(attention_weights, v)
return output, attention_weights
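A quick shape check of the function above, with illustrative sizes only:

# One query attending over four key/value positions.
temp_q = tf.random.uniform((1, 3))   # (seq_len_q, depth)
temp_k = tf.random.uniform((4, 3))   # (seq_len_k, depth)
temp_v = tf.random.uniform((4, 2))   # (seq_len_v, depth_v)
temp_out, temp_weights = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask=None)
print(temp_out.shape)       # (1, 2)
print(temp_weights.shape)   # (1, 4)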
Multi-head attention: each head's Q, K, V go through scaled dot-product attention, the per-head results are concatenated, and a final Linear (fully connected) layer mixes them.
class MultiHeadAttention(keras.layers.Layer):
"""
理论上:
x -> Wq0 -> q0
x -> Wk0 -> k0
x -> Wv0 -> v0
实际上:
q -> Wq0 -> q0
k -> Wk0 -> k0
v -> Wv0 -> v0
实战技巧:
q -> Wq -> Q -> split -> q0, q1, ..., qn
"""
def __init__(self, d_model, num_heads):
super().__init__()
self.num_heads = num_heads
self.d_model = d_model
assert self.d_model % self.num_heads == 0
self.depth = self.d_model // self.num_heads
self.WQ = keras.layers.Dense(self.d_model)
self.WK = keras.layers.Dense(self.d_model)
self.WV = keras.layers.Dense(self.d_model)
self.dense = keras.layers.Dense(self.d_model)
def split_heads(self, x, batch_size):
# x.shape: (batch_size, seq_len, d_model)
# d_model = num_heads * depth
# x -> (batch_size, num_heads, seq_len, depth)
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, q, k, v, mask):
batch_size = tf.shape(q)[0]
# q.shape: (batch_size, seq_len_q, d_model)
q = self.WQ(q)
# k.shape: (batch_size, seq_len_k, d_model)
k = self.WK(k)
# v.shape: (batch_size, seq_len_v, d_model)
v = self.WV(v)
# q.shape: (batch_size, num_heads, seq_len_q, depth)
q = self.split_heads(q, batch_size)
# k.shape: (batch_size, num_heads, seq_len_k, depth)
k = self.split_heads(k, batch_size)
# v.shape: (batch_size, num_heads, seq_len_v, depth)
v = self.split_heads(v, batch_size)
# scaled_attention_outputs.shape: (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape: (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention_outputs, attention_weights = scaled_dot_product_attention(q, k, v, mask)
# scaled_attention_outputs.shape: (batch_size, seq_len_q, num_heads, depth)
scaled_attention_outputs = tf.transpose(scaled_attention_outputs, perm=[0, 2, 1, 3])
# concat_attention.shape: (batch_size, seq_len_q, d_model)
concat_attention = tf.reshape(scaled_attention_outputs, (batch_size, -1, self.d_model))
# output.shape : (batch_size, seq_len_q, d_model)
output = self.dense(concat_attention)
return output, attention_weights
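For example, with 8 heads over d_model = 512, each head works with depth 512 / 8 = 64:

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))       # (batch_size, seq_len, d_model)
out, attn = temp_mha(y, y, y, mask=None)  # self-attention: q = k = v = y
print(out.shape)    # (1, 60, 512)
print(attn.shape)   # (1, 8, 60, 60)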
3. Feed-forward network
def feed_forward_network(d_model, dff):
# dff: dim of feed forward network
return keras.Sequential([
keras.layers.Dense(dff, activation='relu'),
keras.layers.Dense(d_model)
])
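The position-wise feed-forward network keeps the (batch_size, seq_len, d_model) shape, expanding to dff units in between:

sample_ffn = feed_forward_network(d_model=512, dff=2048)
print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)  # (64, 50, 512)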
4. Positional encoding
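No code is given for this step in the excerpt, but EncoderModel and DecoderModel below call get_position_embedding(max_length, d_model). A minimal sketch of the standard sinusoidal positional encoding from the Transformer paper, returning shape (1, max_length, d_model), could be:

def get_angles(pos, i, d_model):
    # Wavelengths follow 10000^(2i/d_model); pos and i are broadcastable index arrays.
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def get_position_embedding(max_length, d_model):
    angle_rads = get_angles(np.arange(max_length)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # sin on even dimensions, cos on odd dimensions
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    # add a leading batch axis so it broadcasts: (1, max_length, d_model)
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)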
5. EncoderLayer module
class EncoderLayer(keras.layers.Layer):
"""
    x -> self attention -> dropout -> residual add & layer norm -> feed-forward -> dropout -> residual add & layer norm
"""
def __init__(self, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = feed_forward_network(d_model, dff)
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
def call(self, x, training, encoder_padding_mask):
# x.shape : (batch_size, seq_len, dim = d_model)
        # attn_output.shape : (batch_size, seq_len, d_model)
attn_output, _ = self.mha(x, x, x, encoder_padding_mask)
attn_output = self.dropout1(attn_output, training=training)
# out1.shape : (batch_size, seq_len, d_model)
out1 = self.layer_norm1(x + attn_output)
# ffn_output.shape : (batch_size, seq_len, d_model)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
# out2.shape : (batch_size, seq_len, d_model)
out2 = self.layer_norm2(out1 + ffn_output)
return out2
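A quick forward pass through a single encoder layer (illustrative sizes):

sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8, dff=2048)
sample_encoder_out = sample_encoder_layer(tf.random.uniform((64, 50, 512)), False, None)
print(sample_encoder_out.shape)  # (64, 50, 512)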
6. DecoderLayer
1) Training is parallelized: the whole target sequence is fed in at once;
2) Inference still has to proceed sequentially, one token at a time;
3) In self-attention a position must not see later positions, which is enforced with the look-ahead mask (see the mask helpers in section 10 below).
class DecoderLayer(keras.layers.Layer):
"""
    x -> masked self attention -> dropout -> residual add & layer norm -> out1
    out1 (queries) + encoding_outputs (keys/values) -> attention -> dropout -> residual add & layer norm -> out2
    out2 -> ffn -> dropout -> residual add & layer norm -> out3
"""
def __init__(self, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.mha1 = MultiHeadAttention(d_model, num_heads)
self.mha2 = MultiHeadAttention(d_model, num_heads)
self.ffn = feed_forward_network(d_model, dff)
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm3 = keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
self.dropout3 = keras.layers.Dropout(rate)
def call(self, x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask):
# x.shape : (batch_size, target_seq_len, d_model)
# attn1, out1.shape : (batch_size, target_seq_len, d_model)
attn1, attn_weights1 = self.mha1(x, x, x, decoder_mask)
attn1 = self.dropout1(attn1, training=training)
out1 = self.layer_norm1(attn1 + x)
# attn2, output.shape : (batch_size, target_seq_len, d_model)
# encoding_outputs.shape: (batch_size, input_seq_len, d_model)
attn2, attn_weights2 = self.mha2(out1, encoding_outputs, encoding_outputs, encoder_decoder_padding_mask)
attn2 = self.dropout2(attn2, training=training)
out2 = self.layer_norm2(attn2 + out1)
# ffn_output, out3.shape : (batch_size, target_seq_len, d_model)
ffn_output = self.ffn(out2)
ffn_output = self.dropout3(ffn_output, training=training)
out3 = self.layer_norm3(ffn_output + out2)
return out3, attn_weights1, attn_weights2
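And a single decoder layer, fed a random target-side input plus a random stand-in for the encoder output:

sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)
sample_decoder_out, attn1, attn2 = sample_decoder_layer(
    tf.random.uniform((64, 60, 512)),   # target-side input
    tf.random.uniform((64, 50, 512)),   # stand-in for the encoder output
    False, None, None)
print(sample_decoder_out.shape)  # (64, 60, 512)
print(attn2.shape)               # (64, 8, 60, 50)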
7. EncoderModel
class EncoderModel(keras.layers.Layer):
def __init__(self, num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.d_model = d_model
self.num_layers = num_layers
self.max_length = max_length
self.embedding = keras.layers.Embedding(input_vocab_size, self.d_model)
self.position_embedding = get_position_embedding(max_length, self.d_model)
self.dropout = keras.layers.Dropout(rate)
self.encoder_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(self.num_layers)]
def call(self, x, training, encoder_padding_mask):
# x.shape: (batch_size, input_seq_len)
input_seq_len = tf.shape(x)[1]
tf.debugging.assert_less_equal(input_seq_len, self.max_length,
'input_seq_len should be less or equal to self.max_length')
# x.shape: (batch_size, input_seq_len, d_model)
x = self.embedding(x)
        # scale the embeddings up by sqrt(d_model) so they are not drowned out by the positional encoding
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.position_embedding[:, :input_seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.encoder_layers[i](x, training, encoder_padding_mask)
# x.shape: (batch_size, input_seq_len, d_model)
return x
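Example usage with integer token ids (sizes are illustrative; this relies on get_position_embedding from the sketch in section 4):

sample_encoder_model = EncoderModel(num_layers=2, input_vocab_size=8500, max_length=40,
                                    d_model=512, num_heads=8, dff=2048)
sample_inp = tf.random.uniform((64, 37), minval=0, maxval=8500, dtype=tf.int32)
print(sample_encoder_model(sample_inp, False, None).shape)  # (64, 37, 512)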
8. DecoderModel
class DecoderModel(keras.layers.Layer):
def __init__(self, num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.num_layers = num_layers
self.max_length = max_length
self.d_model = d_model
self.embedding = keras.layers.Embedding(target_vocab_size, d_model)
self.position_embedding = get_position_embedding(max_length, d_model)
self.dropout = keras.layers.Dropout(rate)
self.decoder_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(self.num_layers)]
def call(self, x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask):
# x.shape: (batch_size, output_seq_len)
output_seq_len = tf.shape(x)[1]
tf.debugging.assert_less_equal(output_seq_len, self.max_length,
'output_seq_len should be less or equal to self.max_length')
attention_weights = {}
# x.shape: (batch_size, output_seq_len, d_model)
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.position_embedding[:, :output_seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x, att1, att2 = self.decoder_layers[i](x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask)
attention_weights['decoder_layer{}_att1'.format(i + 1)] = att1
attention_weights['decoder_layer{}_att2'.format(i + 1)] = att2
# x.shape : (batch_size, output_seq_len, d_model)
return x, attention_weights
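Similarly for the decoder stack; the att2 weights have one key position per encoder position:

sample_decoder_model = DecoderModel(num_layers=2, target_vocab_size=8000, max_length=40,
                                    d_model=512, num_heads=8, dff=2048)
sample_tar = tf.random.uniform((64, 35), minval=0, maxval=8000, dtype=tf.int32)
dec_out, attn = sample_decoder_model(sample_tar, tf.random.uniform((64, 37, 512)),
                                     False, None, None)
print(dec_out.shape)                      # (64, 35, 512)
print(attn['decoder_layer1_att2'].shape)  # (64, 8, 35, 37)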
9. Transformer
class Transformer(keras.Model):
def __init__(self, num_layers, input_vocab_size, target_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.encoder_model = EncoderModel(num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate)
self.decoder_model = DecoderModel(num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate)
self.final_layer = keras.layers.Dense(target_vocab_size)
def call(self, inp, tar, training, encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask):
# encoding_outputs.shape: (batch_size, input_seq_len, d_model)
encoding_outputs = self.encoder_model(inp, training, encoder_padding_mask)
# decoding_outputs.shape: (batch_size, output_seq_len, d_model)
decoding_outputs, attention_weights = self.decoder_model(tar, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask)
# predictions.shape: (batch_size, output_seq_len, target_vocab_size)
predictions = self.final_layer(decoding_outputs)
return predictions, attention_weights
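An end-to-end forward pass with dummy token ids and no masks (hyperparameters are illustrative; the training setup is not part of this excerpt):

sample_transformer = Transformer(num_layers=2, input_vocab_size=8500, target_vocab_size=8000,
                                 max_length=40, d_model=512, num_heads=8, dff=2048)
inp = tf.random.uniform((64, 37), minval=0, maxval=8500, dtype=tf.int32)
tar = tf.random.uniform((64, 35), minval=0, maxval=8000, dtype=tf.int32)
predictions, attention_weights = sample_transformer(
    inp, tar, training=False,
    encoder_padding_mask=None, decoder_mask=None, encoder_decoder_padding_mask=None)
print(predictions.shape)  # (64, 35, 8000)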
10. Creating the masks
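create_masks below relies on two helpers that are not included in the excerpt. Minimal sketches consistent with how the masks are consumed (1 marks a position to be suppressed, and the padding token id is assumed to be 0):

def create_padding_mask(batch_data):
    # 1.0 wherever the token id is 0 (padding); shaped (batch_size, 1, 1, seq_len)
    # so it broadcasts over heads and query positions.
    padding_mask = tf.cast(tf.math.equal(batch_data, 0), tf.float32)
    return padding_mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # Strictly upper-triangular 1s: position i may not attend to positions j > i.
    # Shape (seq_len, seq_len).
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)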
def create_masks(inp, tar):
"""
Encoder:
- encoder_padding_mask (self attention of EncoderLayer)
Decoder:
- look_ahead_mask (self attention of DecoderLayer)
- decoder_padding_mask (self attention of DecoderLayer)
- encoder_decoder_padding_mask (encoder-decoder attention of DecoderLayer)
"""
encoder_padding_mask = create_padding_mask(inp)
encoder_decoder_padding_mask = create_padding_mask(inp)
look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
decoder_padding_mask = create_padding_mask(tar)
    decoder_mask = tf.maximum(decoder_padding_mask, look_ahead_mask)
    return encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask
11. Attention visualization
def plot_encoder_decoder_attention(attention, input_sentence, result, layer_name):
fig = plt.figure(figsize=(16, 8))
input_id_sentence = pt_tokenizer.encode(input_sentence)
    # after removing the batch axis: attention.shape == (num_heads, tar_seq_len, input_seq_len)
attention = tf.squeeze(attention[layer_name], axis=0)
for head in range(attention.shape[0]):
ax = fig.add_subplot(2, 4, head + 1)
ax.matshow(attention[head][:-1, :])
fontdict = {'fontsize': 10}
ax.set_xticks(range(len(input_id_sentence) + 2))
ax.set_yticks(range(len(result) - 1))
print('yticks', len(result))
print(result)
ax.set_ylim(len(result) - 1.5, -0.5)
ax.set_xticklabels(['<start>'] + [pt_tokenizer.decode([i]) for i in input_id_sentence] + ['<end>'],
fontdict=fontdict, rotation=90)
print([en_tokenizer.decode([i]) for i in result if i < en_tokenizer.vocab_size])
ax.set_yticklabels([en_tokenizer.decode([i]) for i in result if i < en_tokenizer.vocab_size],
fontdict=fontdict)
ax.set_xlabel('Head {}'.format(head + 1))
plt.tight_layout()
plt.show()