I. Overall model architecture
II. Implementation steps and code
1. Loading and preprocessing the data
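The excerpt contains no code for this step. As a minimal sketch, assuming the Portuguese-to-English TED Talks dataset from TensorFlow Datasets (an assumption, consistent only with the pt_tokenizer / en_tokenizer names used in the visualization code at the end), loading the data and building subword tokenizers could look like this:

import tensorflow_datasets as tfds

# Hypothetical dataset choice; the excerpt never names the dataset.
examples, info = tfds.load('ted_hrlr_translate/pt_to_en',
                           with_info=True, as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

# Build subword tokenizers from the training corpus.
# (In older tensorflow_datasets versions this class lives under tfds.features.text.)
pt_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)
en_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2 ** 13)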
2. Model structure -- Attention
Scaled dot-product attention
Question: why divide by sqrt(dk)? Answer: when dk is large, the dot products grow large in magnitude and push softmax into regions with extremely small gradients; dividing by sqrt(dk) keeps the logits at a reasonable scale.
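The snippets below assume the usual imports, which are not shown in the excerpt:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras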
def scaled_dot_product_attention(q, k, v, mask):
"""
Args:
- q: shape == (..., seq_len_q, depth_q)
- k: shape == (..., seq_len_k, depth_k)
- v: shape == (..., seq_len_v, depth_v)
- depth_q = depth_k
- seq_len_k = seq_len_v
- mask: shape == (..., seq_len_q, seq_len_k)
Returns:
- output: weighted sum
- attention_weights: weights of attention
"""
# matmul_qk.shape : (..., seq_len_q, seq_len_k)
matmul_qk = tf.matmul(q, k, transpose_b=True)
dk = tf.cast(tf.shape(k)[-1], dtype=tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
        # adding -1e9 drives the masked positions' logits toward -inf, so their softmax weights become ~0
scaled_attention_logits += (mask * -1e9)
# attention_weights.shape: (..., seq_len_q, seq_len_k)
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
# output.shape: (..., seq_len_q, depth_v)
output = tf.matmul(attention_weights, v)
return output, attention_weights
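A quick shape check of the function above, with illustrative sizes only:

# One query attending over four key/value positions.
temp_q = tf.random.uniform((1, 3))   # (seq_len_q, depth)
temp_k = tf.random.uniform((4, 3))   # (seq_len_k, depth)
temp_v = tf.random.uniform((4, 2))   # (seq_len_v, depth_v)
temp_out, temp_weights = scaled_dot_product_attention(temp_q, temp_k, temp_v, mask=None)
print(temp_out.shape)       # (1, 2)
print(temp_weights.shape)   # (1, 4)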
Multi-head attention: each head's Q, K, V go through scaled dot-product attention, the per-head results are concatenated, and a final Linear (fully connected) layer mixes them.
class MultiHeadAttention(keras.layers.Layer):
"""
理论上:
x -> Wq0 -> q0
x -> Wk0 -> k0
x -> Wv0 -> v0
实际上:
q -> Wq0 -> q0
k -> Wk0 -> k0
v -> Wv0 -> v0
实战技巧:
q -> Wq -> Q -> split -> q0, q1, ..., qn
"""
def __init__(self, d_model, num_heads):
super().__init__()
self.num_heads = num_heads
self.d_model = d_model
assert self.d_model % self.num_heads == 0
self.depth = self.d_model // self.num_heads
self.WQ = keras.layers.Dense(self.d_model)
self.WK = keras.layers.Dense(self.d_model)
self.WV = keras.layers.Dense(self.d_model)
self.dense = keras.layers.Dense(self.d_model)
def split_heads(self, x, batch_size):
# x.shape: (batch_size, seq_len, d_model)
# d_model = num_heads * depth
# x -> (batch_size, num_heads, seq_len, depth)
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, q, k, v, mask):
batch_size = tf.shape(q)[0]
# q.shape: (batch_size, seq_len_q, d_model)
q = self.WQ(q)
# k.shape: (batch_size, seq_len_k, d_model)
k = self.WK(k)
# v.shape: (batch_size, seq_len_v, d_model)
v = self.WV(v)
# q.shape: (batch_size, num_heads, seq_len_q, depth)
q = self.split_heads(q, batch_size)
# k.shape: (batch_size, num_heads, seq_len_k, depth)
k = self.split_heads(k, batch_size)
# v.shape: (batch_size, num_heads, seq_len_v, depth)
v = self.split_heads(v, batch_size)
# scaled_attention_outputs.shape: (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape: (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention_outputs, attention_weights = scaled_dot_product_attention(q, k, v, mask)
# scaled_attention_outputs.shape: (batch_size, seq_len_q, num_heads, depth)
scaled_attention_outputs = tf.transpose(scaled_attention_outputs, perm=[0, 2, 1, 3])
# concat_attention.shape: (batch_size, seq_len_q, d_model)
concat_attention = tf.reshape(scaled_attention_outputs, (batch_size, -1, self.d_model))
# output.shape : (batch_size, seq_len_q, d_model)
output = self.dense(concat_attention)
return output, attention_weights
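For example, with 8 heads over d_model = 512, each head works with depth 512 / 8 = 64:

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))       # (batch_size, seq_len, d_model)
out, attn = temp_mha(y, y, y, mask=None)  # self-attention: q = k = v = y
print(out.shape)    # (1, 60, 512)
print(attn.shape)   # (1, 8, 60, 60)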
3. Feed-forward network
def feed_forward_network(d_model, dff):
# dff: dim of feed forward network
return keras.Sequential([
keras.layers.Dense(dff, activation='relu'),
keras.layers.Dense(d_model)
])
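The position-wise feed-forward network keeps the (batch_size, seq_len, d_model) shape, expanding to dff units in between:

sample_ffn = feed_forward_network(d_model=512, dff=2048)
print(sample_ffn(tf.random.uniform((64, 50, 512))).shape)  # (64, 50, 512)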
4. Positional encoding
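No code is given for this step in the excerpt, but EncoderModel and DecoderModel below call get_position_embedding(max_length, d_model). A minimal sketch of the standard sinusoidal positional encoding from the Transformer paper, returning shape (1, max_length, d_model), could be:

def get_angles(pos, i, d_model):
    # Wavelengths follow 10000^(2i/d_model); pos and i are broadcastable index arrays.
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def get_position_embedding(max_length, d_model):
    angle_rads = get_angles(np.arange(max_length)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # sin on even dimensions, cos on odd dimensions
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    # add a leading batch axis so it broadcasts: (1, max_length, d_model)
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)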
5. EncoderLayer module
class EncoderLayer(keras.layers.Layer):
"""
    x -> self attention -> dropout -> residual add & layer norm -> feed-forward -> dropout -> residual add & layer norm
"""
def __init__(self, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = feed_forward_network(d_model, dff)
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
def call(self, x, training, encoder_padding_mask):
# x.shape : (batch_size, seq_len, dim = d_model)
        # attn_output.shape : (batch_size, seq_len, d_model)
attn_output, _ = self.mha(x, x, x, encoder_padding_mask)
attn_output = self.dropout1(attn_output, training=training)
# out1.shape : (batch_size, seq_len, d_model)
out1 = self.layer_norm1(x + attn_output)
# ffn_output.shape : (batch_size, seq_len, d_model)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
# out2.shape : (batch_size, seq_len, d_model)
out2 = self.layer_norm2(out1 + ffn_output)
return out2
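A quick forward pass through a single encoder layer (illustrative sizes):

sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8, dff=2048)
sample_encoder_out = sample_encoder_layer(tf.random.uniform((64, 50, 512)), False, None)
print(sample_encoder_out.shape)  # (64, 50, 512)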
6. DecoderLayer
1) Training is parallelized: the whole target sequence is fed in at once;
2) Inference still has to proceed sequentially, one token at a time;
3) In self-attention a position must not see later positions, which is enforced with the look-ahead mask (see the mask helpers in section 10 below).
class DecoderLayer(keras.layers.Layer):
"""
    x -> masked self attention -> dropout -> residual add & layer norm -> out1
    out1 (queries) + encoding_outputs (keys/values) -> attention -> dropout -> residual add & layer norm -> out2
    out2 -> ffn -> dropout -> residual add & layer norm -> out3
"""
def __init__(self, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.mha1 = MultiHeadAttention(d_model, num_heads)
self.mha2 = MultiHeadAttention(d_model, num_heads)
self.ffn = feed_forward_network(d_model, dff)
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
self.layer_norm3 = keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = keras.layers.Dropout(rate)
self.dropout2 = keras.layers.Dropout(rate)
self.dropout3 = keras.layers.Dropout(rate)
def call(self, x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask):
# x.shape : (batch_size, target_seq_len, d_model)
# attn1, out1.shape : (batch_size, target_seq_len, d_model)
attn1, attn_weights1 = self.mha1(x, x, x, decoder_mask)
attn1 = self.dropout1(attn1, training=training)
out1 = self.layer_norm1(attn1 + x)
# attn2, output.shape : (batch_size, target_seq_len, d_model)
# encoding_outputs.shape: (batch_size, input_seq_len, d_model)
attn2, attn_weights2 = self.mha2(out1, encoding_outputs, encoding_outputs, encoder_decoder_padding_mask)
attn2 = self.dropout2(attn2, training=training)
out2 = self.layer_norm2(attn2 + out1)
# ffn_output, out3.shape : (batch_size, target_seq_len, d_model)
ffn_output = self.ffn(out2)
ffn_output = self.dropout3(ffn_output, training=training)
out3 = self.layer_norm3(ffn_output + out2)
return out3, attn_weights1, attn_weights2
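And a single decoder layer, fed a random target-side input plus a random stand-in for the encoder output:

sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)
sample_decoder_out, attn1, attn2 = sample_decoder_layer(
    tf.random.uniform((64, 60, 512)),   # target-side input
    tf.random.uniform((64, 50, 512)),   # stand-in for the encoder output
    False, None, None)
print(sample_decoder_out.shape)  # (64, 60, 512)
print(attn2.shape)               # (64, 8, 60, 50)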
7. EncoderModel
class EncoderModel(keras.layers.Layer):
def __init__(self, num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.d_model = d_model
self.num_layers = num_layers
self.max_length = max_length
self.embedding = keras.layers.Embedding(input_vocab_size, self.d_model)
self.position_embedding = get_position_embedding(max_length, self.d_model)
self.dropout = keras.layers.Dropout(rate)
self.encoder_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(self.num_layers)]
def call(self, x, training, encoder_padding_mask):
# x.shape: (batch_size, input_seq_len)
input_seq_len = tf.shape(x)[1]
tf.debugging.assert_less_equal(input_seq_len, self.max_length,
'input_seq_len should be less or equal to self.max_length')
# x.shape: (batch_size, input_seq_len, d_model)
x = self.embedding(x)
        # scale the embeddings up by sqrt(d_model) so they are not drowned out by the positional encoding
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.position_embedding[:, :input_seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.encoder_layers[i](x, training, encoder_padding_mask)
# x.shape: (batch_size, input_seq_len, d_model)
return x
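Example usage with integer token ids (sizes are illustrative; this relies on get_position_embedding from the sketch in section 4):

sample_encoder_model = EncoderModel(num_layers=2, input_vocab_size=8500, max_length=40,
                                    d_model=512, num_heads=8, dff=2048)
sample_inp = tf.random.uniform((64, 37), minval=0, maxval=8500, dtype=tf.int32)
print(sample_encoder_model(sample_inp, False, None).shape)  # (64, 37, 512)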
8. DecoderModel
class DecoderModel(keras.layers.Layer):
def __init__(self, num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.num_layers = num_layers
self.max_length = max_length
self.d_model = d_model
self.embedding = keras.layers.Embedding(target_vocab_size, d_model)
self.position_embedding = get_position_embedding(max_length, d_model)
self.dropout = keras.layers.Dropout(rate)
self.decoder_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(self.num_layers)]
def call(self, x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask):
# x.shape: (batch_size, output_seq_len)
output_seq_len = tf.shape(x)[1]
tf.debugging.assert_less_equal(output_seq_len, self.max_length,
'output_seq_len should be less or equal to self.max_length')
attention_weights = {}
# x.shape: (batch_size, output_seq_len, d_model)
x = self.embedding(x)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.position_embedding[:, :output_seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x, att1, att2 = self.decoder_layers[i](x, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask)
attention_weights['decoder_layer{}_att1'.format(i + 1)] = att1
attention_weights['decoder_layer{}_att2'.format(i + 1)] = att2
# x.shape : (batch_size, output_seq_len, d_model)
return x, attention_weights
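Similarly for the decoder stack; the att2 weights have one key position per encoder position:

sample_decoder_model = DecoderModel(num_layers=2, target_vocab_size=8000, max_length=40,
                                    d_model=512, num_heads=8, dff=2048)
sample_tar = tf.random.uniform((64, 35), minval=0, maxval=8000, dtype=tf.int32)
dec_out, attn = sample_decoder_model(sample_tar, tf.random.uniform((64, 37, 512)),
                                     False, None, None)
print(dec_out.shape)                      # (64, 35, 512)
print(attn['decoder_layer1_att2'].shape)  # (64, 8, 35, 37)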
9. Transformer
class Transformer(keras.Model):
def __init__(self, num_layers, input_vocab_size, target_vocab_size, max_length, d_model, num_heads, dff, rate=0.1):
super().__init__()
self.encoder_model = EncoderModel(num_layers, input_vocab_size, max_length, d_model, num_heads, dff, rate)
self.decoder_model = DecoderModel(num_layers, target_vocab_size, max_length, d_model, num_heads, dff, rate)
self.final_layer = keras.layers.Dense(target_vocab_size)
def call(self, inp, tar, training, encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask):
# encoding_outputs.shape: (batch_size, input_seq_len, d_model)
encoding_outputs = self.encoder_model(inp, training, encoder_padding_mask)
# decoding_outputs.shape: (batch_size, output_seq_len, d_model)
decoding_outputs, attention_weights = self.decoder_model(tar, encoding_outputs, training, decoder_mask, encoder_decoder_padding_mask)
# predictions.shape: (batch_size, output_seq_len, target_vocab_size)
predictions = self.final_layer(decoding_outputs)
return predictions, attention_weights
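An end-to-end forward pass with dummy token ids and no masks (hyperparameters are illustrative; the training setup is not part of this excerpt):

sample_transformer = Transformer(num_layers=2, input_vocab_size=8500, target_vocab_size=8000,
                                 max_length=40, d_model=512, num_heads=8, dff=2048)
inp = tf.random.uniform((64, 37), minval=0, maxval=8500, dtype=tf.int32)
tar = tf.random.uniform((64, 35), minval=0, maxval=8000, dtype=tf.int32)
predictions, attention_weights = sample_transformer(
    inp, tar, training=False,
    encoder_padding_mask=None, decoder_mask=None, encoder_decoder_padding_mask=None)
print(predictions.shape)  # (64, 35, 8000)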
10. Creating the masks
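create_masks below relies on two helpers that are not included in the excerpt. Minimal sketches consistent with how the masks are consumed (1 marks a position to be suppressed, and the padding token id is assumed to be 0):

def create_padding_mask(batch_data):
    # 1.0 wherever the token id is 0 (padding); shaped (batch_size, 1, 1, seq_len)
    # so it broadcasts over heads and query positions.
    padding_mask = tf.cast(tf.math.equal(batch_data, 0), tf.float32)
    return padding_mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # Strictly upper-triangular 1s: position i may not attend to positions j > i.
    # Shape (seq_len, seq_len).
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)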
def create_masks(inp, tar):
"""
Encoder:
- encoder_padding_mask (self attention of EncoderLayer)
Decoder:
- look_ahead_mask (self attention of DecoderLayer)
- decoder_padding_mask (self attention of DecoderLayer)
- encoder_decoder_padding_mask (encoder-decoder attention of DecoderLayer)
"""
encoder_padding_mask = create_padding_mask(inp)
encoder_decoder_padding_mask = create_padding_mask(inp)
look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
decoder_padding_mask = create_padding_mask(tar)
    decoder_mask = tf.maximum(decoder_padding_mask, look_ahead_mask)
    return encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask
11. Attention visualization
def plot_encoder_decoder_attention(attention, input_sentence, result, layer_name):
fig = plt.figure(figsize=(16, 8))
input_id_sentence = pt_tokenizer.encode(input_sentence)
    # after removing the batch axis: attention.shape == (num_heads, tar_seq_len, input_seq_len)
attention = tf.squeeze(attention[layer_name], axis=0)
for head in range(attention.shape[0]):
ax = fig.add_subplot(2, 4, head + 1)
ax.matshow(attention[head][:-1, :])
fontdict = {'fontsize': 10}
ax.set_xticks(range(len(input_id_sentence) + 2))
ax.set_yticks(range(len(result) - 1))
print('yticks', len(result))
print(result)
ax.set_ylim(len(result) - 1.5, -0.5)
ax.set_xticklabels(['<start>'] + [pt_tokenizer.decode([i]) for i in input_id_sentence] + ['<end>'],
fontdict=fontdict, rotation=90)
print([en_tokenizer.decode([i]) for i in result if i < en_tokenizer.vocab_size])
ax.set_yticklabels([en_tokenizer.decode([i]) for i in result if i < en_tokenizer.vocab_size],
fontdict=fontdict)
ax.set_xlabel('Head {}'.format(head + 1))
plt.tight_layout()
plt.show()