Transformer self-attention source code

  • The code consists of three functions:

    • create_attention_mask_from_input_mask (builds the attention_mask)
    • attention_layer (multi-head attention)
    • transformer_model (the full encoder stack, which calls attention_layer)
  • Constructing the attention_mask:
    This part of the code builds the attention_mask that defines what each position is allowed to attend to. Since every sample goes through padding, the padded positions must not be attended to during self-attention.
    The inputs are the padded input_ids of shape [batch_size, from_seq_length, …] and a mask vector of shape [batch_size, to_seq_length] marking the real tokens. A short usage sketch follows the code below.

    def create_attention_mask_from_input_mask(from_tensor, to_mask):
      from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
      batch_size = from_shape[0]
      from_seq_length = from_shape[1]
    
      to_shape = get_shape_list(to_mask, expected_rank=2)
      to_seq_length = to_shape[1]
    
      to_mask = tf.cast(
          tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
    
      broadcast_ones = tf.ones(
          shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
    
      mask = broadcast_ones * to_mask  # * is element-wise multiplication with broadcasting, giving [batch_size, from_seq_length, to_seq_length]
    
      return mask
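
    To see what this produces, here is a minimal usage sketch in the same TensorFlow 1.x style. It assumes the helpers from the same source file (get_shape_list) are in scope; the toy tensors are illustrative only.

    import tensorflow as tf

    # A batch of 2 sequences padded to length 4; the second sequence has two padded positions.
    input_ids = tf.constant([[5, 7, 9, 2], [6, 3, 0, 0]])      # [batch_size=2, seq_length=4]
    input_mask = tf.constant([[1, 1, 1, 1], [1, 1, 0, 0]])     # 1 = real token, 0 = padding

    mask = create_attention_mask_from_input_mask(input_ids, input_mask)
    # `mask` has shape [2, 4, 4]; for the second example every row is [1, 1, 0, 0],
    # i.e. no position is allowed to attend to the two padded positions.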
    
  • Attention layer (attention_layer)
    This part of the code implements multi-head attention, largely following the paper "Attention is all you need". It is a key-query-value style attention: the input from_tensor serves as the query, and to_tensor serves as the key and value; when the two are the same tensor, this is self-attention. For a more detailed introduction to attention, see the article 理解Attention机制原理及模型. A toy numerical sketch of the core computation follows the code below.

    def attention_layer(from_tensor,      # [batch_size, from_seq_length, from_width]
                        to_tensor,        # [batch_size, to_seq_length, to_width]
                        attention_mask=None,        # [batch_size, from_seq_length, to_seq_length]
                        num_attention_heads=1,      # number of attention heads
                        size_per_head=512,          # size of each head
                        query_act=None,             # activation for the query transform
                        key_act=None,               # activation for the key transform
                        value_act=None,             # activation for the value transform
                        attention_probs_dropout_prob=0.0,   # dropout on the attention probabilities
                        initializer_range=0.02,             # initializer range (stddev)
                        do_return_2d_tensor=False,          # whether to return a 2D tensor:
    # if True, the output has shape [batch_size*from_seq_length, num_attention_heads*size_per_head]
    # if False, the output has shape [batch_size, from_seq_length, num_attention_heads*size_per_head]
                        batch_size=None,            # if the inputs are 3D, batch_size is the first
    # dimension, but the 3D inputs may have been flattened to 2D, so batch_size must be passed in
                        from_seq_length=None,       # same as above
                        to_seq_length=None):        # same as above
    
      def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                               seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
    
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])    #[batch_size,  num_attention_heads, seq_length, width]
        return output_tensor
    
      from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
      to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    
      if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")
    
      if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
      elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
          raise ValueError(
              "When passing in rank 2 tensors to attention_layer, the values "
              "for `batch_size`, `from_seq_length`, and `to_seq_length` "
              "must all be specified.")
    
      # To keep the shape comments short, we use the following abbreviations:
      #   B = batch size (number of sequences)
      #   F = `from_tensor` sequence length
      #   T = `to_tensor` sequence length
      #   N = `num_attention_heads`
      #   H = `size_per_head`
    
      # Flatten from_tensor and to_tensor into 2D tensors
      from_tensor_2d = reshape_to_matrix(from_tensor)        # [B*F, hidden_size]
      to_tensor_2d = reshape_to_matrix(to_tensor)            # [B*T, hidden_size]
    
      # Feed from_tensor through a dense (fully connected) layer to get query_layer
      # `query_layer` = [B*F, N*H]
      query_layer = tf.layers.dense(
          from_tensor_2d,
          num_attention_heads * size_per_head,
          activation=query_act,
          name="query",
          kernel_initializer=create_initializer(initializer_range))
    
      # Feed to_tensor through a dense layer to get key_layer
      # `key_layer` = [B*T, N*H]
      key_layer = tf.layers.dense(
          to_tensor_2d,
          num_attention_heads * size_per_head,
          activation=key_act,
          name="key",
          kernel_initializer=create_initializer(initializer_range))
    
      # Same again: feed to_tensor through a dense layer to get value_layer
      # `value_layer` = [B*T, N*H]
      value_layer = tf.layers.dense(
          to_tensor_2d,
          num_attention_heads * size_per_head,
          activation=value_act,
          name="value",
          kernel_initializer=create_initializer(initializer_range))
    
      # Reshape query_layer into multi-head form: [B*F, N*H] ==> [B, F, N, H] ==> [B, N, F, H]
      query_layer = transpose_for_scores(query_layer, batch_size,
                                         num_attention_heads, from_seq_length,
                                         size_per_head)
    
      # Reshape key_layer into multi-head form: [B*T, N*H] ==> [B, T, N, H] ==> [B, N, T, H]
      key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                       to_seq_length, size_per_head)
    
      # Take the dot product of query and key, then scale it; see the original paper for the formula
      # `attention_scores` = [B, N, F, T]
      attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
      attention_scores = tf.multiply(attention_scores,
                                     1.0 / math.sqrt(float(size_per_head)))
    
      if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
    
        # If an element of attention_mask is 1, the expression below gives (1 - 1) * -10000, so adder is 0
        # If an element of attention_mask is 0, it gives (1 - 0) * -10000, so adder is -10000
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    
        # The attention scores themselves are never very large, so after this addition
        # the scores at positions where the mask is 0 are effectively negative infinity
        attention_scores += adder
    
      # Negative infinity becomes 0 after softmax, so positions where the mask is 0 effectively receive no attention
      # `attention_probs` = [B, N, F, T]
      attention_probs = tf.nn.softmax(attention_scores)
    
      # Apply dropout to attention_probs; this may look odd, but it is what the original Transformer paper does
      attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
    
      # `value_layer` = [B, T, N, H]
      value_layer = tf.reshape(
          value_layer,
          [batch_size, to_seq_length, num_attention_heads, size_per_head])
    
      # `value_layer` = [B, N, T, H]
      value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
    
      # `context_layer` = [B, N, F, H]
      context_layer = tf.matmul(attention_probs, value_layer)
    
      # `context_layer` = [B, F, N, H]
      context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    
      if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
      else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])
    
      return context_layer
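
    The core of attention_layer is the scaled dot-product attention from the paper, softmax(QK^T / sqrt(H)) V, plus the additive mask trick. The NumPy sketch below mirrors those steps for a single head and a single example; the array names are illustrative and not part of the source.

    import numpy as np

    F, T, H = 3, 4, 8                       # from_seq_length, to_seq_length, size_per_head
    rng = np.random.default_rng(0)
    q = rng.standard_normal((F, H))         # query_layer for one head
    k = rng.standard_normal((T, H))         # key_layer for one head
    v = rng.standard_normal((T, H))         # value_layer for one head
    mask = np.array([[1, 1, 1, 0]] * F)     # the last "to" position is padding

    scores = q @ k.T / np.sqrt(H)           # [F, T], scaled dot product
    scores += (1.0 - mask) * -10000.0       # additive mask: padded positions become ~ -inf
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)    # softmax over the "to" axis
    context = probs @ v                     # [F, H], the per-head context_layer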
    
  • transformer_model
    This function stacks num_hidden_layers encoder blocks; each block runs attention_layer followed by a feed-forward sub-layer, with residual connections and layer normalization. A sketch of how the three functions are wired together follows the code below.

    def transformer_model(input_tensor,
                          attention_mask=None,
                          hidden_size=768,
                          num_hidden_layers=12,
                          num_attention_heads=12,
                          intermediate_size=3072,
                          intermediate_act_fn=gelu,
                          hidden_dropout_prob=0.1,
                          attention_probs_dropout_prob=0.1,
                          initializer_range=0.02,
                          do_return_all_layers=False):
      """Multi-headed, multi-layer Transformer from "Attention is All You Need".
    
      This is almost an exact implementation of the original Transformer encoder.
    
      See the original paper:
      https://arxiv.org/abs/1706.03762
    
      Also see:
      https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
    
      Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
          seq_length], with 1 for positions that can be attended to and 0 in
          positions that should not be.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers (blocks) in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. The size of the "intermediate" (a.k.a., feed
          forward) layer.
        intermediate_act_fn: function. The non-linear activation function to apply
          to the output of the intermediate/feed-forward layer.
        hidden_dropout_prob: float. Dropout probability for the hidden layers.
        attention_probs_dropout_prob: float. Dropout probability of the attention
          probabilities.
        initializer_range: float. Range of the initializer (stddev of truncated
          normal).
        do_return_all_layers: Whether to also return all layers or just the final
          layer.
    
      Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size], the final
        hidden layer of the Transformer.
    
      Raises:
        ValueError: A Tensor shape or parameter is invalid.
      """
      if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))
    
      attention_head_size = int(hidden_size / num_attention_heads)
      input_shape = get_shape_list(input_tensor, expected_rank=3)
      batch_size = input_shape[0]
      seq_length = input_shape[1]
      input_width = input_shape[2]
    
      # The Transformer performs sum residuals on all layers so the input needs
      # to be the same as the hidden size.
      if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))
    
      # We keep the representation as a 2D tensor to avoid re-shaping it back and
      # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
      # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
      # help the optimizer.
      prev_output = reshape_to_matrix(input_tensor)
    
      all_layer_outputs = []
      for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
          layer_input = prev_output
    
          with tf.variable_scope("attention"):
            attention_heads = []
            with tf.variable_scope("self"):
              attention_head = attention_layer(
                  from_tensor=layer_input,
                  to_tensor=layer_input,
                  attention_mask=attention_mask,
                  num_attention_heads=num_attention_heads,
                  size_per_head=attention_head_size,
                  attention_probs_dropout_prob=attention_probs_dropout_prob,
                  initializer_range=initializer_range,
                  do_return_2d_tensor=True,
                  batch_size=batch_size,
                  from_seq_length=seq_length,
                  to_seq_length=seq_length)
              attention_heads.append(attention_head)
    
            attention_output = None
            if len(attention_heads) == 1:
              attention_output = attention_heads[0]
            else:
              # In the case where we have other sequences, we just concatenate
              # them to the self-attention head before the projection.
              attention_output = tf.concat(attention_heads, axis=-1)
    
            # Run a linear projection of `hidden_size` then add a residual
            # with `layer_input`.
            with tf.variable_scope("output"):
              attention_output = tf.layers.dense(
                  attention_output,
                  hidden_size,
                  kernel_initializer=create_initializer(initializer_range))
              attention_output = dropout(attention_output, hidden_dropout_prob)
              attention_output = layer_norm(attention_output + layer_input)
    
          # The activation is only applied to the "intermediate" hidden layer.
          with tf.variable_scope("intermediate"):
            intermediate_output = tf.layers.dense(
                attention_output,
                intermediate_size,
                activation=intermediate_act_fn,
                kernel_initializer=create_initializer(initializer_range))
    
          # Down-project back to `hidden_size` then add the residual.
          with tf.variable_scope("output"):
            layer_output = tf.layers.dense(
                intermediate_output,
                hidden_size,
                kernel_initializer=create_initializer(initializer_range))
            layer_output = dropout(layer_output, hidden_dropout_prob)
            layer_output = layer_norm(layer_output + attention_output)
            prev_output = layer_output
            all_layer_outputs.append(layer_output)
    
      if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
          final_output = reshape_from_matrix(layer_output, input_shape)
          final_outputs.append(final_output)
        return final_outputs
      else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
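
    To show how the three functions fit together, here is a minimal wiring sketch in the same TensorFlow 1.x style. The placeholders and the embedding_output tensor are illustrative assumptions (BERT's embedding layers are not covered in this post), and the helpers used by the code above (gelu, dropout, layer_norm, create_initializer, get_shape_list, reshape_to_matrix, reshape_from_matrix) are assumed to be in scope from the same source file.

    # Illustrative wiring of the three functions (TF 1.x style, toy shapes).
    input_ids = tf.placeholder(tf.int32, shape=[8, 128])
    input_mask = tf.placeholder(tf.int32, shape=[8, 128])
    # Stand-in for the token + position + segment embeddings that precede the encoder.
    embedding_output = tf.placeholder(tf.float32, shape=[8, 128, 768])

    attention_mask = create_attention_mask_from_input_mask(input_ids, input_mask)   # [8, 128, 128]

    sequence_output = transformer_model(
        input_tensor=embedding_output,
        attention_mask=attention_mask,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072)
    # sequence_output: [8, 128, 768], the output of the final encoder layer.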
    