BERT Code Walkthrough: modeling.py

Start by cloning the official BERT repository from GitHub:

git clone  https://github.com/google-research/bert.git

The repository contains seven main Python files, and this series will work through all seven of them.

The first file to walk through is modeling.py, the core of the BERT implementation. It contains 2 classes and 17 functions, covered below.

I. Classes

1.class BertConfig(object):

class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,              # number of tokens in the vocabulary
               hidden_size=768,         # embedding dimension, also the width of the encoder and pooler layers
               num_hidden_layers=12,    # number of Transformer hidden layers
               num_attention_heads=12,  # number of attention heads per encoder layer
               intermediate_size=3072,  # width of the intermediate (feed-forward) layer in the encoder
               hidden_act="gelu",       # activation used in the encoder and pooler
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):

    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):  # reads the config parameters from a Python dict
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
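To make the (de)serialization helpers above concrete, here is a small usage sketch; "bert_config.json" is only a placeholder for the config file shipped with a pre-trained checkpoint.

config = BertConfig(vocab_size=32000)        # every other field keeps its default value
print(config.to_json_string())               # dumps all eleven parameters as sorted JSON

config2 = BertConfig.from_dict({"vocab_size": 32000, "hidden_size": 256,
                                "num_attention_heads": 4})
assert config2.hidden_size == 256            # dict keys simply overwrite the attributes

# config3 = BertConfig.from_json_file("bert_config.json")   # load from a checkpoint's config file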

2.class BertModel(object):

class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").
  def __init__(self,
               config,#上边第一个类BertConfig的实例
               is_training,#控制是否训练模型
               input_ids,#int32位的Tensor,维度为[batch_size, seq_length]
               input_mask=None,#可选,int32位的Tensor,维度为[batch_size, seq_length]
               token_type_ids=None,#可选,int32位,维度为[batch_size, seq_length]
               use_one_hot_embeddings=False,#可选,是否用one-hot,或使用tf.embedding_lookup()的预训练词向量
               scope=None#变量可用的范围,默认是整个bert):
#当config实例错误或输入维度错误时,提示ValueError

    config = copy.deepcopy(config)
    if not is_training:  # if is_training is False, set dropout to 0
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:  # if no mask is provided, set everything to 1 (attend to every position)
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:  # if no token types are provided, treat everything as segment 0
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]
      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table
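The accessor methods above are what downstream code actually consumes. A minimal sketch, assuming `model` is a BertModel instance built as in the example at the end of this post:

sequence_output = model.get_sequence_output()   # [batch_size, seq_length, hidden_size]
pooled_output = model.get_pooled_output()       # [batch_size, hidden_size], the transformed [CLS] vector
all_layers = model.get_all_encoder_layers()     # list with num_hidden_layers tensors

# A common feature-based setup is to concatenate the last few encoder layers:
last_four = tf.concat(all_layers[-4:], axis=-1) # [batch_size, seq_length, 4 * hidden_size]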

II. The 17 helper functions

1.def gelu(x)

def gelu(x):
  """Gaussian Error Linear Unit.#高斯错误线性单元,GELU(x)=xP(X<=x)=xΦ(x)

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415
  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.tanh(
      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
  return x * cdf
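As a quick sanity check (pure Python, not part of modeling.py), the tanh expression above can be compared against the exact erf-based definition GELU(x) = x * 0.5 * (1 + erf(x / sqrt(2))):

import math

def gelu_exact(x):
  return x * 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
  return x * 0.5 * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3)))

xs = [i * 0.1 for i in range(-40, 41)]
print(max(abs(gelu_exact(x) - gelu_tanh(x)) for x in xs))   # small: the two curves nearly coincide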

2.def get_activation(activation_string)

def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
#将激活函数的string名称变为一个函数
  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)

3.def get_assignment_map_from_checkpoint(tvars, init_checkpoint)

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
# Compute the union of the current graph variables and the checkpoint variables.
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}
# An OrderedDict that preserves insertion order.
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)
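For context, this is roughly how the function is consumed elsewhere in the repository (e.g. in run_classifier.py); `init_checkpoint` is a placeholder path to a pre-trained BERT checkpoint:

tvars = tf.trainable_variables()
(assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(
    tvars, init_checkpoint)
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
# Variables whose names exist in the checkpoint are restored from it;
# everything else keeps its random initialization.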

4.def dropout(input_tensor, dropout_prob)

def dropout(input_tensor, dropout_prob):
  """Perform dropout.
  (dropout_prob is the probability of *dropping* a value; tf.nn.dropout expects keep_prob, so the code below passes 1 - dropout_prob.)
  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
   #tf.nn.dropout(x, keep_prob, noise_shape=None, seed=None, name=None)
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output

5.def layer_norm(input_tensor, name=None)

def layer_norm(input_tensor, name=None):
# Layer normalization (not the more familiar batch norm): each token's embedding is normalized over its last dimension.
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

6.def layer_norm_and_dropout(input_tensor, dropout_prob, name=None)

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
# Apply layer norm to the input first, then dropout, and return the result.
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor

7.def create_initializer(initializer_range=0.02)

def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
# Samples from a truncated normal distribution: values more than two standard deviations
# from the mean are discarded and redrawn.
# Here the mean is 0 (the default) and the standard deviation is initializer_range, 0.02 by default.
  return tf.truncated_normal_initializer(stddev=initializer_range)

8.def embedding_lookup

# Looks up the embedding vectors for the given word ids.
def embedding_lookup(input_ids,       # int32 Tensor of shape [batch_size, seq_length] containing word ids
                     vocab_size,      # number of words in the corpus vocabulary
                     embedding_size=128,      # dimensionality of the embeddings
                     initializer_range=0.02,  # stddev used to initialize the embedding table
                     word_embedding_name="word_embeddings",  # name of the embedding table
                     use_one_hot_embeddings=False):  # if False, tf.gather() is used instead of one-hot matmul
  """Looks up word embeddings for a tensor of ids.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # tf.gather: given a 1-D index array, extracts the corresponding rows from a tensor.
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])
  # tf.get_variable(name, shape, initializer): name is the variable name, shape its dimensions,
  # and initializer how it is initialized. Common initializers include:
  # tf.constant_initializer (constant), tf.random_normal_initializer (normal),
  # tf.truncated_normal_initializer (truncated normal), tf.random_uniform_initializer (uniform),
  # tf.zeros_initializer (all zeros), tf.ones_initializer (all ones), and
  # tf.uniform_unit_scaling_initializer (uniform without changing the output's order of magnitude).

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))  # initialized from a truncated normal distribution
  # tf.reshape(tensor, shape, name=None)
  flat_input_ids = tf.reshape(input_ids, [-1])  # flatten the ids into a single row
  if use_one_hot_embeddings:
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    # each id becomes a one-hot vector; shape [num_ids, vocab_size]
    output = tf.matmul(one_hot_input_ids, embedding_table)
    # output has shape [num_ids, embedding_size]
  else:
    output = tf.gather(embedding_table, flat_input_ids)
    # without one-hot, the corresponding rows are taken directly from embedding_table;
    # the result also has shape [num_ids, embedding_size]
  input_shape = get_shape_list(input_ids)  # input_ids is 3-D here, so this is [batch_size, seq_length, 1]

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  # reshape back to [batch_size, seq_length, num_inputs * embedding_size];
  # for 2-D input_ids, num_inputs is 1, so this is [batch_size, seq_length, embedding_size]
  return (output, embedding_table)
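The two branches produce identical results; a small NumPy sketch (outside TensorFlow) makes the equivalence of the one-hot matmul and the gather lookup explicit:

import numpy as np

vocab_size, embedding_size = 6, 4
embedding_table = np.random.randn(vocab_size, embedding_size)
flat_input_ids = np.array([3, 0, 5, 3])

gathered = embedding_table[flat_input_ids]        # the tf.gather path: pick rows by index
one_hot = np.eye(vocab_size)[flat_input_ids]      # [4, vocab_size] one-hot matrix
matmul = one_hot @ embedding_table                # the one-hot path: matmul selects the same rows
assert np.allclose(gathered, matmul)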

9.def get_shape_list(tensor, expected_rank=None, name=None)

def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape
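A small TF1 graph-mode sketch of what "preferring static dimensions" means: dimensions unknown at graph-build time come back as scalar tensors, fully known ones as plain Python ints.

x = tf.placeholder(tf.int32, shape=[None, 128])   # batch size unknown, seq_length fixed
shape = get_shape_list(x, expected_rank=2)
# shape[1] is the Python integer 128 (static),
# shape[0] is a scalar int32 Tensor taken from tf.shape(x) (dynamic).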

10.def assert_rank(tensor, expected_rank, name=None)

def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if name is None:
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))

11.def embedding_postprocessor

# Applies further processing to a word-embedding tensor (segment and position embeddings, layer norm, dropout).
def embedding_postprocessor(input_tensor,   # float Tensor of shape [batch_size, seq_length, embedding_size]
                            use_token_type=False,       # whether to add token-type (segment) embeddings
                            token_type_ids=None,        # int32 Tensor of shape [batch_size, seq_length]
                            token_type_vocab_size=16,   # vocabulary size of token_type_ids
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,   # whether to add position embeddings
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,     # stddev for weight initialization
                            max_position_embeddings=512,    # maximum sequence length; may be much larger than the real length, but never smaller
                            dropout_prob=0.1):          # dropout applied to the final output

  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor
  # Segment embeddings: a single sentence uses one segment id, a sentence pair uses two.
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))  # a table with token_type_vocab_size (16) embedding rows
    # One-hot lookup is used here: the segment vocabulary is tiny, so one-hot matmul is fast.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Position embeddings
  if use_position_embeddings:
    # check that seq_length does not exceed max_position_embeddings
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # To speed things up, the position-embedding table is created once with max_position_embeddings rows,
      # and tf.slice then extracts only the first seq_length rows.
      # tf.slice(input_, begin, size, name=None)
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
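A NumPy sketch of the position-embedding broadcast above: reshaping the sliced table to [1, seq_length, width] lets a single copy be added to every example in the batch.

import numpy as np

batch_size, seq_length, width, max_position_embeddings = 2, 5, 8, 512
output = np.zeros((batch_size, seq_length, width))
full_position_embeddings = np.random.randn(max_position_embeddings, width)

position_embeddings = full_position_embeddings[:seq_length]          # the tf.slice step
position_embeddings = position_embeddings.reshape(1, seq_length, width)
output = output + position_embeddings       # broadcast across the batch dimension
assert output.shape == (batch_size, seq_length, width)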

12.def create_attention_mask_from_input_mask(from_tensor, to_mask)

def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create a 3D attention mask from a 2D tensor mask.
  (After padding, some positions carry no information and must not take part in the attention computation.)

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]
  # cast the mask to float32
  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding)
  # tokens so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
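A NumPy sketch of the broadcast: a [batch_size, 1, to_seq_length] mask times a [batch_size, from_seq_length, 1] block of ones gives the full [batch_size, from_seq_length, to_seq_length] mask, zeroing every column that corresponds to a padded token.

import numpy as np

to_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype=np.float32)   # an input_mask with one padded position
batch_size, seq_length = to_mask.shape
broadcast_ones = np.ones((batch_size, seq_length, 1), dtype=np.float32)
mask = broadcast_ones * to_mask.reshape(batch_size, 1, seq_length)
print(mask.shape)    # (2, 3, 3); mask[1, :, 2] is all zeros because the third token of the second example is padding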

13.def attention_layer

# Multi-head attention.
def attention_layer(from_tensor,   # [batch_size, from_seq_length, from_width]
                    to_tensor,     # [batch_size, to_seq_length, to_width]
                    attention_mask=None,   # shape [batch_size, from_seq_length, to_seq_length], values 0 or 1;
                                           # where the mask is 0 the attention score is pushed to (effectively) minus infinity,
                                           # where it is 1 the score is left unchanged
                    num_attention_heads=1,   # number of attention heads
                    size_per_head=512,
                    query_act=None,   # activation for the query transform
                    key_act=None,     # activation for the key transform
                    value_act=None,   # activation for the value transform
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,   # True: output shape [batch_size * from_seq_length, num_attention_heads * size_per_head];
                                                 # False: [batch_size, from_seq_length, num_attention_heads * size_per_head]
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):

  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  1. If `from_tensor` and `to_tensor` are the same, this is self-attention.
     Each timestep in `from_tensor` attends to the corresponding sequence in
     `to_tensor` and returns a fixed-width vector. `from_tensor` is projected
     into a "query" tensor and `to_tensor` into "key" and "value" tensors,
     each with per-head shape [batch_size, seq_length, size_per_head].
  2. The query and key tensors are dot-producted and scaled, passed through a
     softmax, and multiplied by the value tensors; the heads are then
     concatenated back into a single tensor, which is returned.
  3. In practice the multi-head split is implemented with transposes and
     reshapes rather than by actually slicing the tensors.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).
  """

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Shorthand for the scalars below:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # For reference, the full signature of tf.layers.dense is:
  # tf.layers.dense(inputs, units, activation=None, use_bias=True,
  #     kernel_initializer=None, bias_initializer=tf.zeros_initializer(),
  #     kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
  #     kernel_constraint=None, bias_constraint=None, trainable=True,
  #     name=None, reuse=None)

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
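Putting the pieces together, here is a NumPy sketch of the core computation for a single head: scaled dot-product scores, the -10000.0 additive mask, a softmax, and the weighted sum over the values.

import numpy as np

F, T, H = 3, 3, 4                                   # from/to lengths and size_per_head
q = np.random.randn(F, H)
k = np.random.randn(T, H)
v = np.random.randn(T, H)
to_mask = np.array([1, 1, 0], dtype=np.float32)     # the last "to" position is padding

scores = q @ k.T / np.sqrt(H)                       # raw attention scores, [F, T]
scores += (1.0 - to_mask) * -10000.0                # masked columns become effectively -inf
probs = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)   # softmax over T
context = probs @ v                                 # [F, H], the per-head context vectors
print(probs[:, 2])                                  # ~0 everywhere: no attention is paid to padding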

14.def reshape_to_matrix(input_tensor)

def reshape_to_matrix(input_tensor):
# A rank-2 tensor is returned unchanged; higher-rank tensors are reshaped to [-1, width].
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor

15.def reshape_from_matrix(output_tensor, orig_shape_list)

def reshape_from_matrix(output_tensor, orig_shape_list):
# Restores a rank-2 tensor to its original (rank >= 2) shape.
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor
  output_shape = get_shape_list(output_tensor)
  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])

16.def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width)

def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         seq_length, width):
  # Reshape [batch_size * seq_length, num_heads * width] to
  # [batch_size, seq_length, num_heads, width], then transpose to
  # [batch_size, num_heads, seq_length, width].
  output_tensor = tf.reshape(
      input_tensor, [batch_size, seq_length, num_attention_heads, width])

  output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
  return output_tensor
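This is the "multi-head via reshape and transpose" trick noted in the attention_layer description; a NumPy shape check:

import numpy as np

B, S, N, H = 2, 3, 4, 5                             # batch, seq_length, heads, size_per_head
x = np.random.randn(B * S, N * H)                   # e.g. query_layer as a [B*S, N*H] matrix
y = x.reshape(B, S, N, H).transpose(0, 2, 1, 3)     # -> [B, N, S, H], heads become an explicit axis
print(y.shape)                                      # (2, 4, 3, 5)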

17.def transformer_model

def transformer_model(input_tensor,    # [batch_size, seq_length, hidden_size]
                      attention_mask=None,   # shape [batch_size, seq_length, seq_length]
                      hidden_size=768,       # hidden size of the Transformer
                      num_hidden_layers=12,  # number of Transformer blocks
                      num_attention_heads=12,
                      intermediate_size=3072,  # size of the intermediate (feed-forward) layer
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):  # return all layers, or only the final one
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".
  This is the encoder part of the Transformer.
  Also see:
 https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer adds residual connections across all its layers, so the input width must match the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))
  prev_output = reshape_to_matrix(input_tensor)  # flatten to a 2-D matrix

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output
      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
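As a rough consistency check of the layer structure above, the parameters of one encoder block can be counted by hand from the dense layers and layer norms it creates (BERT-base sizes; embeddings and the pooler are not included). This is just arithmetic, not code from the repository.

hidden_size, intermediate_size, num_layers = 768, 3072, 12
qkv = 3 * (hidden_size * hidden_size + hidden_size)        # query/key/value projections (weights + biases)
attn_out = hidden_size * hidden_size + hidden_size         # attention output projection
ffn = (hidden_size * intermediate_size + intermediate_size +
       intermediate_size * hidden_size + hidden_size)      # intermediate dense + output dense
layer_norms = 2 * 2 * hidden_size                          # two LayerNorms per block (gamma and beta)
per_layer = qkv + attn_out + ffn + layer_norms
print(per_layer, num_layers * per_layer)                   # roughly 7.1M per block, about 85M for 12 blocks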

The second class, BertModel, is the interface to the whole module: by calling the classes and functions above, it produces the final outputs.

Reference: https://mp.weixin.qq.com/s/rxJ0jAFKsP6ByWeVv6Tr5Q

Example usage:

# Assume the input has already been tokenized into word ids. shape = [2, 3]
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
# Segment embedding ids: in the first example the first two tokens belong to sentence 1
# and the last token to sentence 2; in the second example the first token belongs to
# sentence 1, the second to sentence 2, and the final 0 is padding.
token_type_ids = tf.constant([[0, 0, 1], [0, 1, 0]])

# Create a BertConfig instance.
# (hidden_size must be divisible by num_attention_heads, so 8 heads are used here.)
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
         num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)

# Create a BertModel instance.
model = modeling.BertModel(config=config, is_training=True,
     input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

label_embeddings = tf.get_variable(...)
# The pooled output is the final-layer vector of the first token ([CLS]);
# it can be viewed as an embedding of the whole sentence.
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)

 

