Bert实战--阅读理解(一)

run_squad解读

SquadExample

class SquadExample(object):
  """A single SQuAD question/answer example, before tokenization into features.

  The constructor stores every argument unchanged on the instance.

  Args:
    qas_id: Identifier of the question/answer entry.
    question_text: The question string.
    doc_tokens: The paragraph split into whitespace-delimited tokens.
    orig_answer_text: The original answer text, or None when not provided.
    start_position: Index into `doc_tokens` of the first answer token, or
      None when not provided.
    end_position: Index into `doc_tokens` of the last answer token
      (inclusive), or None when not provided.
    is_impossible: True when the question has NO answer in the paragraph.
  """

  def __init__(self,
               qas_id,
               question_text,
               doc_tokens,
               orig_answer_text=None,
               start_position=None,
               end_position=None,
               is_impossible=False):
    self.qas_id = qas_id  # ID of the question/answer entry
    self.question_text = question_text  # the question
    self.doc_tokens = doc_tokens  # paragraph text as a list of tokens
    self.orig_answer_text = orig_answer_text  # original answer text
    self.start_position = start_position  # start position of the answer
    self.end_position = end_position  # end position of the answer
    # True means the question is unanswerable (there is no answer).
    self.is_impossible = is_impossible

定义了一个SQuAD输入样本的格式。

InputFeatures

class InputFeatures(object):
  """A single set of model input features derived from one example.

  The constructor stores every argument unchanged on the instance.

  Args:
    unique_id: Identifier of this feature.
    example_index: Index of the example this feature was built from; links
      the feature back to its example.
    doc_span_index: Index of the document span this feature covers.
    tokens: The tokens of this feature.
    token_to_orig_map: For each token, its index in the original doc tokens.
    token_is_max_context: For each token position, whether this span is the
      one giving that token the most complete context.
    input_ids: The input token IDs.
    input_mask: The input mask (presumably 1 for real tokens and 0 for
      padding -- confirm against the code that builds the features).
    segment_ids: IDs marking which sentence/segment each input token
      belongs to.
    start_position: Start position of the answer, or None. Per the original
      note this is a position within the input sequence, so it is offset by
      the question length.
    end_position: End position of the answer, or None.
    is_impossible: True when the question has NO answer; None when unset.
  """

  def __init__(self,
               unique_id,
               example_index,
               doc_span_index,
               tokens,
               token_to_orig_map,
               token_is_max_context,
               input_ids,
               input_mask,
               segment_ids,
               start_position=None,
               end_position=None,
               is_impossible=None):
    self.unique_id = unique_id  # ID of this feature
    self.example_index = example_index  # maps the feature to its example
    self.doc_span_index = doc_span_index  # index of the covered doc span
    self.tokens = tokens  # tokens of this feature
    self.token_to_orig_map = token_to_orig_map  # token -> original doc token index
    self.token_is_max_context = token_is_max_context  # is this the token's best-context span
    self.input_ids = input_ids  # input token IDs
    self.input_mask = input_mask  # input mask
    self.segment_ids = segment_ids  # which sentence/segment each token belongs to
    # Answer start within the input sequence (includes the question length).
    self.start_position = start_position
    self.end_position = end_position  # answer end position
    # True means the question is unanswerable (there is no answer).
    self.is_impossible = is_impossible
    

定义了输入的样本特征

read_squad_examples

def read_squad_examples(input_file, is_training):
  """Read a SQuAD json file into a list of SquadExample."""
  with tf.gfile.Open(input_file, "r") as reader:
    input_data = json.load(reader)["data"]

  def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
      return True
    return False

  examples = []
  for entry in input_data:
  #entry中有title 和 paragraph
    for paragraph in entry["paragraphs"]:  #遍历这篇文章的每一个段落
      paragraph_text = paragraph["context"]  #这个段落的文本内容
      doc_tokens = []   #文本的token,相当于split操作
      char_to_word_offset = []  #列表,存储每个字母中属于第几个单词
      prev_is_whitespace = True
      for c in paragraph_text:  #遍历段落中每一个字母,按照一个字母一个字母加入到doc_tokens
        if is_whitespace(c):
          prev_is_whitespace = True
        else:
          if prev_is_whitespace:
            doc_tokens.append(c)
          else:
            doc_tokens[-1] += c
          prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

      for qa in paragraph["qas"]:  #遍历每个qa
        qas_id = qa["id"]
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if is_training:

          if FLAGS.version_2_with_negative:
            is_impossible = qa["is_impossible"]
          if (len(qa["answers"]) != 1) and (not is_impossible):
            raise ValueError(
                "For training, each question should have exactly 1 answer.")
          if not is_impossible:
            answer = qa["answers"][0]
            orig_answer_text = answer["text"]
            answer_offset = answer["answer_start"]
            answer_length = len(orig_answer_text)
            start_position = char_to_word_offset[answer_offset]
            end_position = char_to_word_offset[answer_offset + answer_length -
                                               1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(
                doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
  
  • 5
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值