run_squad解读
SquadExample
class SquadExample(object):
    """A single training/test example for the SQuAD dataset.

    For examples without an answer (SQuAD v2.0 negatives), the start and end
    positions are None and ``is_impossible`` is True.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        # NOTE: the original paste had a full-width period `。` after
        # `qas_id` which is a Python syntax error; fixed here.
        self.qas_id = qas_id                    # unique ID of the QA pair
        self.question_text = question_text      # the question string
        self.doc_tokens = doc_tokens            # whitespace-split document tokens
        self.orig_answer_text = orig_answer_text  # original answer text (None at test time)
        self.start_position = start_position    # answer start index into doc_tokens
        self.end_position = end_position        # answer end index into doc_tokens
        self.is_impossible = is_impossible      # True when the question has no answer
定义了一个输入样本Squad的格式
InputFeatures
class InputFeatures(object):
    """A single set of model-ready features for one document span of one example.

    One ``SquadExample`` may produce several ``InputFeatures`` (one per sliding
    document span); ``example_index`` links a feature back to its example.
    """

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id          # unique ID of this feature
        self.example_index = example_index  # index of the source example (feature -> example link)
        self.doc_span_index = doc_span_index  # which sliding doc span this feature covers
        self.tokens = tokens                # wordpiece tokens of [CLS] question [SEP] span [SEP]
        self.token_to_orig_map = token_to_orig_map  # span token index -> index in original doc_tokens
        self.token_is_max_context = token_is_max_context  # whether this span gives the token its fullest context
        self.input_ids = input_ids          # token IDs fed to the model
        self.input_mask = input_mask        # 1 for real tokens, 0 for padding
        # NOTE: the original paste had a full-width period `。` after
        # `segment_ids` which is a Python syntax error; fixed here.
        self.segment_ids = segment_ids      # sentence-A/B segment IDs
        self.start_position = start_position  # answer start index in this input span (offset by question length)
        self.end_position = end_position    # answer end index in this input span
        self.is_impossible = is_impossible  # whether the example has no answer
定义了输入的样本特征
read_squad_examples
def read_squad_examples(input_file, is_training):
"""Read a SQuAD json file into a list of SquadExample."""
with tf.gfile.Open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
#entry中有title 和 paragraph
for paragraph in entry["paragraphs"]: #遍历这篇文章的每一个段落
paragraph_text = paragraph["context"] #这个段落的文本内容
doc_tokens = [] #文本的token,相当于split操作
char_to_word_offset = [] #列表,存储每个字母中属于第几个单词
prev_is_whitespace = True
for c in paragraph_text: #遍历段落中每一个字母,按照一个字母一个字母加入到doc_tokens
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]: 遍历每个qa
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if FLAGS.version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length -
1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(