torchtext结构总览
图片来源:https://mp.weixin.qq.com/s/1T8peCd8IQT5XmZf68DhwQ
数据格式(以MRC为例)
{"id": "56dfa01738dc42170015211f",
"context": "Tesla went on to pursue his ideas of wireless lighting and electricity distribution in his high-voltage, high-frequency power experiments in New York and Colorado Springs, and made early (1893) pronouncements on the possibility of wireless communication with his devices. He tried to put these ideas to practical use in an ill-fated attempt at intercontinental wireless transmission, his unfinished Wardenclyffe Tower project. In his lab he also conducted a range of experiments with mechanical oscillators/generators, electrical discharge tubes, and early X-ray imaging. He also built a wireless controlled boat, one of the first ever exhibited.",
"question": "What were some of Tesla's experiments?",
"answer": "high-voltage, high-frequency power",
"s_idx": 15,
"e_idx": 18
}
以上样例保存在 ./data/test_data.jsonl 中(JSON Lines 格式:文件里每条样例必须单独占一行,上面只是为了便于阅读才换行展示)
代码样例
import json
import nltk
from torchtext import data
def word_tokenize(tokens):
    """Tokenize a text string into words with NLTK and normalize quote tokens.

    Args:
        tokens: The raw text to split (despite the name, this is the input
            passed straight to ``nltk.word_tokenize``).

    Returns:
        A list of token strings in which every occurrence of ``''`` or
        two backticks has been replaced by a plain double quote.
    """
    normalized = []
    for piece in nltk.word_tokenize(tokens):
        # Collapse both two-character quote forms into a single '"'.
        piece = piece.replace("''", '"')
        piece = piece.replace("``", '"')
        normalized.append(piece)
    return normalized
# Field for the example id; a RawField is used so the value is carried as-is.
RAW = data.RawField()
# NOTE(review): presumably set by hand because some torchtext versions do not
# define `is_target` on RawField -- confirm against the installed version.
RAW.is_target = False

# Word-level field. It is referenced by `dict_field` and by the vocab-building
# step but had no definition, which raised NameError. It shares the tokenizer
# used by CHAR so word and character sequences line up token-for-token.
# NOTE(review): batch_first/lower mirror CHAR_NESTING; adjust if the model
# expects different settings.
WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True)

# Character-level field. It must be nested: the outer field tokenizes the text
# into words, then the inner field applies `list` to each word to get its
# characters.
CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
CHAR = data.NestedField(CHAR_NESTING, tokenize=word_tokenize)

# Label field (start / end position of the reading-comprehension answer).
# The values are already integers, so no vocabulary is used.
LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False)

# Map every key of a JSON record to (attribute name on the Example, Field).
# A key that feeds several fields maps to a list of such pairs.
dict_field = {
    'id': ('data_id', RAW),
    's_idx': ('data_s_idx', LABEL),
    'e_idx': ('data_e_idx', LABEL),
    'context': [('data_c_word', WORD), ('data_c_char', CHAR)],
    'question': [('data_q_word', WORD), ('data_q_char', CHAR)],
}
from torchtext.data.example import Example
# Load the JSON Lines file: one JSON object per line.
# The context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps decoding independent of the platform default.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# because json.loads('') raises an error.
with open('./data/test_data.jsonl', encoding='utf-8') as jsonl_file:
    test_data = [json.loads(line.strip()) for line in jsonl_file if line.strip()]
# Show how a single raw record is turned into an Example by the field mapping.
print('处理一条样例的例子:',Example.fromdict(test_data[0],dict_field).__dict__)
使用 data.TabularDataset 将 JSON 文件中的每一行处理成一个 Example
# Build the train and validation datasets from JSON files under ./data.
# Both splits point at the same sample file here; every record is converted
# to an Example according to `dict_field`.
train, dev = data.TabularDataset.splits(
    fields=dict_field,
    format='json',
    path='./data',
    train='test_data.jsonl', validation='test_data.jsonl',
)

# Inspect the result: the dataset object itself, then the attributes stored
# on its first Example.
print('data.TabularDataset.splits处理train与test后:')
print('dev:', dev)
print('dev[0]:', vars(dev.examples[0]))
构造vocab
# Build the vocabularies from both splits. The two calls are independent of
# each other: one for the word-level field, one for the character-level field.
WORD.build_vocab(train, dev)
CHAR.build_vocab(train, dev)