2021SC@SDUSC
下面分析data.py程序
这里定义了DataLoader类,根据for_train参数分三种情况(训练、开发、测试),从对应的jsonl文件中读取数据,用于构建图对象。
class DataLoader(object):
    """Hold the examples of one data split plus the settings used with them.

    The split is selected by ``for_train``:

    * a value equal to ``True`` reads ``args.train_data_jsonl``;
    * the string ``'Eval'`` reads ``args.test_data_jsonl`` and the loader is
      then marked as non-training (``self.train`` is ``False``);
    * anything else reads ``args.dev_data_jsonl``.
    """

    def __init__(self, args, vocabs, lex_map, filename, batch_size, for_train):
        """Read the selected split and remember the loader configuration.

        Args:
            args: Object exposing ``train_data_jsonl``, ``test_data_jsonl``
                and ``dev_data_jsonl``; only the attribute for the selected
                split is accessed here.
            vocabs: Stored unchanged on ``self.vocabs``.
            lex_map: Stored unchanged on ``self.lex_map``.
            filename: Stored unchanged on ``self.filename``; it does not
                influence which file is read in this constructor.
            batch_size: Stored unchanged on ``self.batch_size``.
            for_train: Split selector, see the class docstring.
        """
        # Equality (not identity) is intentional: values that compare equal
        # to True, such as 1, also select the training split.
        if for_train == True:  # noqa: E712
            jsonl_path = args.train_data_jsonl
        elif for_train == 'Eval':
            jsonl_path = args.test_data_jsonl
            # The test split is handled as a non-training loader from here on.
            for_train = False
        else:
            jsonl_path = args.dev_data_jsonl
        self.examples = read_jsonl(jsonl_path)

        self.args = args
        self.vocabs = vocabs
        self.lex_map = lex_map
        self.filename = filename
        self.batch_size = batch_size
        self.train = for_train

        # Initial state: no unknown-token replacement, record flag off.
        self.unk_rate = 0.
        self.record_flag = False
该程序中还定义了格式转换函数,下面是将列表转换为张量(Tensor)的函数:
def ListsToTensor(xs, vocab=None, local_vocabs=None, unk_rate=0.):
    """Pad a batch of lists to a common length and pack it into a LongTensor.

    Args:
        xs: Sequence of lists, one per row of the batch. With ``vocab`` set
            the items are tokens to be mapped to indices; without it the
            items are stored as given.
        vocab: Optional object providing ``padding_idx``, ``unk_idx`` and
            ``token2idx(token)``. When ``None``, rows are padded with ``0``.
        local_vocabs: Optional sequence with one entry per row. An entry is
            either ``None`` or a mapping that is consulted before ``vocab``
            for tokens of that row.
        unk_rate: Probability with which each token is replaced by
            ``vocab.unk_idx`` (only applies when ``vocab`` is given).

    Returns:
        A contiguous ``torch.LongTensor`` holding the padded rows transposed;
        for flat rows its shape is ``(max_len, len(xs))``. An empty ``xs``
        yields an empty tensor.
    """
    # Use the same "is None" test as the lookup helper below so padding and
    # index conversion always agree on whether a vocabulary is in use.
    pad = vocab.padding_idx if vocab is not None else 0

    def to_index(w, row):
        # Map one token (or, recursively, a nested list of tokens) to indices.
        if vocab is None:
            return w
        if isinstance(w, list):
            return [to_index(item, row) for item in w]
        if random.random() < unk_rate:
            return vocab.unk_idx
        if local_vocabs is not None:
            local_vocab = local_vocabs[row]
            if local_vocab is not None and w in local_vocab:
                return local_vocab[w]
        return vocab.token2idx(w)

    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(x) for x in xs), default=0)
    ys = [
        to_index(x, i) + [pad] * (max_len - len(x))
        for i, x in enumerate(xs)
    ]
    # Rows are built batch-first; transpose so the position axis comes first.
    return torch.LongTensor(ys).t_().contiguous()
将数组转换为张量(Tensor):
def ArraysToTensor(xs):
    """Zero-pad NumPy arrays to a common shape and stack them into a tensor.

    Args:
        xs: Non-empty sequence of NumPy arrays that all have the same number
            of dimensions; their sizes along each dimension may differ.

    Returns:
        A ``torch.LongTensor`` of shape ``(len(xs), *max_shape)`` where
        ``max_shape`` is the element-wise maximum of the input shapes. Each
        array sits at the origin of its slot; the remainder is zero.
    """
    shapes = np.array([list(x.shape) for x in xs])
    batch_shape = [len(xs)] + list(shapes.max(axis=0))
    # np.int was removed in NumPy 1.24; int64 matches the final .long().
    data = np.zeros(batch_shape, dtype=np.int64)
    for i, x in enumerate(xs):
        # Address only the leading corner of slot i that x actually covers.
        region = (i,) + tuple(slice(0, dim) for dim in x.shape)
        data[region] = x
    return torch.from_numpy(data).long()