1. VQAFeatureDataset
此类是ReGat项目对torch自带的from torch.utils.data import Dataset
的重写,是模型运行的时候训练集和测试集的加载,加载的数据是模型forward函数的参数。如下:
regat.forward():
def forward(self, v, b, q, implicit_pos_emb, sem_adj_matrix,
            spa_adj_matrix, labels):
    """Forward
    v: [batch, num_objs, obj_dim]
    b: [batch, num_objs, b_dim]
    q: [batch_size, seq_length]
    pos: [batch_size, num_objs, nongt_dim, emb_dim]
    sem_adj_matrix: [batch_size, num_objs, num_objs, num_edge_labels]
    spa_adj_matrix: [batch_size, num_objs, num_objs, num_edge_labels]
    return: logits, not probs
    """
    # Question encoding: word embeddings -> per-token sequence encoding
    # -> single self-attended question vector.
    word_emb = self.w_emb(q)
    question_seq = self.q_emb.forward_all(word_emb)  # [batch, q_len, q_dim]
    question_att = self.q_att(question_seq)

    # Select the relation input matching the configured relation type,
    # then encode the visual features with it.
    # Result: [batch_size, num_rois, out_dim]
    if self.relation_type == "semantic":
        relation_input = sem_adj_matrix
    elif self.relation_type == "spatial":
        relation_input = spa_adj_matrix
    else:  # implicit relation
        relation_input = implicit_pos_emb
    v_emb = self.v_relation.forward(v, relation_input, question_att)

    # Fuse the visual and question representations. Note: butd re-encodes
    # the question to a single vector ([batch, q_dim]) only in its branch.
    if self.fusion == "ban":
        joint_emb, att = self.joint_embedding(v_emb, question_seq, b)
    elif self.fusion == "butd":
        joint_emb, att = self.joint_embedding(v_emb, self.q_emb(word_emb))
    else:  # mutan
        joint_emb, att = self.joint_embedding(v_emb, question_att)

    # Without a classifier head, the joint embedding is returned as logits.
    logits = self.classifier(joint_emb) if self.classifier else joint_emb
    return logits, att
VQAFeatureDataset
self中的变量
变量名 | 含义 | 来源 | 值 |
---|---|---|---|
self.ans2label | 单词-索引表示:字典 | trainval_ans2label.pkl | {‘net’: 0, ‘pitcher’: 1, ‘orange’: 2, ‘yes’: 3, ‘white’: 4,… |
self.label2ans | 索引-单词表示:列表 | trainval_label2ans.pkl | [‘net’, ‘pitcher’, ‘orange’, ‘yes’, ‘white’,. |
self.num_ans_candidates | 答案单词候选数:int | len(self.ans2label) | 3129 |
self.img_id2idx | 图像id-索引表示:字典 | imgid2idx.pkl | {218224: 0, 306670: 1, 208663: 2, 225177: 3, 467257: 4, . |
self.features | 图像特征:Tensor | hf.get(‘image_features’) | tensor[40504,36,2048] |
self.normalized_bb | 标准化区域边界框空间位置:Tensor | hf.get(‘spatial_features’) | tensor([40504, 36, 4]) |
self.bb | 区域边界框位置:Tensor | hf.get(‘image_bb’) | tensor[40504,36,4] |
self.semantic_adj_matrix | 语义邻接矩阵 | 如果在hf的键中:hf.get(‘semantic_adj_matrix’) ,不在=None | |
self.spatial_adj_matrix | 空间邻接矩阵 | 如果在hf的键中:hf.get(‘image_adj_matrix’) ,不在=None | |
self.pos_boxes | None | None | None |
self.entries | 数据条目,items:list | _load_dataset(dataroot, name, self.img_id2idx,self.label2ans) | 长度214354 |
self.nongt_dim | 每幅图像保留的候选区域数目上限:int | 构造参数 nongt_dim | 36 |
self.emb_dim | 位置嵌入维度 | pos_emb_dim | 64 |
self.v_dim | 图像特征嵌入维度 | self.features.size(1 if self.adaptive else 2) | 2048 |
self.s_dim | 空间特征(边界框)维度 | self.normalized_bb.size(1 if self.adaptive else 2) | 6 |
class VQAFeatureDataset(Dataset):
    """VQA dataset serving pre-extracted bottom-up-attention image features.

    Each item pairs one tokenized question with the features of its image;
    the returned tuple feeds the model's ``forward`` positionally.

    Args:
        name: dataset split, one of 'train', 'val', 'test-dev2015',
            'test2015'.
        dictionary: question tokenizer; must provide ``tokenize(text, bool)``
            and ``padding_idx``.
        relation_type: 'semantic', 'spatial' or implicit (anything else) —
            controls which adjacency matrix is loaded from the HDF5 file.
        dataroot: root directory holding 'cache', 'imgids' and the
            bottom-up feature folders.
        adaptive: True when each image has a variable (10-100) number of
            regions addressed through ``pos_boxes``; False for fixed 36.
        pos_emb_dim: dimensionality of the positional embedding.
        nongt_dim: number of candidate regions kept per image.
    """

    def __init__(self, name, dictionary, relation_type, dataroot='data',
                 adaptive=False, pos_emb_dim=64, nongt_dim=36):
        super(VQAFeatureDataset, self).__init__()
        assert name in ['train', 'val', 'test-dev2015', 'test2015']
        # Answer vocabulary produced by preprocessing annotations.json.
        ans2label_path = os.path.join(dataroot, 'cache',
                                      'trainval_ans2label.pkl')
        label2ans_path = os.path.join(dataroot, 'cache',
                                      'trainval_label2ans.pkl')
        # Use context managers so the pickle file handles are closed
        # deterministically instead of leaking until GC.
        with open(ans2label_path, 'rb') as f:
            # answer word -> label index, e.g. {'net': 0, 'pitcher': 1, ...}
            self.ans2label = pickle.load(f)
        with open(label2ans_path, 'rb') as f:
            # label index -> answer word, e.g. ['net', 'pitcher', ...]
            self.label2ans = pickle.load(f)
        self.num_ans_candidates = len(self.ans2label)  # e.g. 3129 classes
        self.dictionary = dictionary
        self.relation_type = relation_type
        self.adaptive = adaptive  # variable (10-100) vs fixed (36) regions

        # Fixed-region file names carry a '36' suffix ('_36' for test splits).
        prefix = '36'
        if 'test' in name:
            prefix = '_36'
        # HDF5 feature directory depends on the feature layout.
        h5_dataroot = dataroot+"/Bottom-up-features-adaptive"\
            if self.adaptive else dataroot+"/Bottom-up-features-fixed"
        imgid_dataroot = dataroot+"/imgids"
        # image id -> row index into the feature tensors.
        imgid_path = os.path.join(
            imgid_dataroot,
            '%s%s_imgid2idx.pkl' % (name, '' if self.adaptive else prefix))
        with open(imgid_path, 'rb') as f:
            self.img_id2idx = pickle.load(f)

        h5_path = os.path.join(h5_dataroot, '%s%s.hdf5' %
                               (name, '' if self.adaptive else prefix))
        print('loading features from h5 file %s' % h5_path)
        with h5py.File(h5_path, 'r') as hf:
            self.features = np.array(hf.get('image_features'),
                                     dtype='float32')
            self.normalized_bb = np.array(hf.get('spatial_features'),
                                          dtype='float32')
            self.bb = np.array(hf.get('image_bb'), dtype='float32')
            print("hdf5数据加载成功!")
            # Only load the adjacency matrix matching the relation type;
            # the other one stays None and is replaced by a dummy tensor
            # in __getitem__.
            if "semantic_adj_matrix" in hf.keys() \
                    and self.relation_type == "semantic":
                self.semantic_adj_matrix = np.array(
                    hf.get('semantic_adj_matrix'))
                print("Loaded semantic adj matrix from file...",
                      self.semantic_adj_matrix.shape)
            else:
                self.semantic_adj_matrix = None
                print("Setting semantic adj matrix to None...")
            if "image_adj_matrix" in hf.keys() \
                    and self.relation_type == "spatial":
                self.spatial_adj_matrix = np.array(hf.get('image_adj_matrix'))
                print("Loaded spatial adj matrix from file...",
                      self.spatial_adj_matrix.shape)
            else:
                self.spatial_adj_matrix = None
                print("Setting spatial adj matrix to None...")
            self.pos_boxes = None
            if self.adaptive:
                # BUGFIX: pos_boxes holds [start, end) row offsets that are
                # used as tensor slice indices in __getitem__. Casting them
                # to float32 (as before) breaks slicing — float scalar
                # tensors have no integer __index__. Keep the stored
                # integer dtype.
                self.pos_boxes = np.array(hf.get('pos_boxes'))

        self.entries = _load_dataset(dataroot, name, self.img_id2idx,
                                     self.label2ans)
        self.tokenize()
        print("数据加载成功!")
        self.tensorize()
        self.nongt_dim = nongt_dim
        self.emb_dim = pos_emb_dim
        # Feature dims sit on axis 1 for the adaptive (flat) layout and on
        # axis 2 for the fixed [num_images, 36, dim] layout.
        self.v_dim = self.features.size(1 if self.adaptive else 2)
        self.s_dim = self.normalized_bb.size(1 if self.adaptive else 2)

    def tokenize(self, max_length=14):
        """Tokenizes the questions.

        Adds 'q_token' to every entry: exactly ``max_length`` word indices,
        truncated or right-padded with ``dictionary.padding_idx`` (which the
        embedding layer treats as padding).
        """
        for entry in self.entries:
            tokens = self.dictionary.tokenize(entry['question'], False)
            tokens = tokens[:max_length]
            if len(tokens) < max_length:
                # Note here we pad to the back of the sentence
                padding = [self.dictionary.padding_idx] * \
                    (max_length - len(tokens))
                tokens = tokens + padding
            utils.assert_eq(len(tokens), max_length)
            entry['q_token'] = tokens

    def tensorize(self):
        """Converts the numpy feature arrays and per-entry fields to
        torch tensors (adjacency matrices as double)."""
        self.features = torch.from_numpy(self.features)
        self.normalized_bb = torch.from_numpy(self.normalized_bb)
        self.bb = torch.from_numpy(self.bb)
        if self.semantic_adj_matrix is not None:
            self.semantic_adj_matrix = torch.from_numpy(
                self.semantic_adj_matrix).double()
        if self.spatial_adj_matrix is not None:
            self.spatial_adj_matrix = torch.from_numpy(
                self.spatial_adj_matrix).double()
        if self.pos_boxes is not None:
            self.pos_boxes = torch.from_numpy(self.pos_boxes)
        for entry in self.entries:
            entry['q_token'] = torch.from_numpy(np.array(entry['q_token']))
            answer = entry['answer']
            if answer is not None:
                labels = np.array(answer['labels'])
                scores = np.array(answer['scores'], dtype=np.float32)
                if len(labels):
                    entry['answer']['labels'] = torch.from_numpy(labels)
                    entry['answer']['scores'] = torch.from_numpy(scores)
                else:
                    # annotated question with no agreed answers
                    entry['answer']['labels'] = None
                    entry['answer']['scores'] = None

    def __getitem__(self, index):
        """Returns one (question, image) sample as a positional tuple.

        With an annotated answer the tuple is (features, normalized_bb,
        question, target, question_id, image_id, bb, spatial_adj_matrix,
        semantic_adj_matrix). Without one (test splits), question_id fills
        the target slot so the tuple arity stays constant for collation.
        """
        entry = self.entries[index]
        image_id = entry["image_id"]
        question = entry['q_token']
        question_id = entry['question_id']
        # A missing adjacency matrix is replaced by a dummy tensor so the
        # default collate function still works.
        if self.spatial_adj_matrix is not None:
            spatial_adj_matrix = self.spatial_adj_matrix[entry["image"]]
        else:
            spatial_adj_matrix = torch.zeros(1).double()
        if self.semantic_adj_matrix is not None:
            semantic_adj_matrix = self.semantic_adj_matrix[entry["image"]]
        else:
            semantic_adj_matrix = torch.zeros(1).double()
        if not self.adaptive:
            # fixed number of bounding boxes: entry['image'] is a row index
            features = self.features[entry['image']]
            normalized_bb = self.normalized_bb[entry['image']]
            bb = self.bb[entry["image"]]
        else:
            # variable region count: pos_boxes[i] = [start, end) row range
            start = self.pos_boxes[entry['image']][0]
            end = self.pos_boxes[entry['image']][1]
            features = self.features[start:end, :]
            normalized_bb = self.normalized_bb[start:end, :]
            bb = self.bb[start:end, :]
        answer = entry['answer']
        if answer is not None:
            labels = answer['labels']
            scores = answer['scores']
            # soft target distribution over all candidate answers
            target = torch.zeros(self.num_ans_candidates)
            if labels is not None:
                target.scatter_(0, labels, scores)
            return features, normalized_bb, question, target,\
                question_id, image_id, bb, spatial_adj_matrix,\
                semantic_adj_matrix
        else:
            return features, normalized_bb, question, question_id,\
                question_id, image_id, bb, spatial_adj_matrix,\
                semantic_adj_matrix

    def __len__(self):
        """Number of question entries."""
        return len(self.entries)
entries
entries是数据的条目,类型是list,共214354条数据,每条数据是一个字典。
每条数据如下:
键值 | 含义 | 值 |
---|---|---|
question_id | 问题id | 42000 |
image_id | 图像id | 42 |
image | 图像 | 37244 |
question | 问题文字表示 | ‘What color are the gym shoes?’ |
answer | 答案:label,score | {‘labels’: tensor([ 4, 1594], dtype=torch.int32), ‘scores’: tensor([1.0000, 0.3000])} |
q_token | 问题索引向量表示 | tensor([ 0, 10, 68, 11, 2618, 225, 19901, 19901, 19901, 19901,19901, 19901, 19901, 19901], dtype=torch.int32) |
2. 模型需要传入的getitem数据返回 :如果是固定36个区域
变量名 | 来源 |
---|---|
features | self.features[entry[‘image’]],此处的image是位置 |
normalized_bb | self.normalized_bb[entry[‘image’]] |
question | entry[‘q_token’] |
target | scatter_(0, labels, scores) |
question_id | entry[‘question_id’] |
image_id | entry[“image_id”] |
bb | self.bb[entry[“image”]] |
spatial_adj_matrix | self.spatial_adj_matrix[entry[“image”]] |
semantic_adj_matrix | self.semantic_adj_matrix[entry[“image”]] |