【数据分析】之ReGat的VQAFeaturesDataset加载

1. VQAFeatureDataset

此类是 ReGat 项目基于 torch 自带的 Dataset(from torch.utils.data import Dataset)实现的子类,负责模型运行时训练集和测试集的加载,加载出的数据即模型 forward 函数的参数。如下:
regat.forward():

    def forward(self, v, b, q, implicit_pos_emb, sem_adj_matrix,
                spa_adj_matrix, labels):
        """Run ReGAT end to end and produce answer logits.

        v: [batch, num_objs, obj_dim]
        b: [batch, num_objs, b_dim]
        q: [batch_size, seq_length]
        pos: [batch_size, num_objs, nongt_dim, emb_dim]
        sem_adj_matrix: [batch_size, num_objs, num_objs, num_edge_labels]
        spa_adj_matrix: [batch_size, num_objs, num_objs, num_edge_labels]
        return: logits, not probs
        """
        # Embed question words, encode the sequence, then summarize it
        # with self-attention.
        w_emb = self.w_emb(q)
        q_emb_seq = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        q_emb_self_att = self.q_att(q_emb_seq)

        # Select the relation-specific graph input: an explicit adjacency
        # matrix (semantic/spatial) or implicit position embeddings.
        if self.relation_type == "semantic":
            relation_input = sem_adj_matrix
        elif self.relation_type == "spatial":
            relation_input = spa_adj_matrix
        else:  # implicit
            relation_input = implicit_pos_emb
        # [batch_size, num_rois, out_dim]
        v_emb = self.v_relation.forward(v, relation_input, q_emb_self_att)

        # Fuse visual and question representations; each fusion variant
        # consumes a different question embedding.
        if self.fusion == "ban":
            joint_emb, att = self.joint_embedding(v_emb, q_emb_seq, b)
        elif self.fusion == "butd":
            q_emb = self.q_emb(w_emb)  # [batch, q_dim]
            joint_emb, att = self.joint_embedding(v_emb, q_emb)
        else:  # mutan
            joint_emb, att = self.joint_embedding(v_emb, q_emb_self_att)

        # Without a classifier head the joint embedding itself is returned.
        logits = self.classifier(joint_emb) if self.classifier else joint_emb
        return logits, att

VQAFeatureDataset

self中的变量

| 变量名 | 含义 | 来源 | 示例/值 |
| --- | --- | --- | --- |
| self.ans2label | 单词-索引映射:字典 | trainval_ans2label.pkl | {'net': 0, 'pitcher': 1, 'orange': 2, 'yes': 3, 'white': 4, …} |
| self.label2ans | 索引-单词映射:列表 | trainval_label2ans.pkl | ['net', 'pitcher', 'orange', 'yes', 'white', …] |
| self.num_ans_candidates | 候选答案单词数:int | len(self.ans2label) | 3129 |
| self.img_id2idx | 图像id-索引映射:字典 | imgid2idx.pkl | {218224: 0, 306670: 1, 208663: 2, 225177: 3, 467257: 4, …} |
| self.features | 图像特征:Tensor | hf.get('image_features') | tensor[40504, 36, 2048] |
| self.normalized_bb | 标准化区域边界框空间位置:Tensor | hf.get('spatial_features') | tensor[40504, 36, 4] |
| self.bb | 区域边界框位置:Tensor | hf.get('image_bb') | tensor[40504, 36, 4] |
| self.semantic_adj_matrix | 语义邻接矩阵 | 若键存在:hf.get('semantic_adj_matrix');否则为 None | — |
| self.spatial_adj_matrix | 空间邻接矩阵 | 若键存在:hf.get('image_adj_matrix');否则为 None | — |
| self.pos_boxes | 自适应模式下各图像区域的起止行;固定模式为 None | hf.get('pos_boxes') 或 None | None |
| self.entries | 数据条目:list | _load_dataset(dataroot, name, self.img_id2idx, self.label2ans) | 长度 214354 |
| self.nongt_dim | 参与关系注意力的对象数 | nongt_dim | 36 |
| self.emb_dim | 位置嵌入维度 | pos_emb_dim | 64 |
| self.v_dim | 图像特征嵌入维度 | self.features.size(1 if self.adaptive else 2) | 2048 |
| self.s_dim | 边界框空间特征维度 | self.normalized_bb.size(1 if self.adaptive else 2) | 6 |
class VQAFeatureDataset(Dataset):
    """VQA dataset serving precomputed bottom-up-attention image features.

    Loads the answer vocabulary from pickles, image features / bounding
    boxes (and, when present, semantic/spatial adjacency matrices) from an
    HDF5 file, builds the question/answer entries, then tokenizes and
    tensorizes them.  Each item matches the argument list of ReGAT's
    ``forward``.
    """

    def __init__(self, name, dictionary, relation_type, dataroot='data',
                 adaptive=False, pos_emb_dim=64, nongt_dim=36):
        """
        name: dataset split, one of train/val/test-dev2015/test2015
        dictionary: question-word dictionary (provides tokenize(), padding_idx)
        relation_type: 'semantic', 'spatial' or anything else (implicit)
        adaptive: True -> variable 10-100 regions per image; False -> fixed 36
        pos_emb_dim: dimensionality of the relative-position embedding
        nongt_dim: number of objects kept for relation attention
        """
        super(VQAFeatureDataset, self).__init__()
        assert name in ['train', 'val', 'test-dev2015', 'test2015']

        # Answer vocabulary preprocessed from annotations.json:
        # ans2label maps answer string -> class index; label2ans is the
        # inverse list (index -> answer string).
        ans2label_path = os.path.join(dataroot, 'cache',
                                      'trainval_ans2label.pkl')
        label2ans_path = os.path.join(dataroot, 'cache',
                                      'trainval_label2ans.pkl')
        # Context managers close the pickle files promptly (the previous
        # pickle.load(open(...)) calls leaked the file handles).
        with open(ans2label_path, 'rb') as f:
            self.ans2label = f and pickle.load(f)
        with open(label2ans_path, 'rb') as f:
            self.label2ans = pickle.load(f)

        self.num_ans_candidates = len(self.ans2label)  # e.g. 3129 classes
        self.dictionary = dictionary  # question-word dictionary
        self.relation_type = relation_type
        self.adaptive = adaptive  # adaptive 10-100 regions vs fixed 36
        # Fixed-region files carry a '36' suffix ('_36' for test splits).
        prefix = '36'
        if 'test' in name:
            prefix = '_36'
        # Directory layout differs for adaptive vs fixed features.
        h5_dataroot = dataroot+"/Bottom-up-features-adaptive"\
            if self.adaptive else dataroot+"/Bottom-up-features-fixed"
        imgid_dataroot = dataroot+"/imgids"
        # image_id -> row index of that image in the feature arrays.
        imgid2idx_path = os.path.join(
            imgid_dataroot,
            '%s%s_imgid2idx.pkl' % (name, '' if self.adaptive else prefix))
        with open(imgid2idx_path, 'rb') as f:
            self.img_id2idx = pickle.load(f)

        h5_path = os.path.join(h5_dataroot, '%s%s.hdf5' %
                               (name, '' if self.adaptive else prefix))

        print('loading features from h5 file %s' % h5_path)
        with h5py.File(h5_path, 'r') as hf:
            self.features = np.array(hf.get('image_features'),
                                     dtype='float32')
            self.normalized_bb = np.array(hf.get('spatial_features'),
                                          dtype='float32')
            self.bb = np.array(hf.get('image_bb'), dtype='float32')
            print("hdf5数据加载成功!")
            # Adjacency matrices are only loaded when both present in the
            # file AND needed by the requested relation type.
            if "semantic_adj_matrix" in hf.keys() \
               and self.relation_type == "semantic":
                self.semantic_adj_matrix = np.array(
                    hf.get('semantic_adj_matrix'))
                print("Loaded semantic adj matrix from file...",
                      self.semantic_adj_matrix.shape)
            else:
                self.semantic_adj_matrix = None
                print("Setting semantic adj matrix to None...")
            if "image_adj_matrix" in hf.keys()\
               and self.relation_type == "spatial":
                self.spatial_adj_matrix = np.array(hf.get('image_adj_matrix'))
                print("Loaded spatial adj matrix from file...",
                      self.spatial_adj_matrix.shape)
            else:
                self.spatial_adj_matrix = None
                print("Setting spatial adj matrix to None...")

            self.pos_boxes = None
            if self.adaptive:
                # Per-image [start, end) row range into the feature arrays.
                # Keep the native integer dtype: the previous
                # dtype='float32' cast broke the slice indexing in
                # __getitem__ (slices require integer bounds).
                self.pos_boxes = np.array(hf.get('pos_boxes'))
        self.entries = _load_dataset(dataroot, name, self.img_id2idx,
                                     self.label2ans)
        self.tokenize()
        print("数据加载成功!")
        self.tensorize()
        self.nongt_dim = nongt_dim
        self.emb_dim = pos_emb_dim
        # Read after tensorize(), so .size(dim) is the torch.Tensor API.
        self.v_dim = self.features.size(1 if self.adaptive else 2)
        self.s_dim = self.normalized_bb.size(1 if self.adaptive else 2)

    def tokenize(self, max_length=14):
        """Tokenizes the questions.

        This will add q_token in each entry of the dataset.
        -1 represent nil, and should be treated as padding_idx in embedding
        """
        for entry in self.entries:
            tokens = self.dictionary.tokenize(entry['question'], False)
            tokens = tokens[:max_length]
            if len(tokens) < max_length:
                # Note here we pad to the back of the sentence
                padding = [self.dictionary.padding_idx] * \
                          (max_length - len(tokens))
                tokens = tokens + padding
            utils.assert_eq(len(tokens), max_length)
            entry['q_token'] = tokens

    def tensorize(self):
        """Convert the loaded numpy arrays and per-entry fields to tensors."""
        self.features = torch.from_numpy(self.features)
        self.normalized_bb = torch.from_numpy(self.normalized_bb)
        self.bb = torch.from_numpy(self.bb)
        # Adjacency matrices are consumed as double precision downstream.
        if self.semantic_adj_matrix is not None:
            self.semantic_adj_matrix = torch.from_numpy(
                                        self.semantic_adj_matrix).double()
        if self.spatial_adj_matrix is not None:
            self.spatial_adj_matrix = torch.from_numpy(
                                        self.spatial_adj_matrix).double()
        if self.pos_boxes is not None:
            self.pos_boxes = torch.from_numpy(self.pos_boxes)

        for entry in self.entries:
            question = torch.from_numpy(np.array(entry['q_token']))
            entry['q_token'] = question

            answer = entry['answer']
            if answer is not None:
                labels = np.array(answer['labels'])
                scores = np.array(answer['scores'], dtype=np.float32)
                if len(labels):
                    entry['answer']['labels'] = torch.from_numpy(labels)
                    entry['answer']['scores'] = torch.from_numpy(scores)
                else:
                    # An answered entry with no labels gets explicit Nones
                    # so __getitem__ can skip the scatter.
                    entry['answer']['labels'] = None
                    entry['answer']['scores'] = None

    def __getitem__(self, index):
        """Return one sample; entry['image'] is the row index into the
        feature tensors (see img_id2idx), not the image id itself."""
        entry = self.entries[index]
        raw_question = entry["question"]
        image_id = entry["image_id"]

        question = entry['q_token']
        question_id = entry['question_id']
        # Missing adjacency matrices are replaced by dummy 1-element
        # tensors so the collate function always sees the same arity.
        if self.spatial_adj_matrix is not None:
            spatial_adj_matrix = self.spatial_adj_matrix[entry["image"]]
        else:
            spatial_adj_matrix = torch.zeros(1).double()
        if self.semantic_adj_matrix is not None:
            semantic_adj_matrix = self.semantic_adj_matrix[entry["image"]]
        else:
            semantic_adj_matrix = torch.zeros(1).double()
        if not self.adaptive:
            # fixed number of bounding boxes
            features = self.features[entry['image']]
            normalized_bb = self.normalized_bb[entry['image']]
            bb = self.bb[entry["image"]]
        else:
            # Adaptive mode: each image owns the row range
            # [pos_boxes[i][0], pos_boxes[i][1]) of the feature tensors.
            start = self.pos_boxes[entry['image']][0]
            end = self.pos_boxes[entry['image']][1]
            features = self.features[start:end, :]
            normalized_bb = self.normalized_bb[start:end, :]
            bb = self.bb[start:end, :]

        answer = entry['answer']
        if answer is not None:
            labels = answer['labels']
            scores = answer['scores']
            # Soft target over all candidate answers (VQA-style scores).
            target = torch.zeros(self.num_ans_candidates)
            if labels is not None:
                target.scatter_(0, labels, scores)
            return features, normalized_bb, question, target,\
                question_id, image_id, bb, spatial_adj_matrix,\
                semantic_adj_matrix

        else:
            # NOTE(review): question_id appears twice so this branch keeps
            # the same tuple arity as the answered branch (no target is
            # available for test splits) -- presumably intentional; confirm
            # against the evaluation caller.
            return features, normalized_bb, question, question_id,\
                question_id, image_id, bb, spatial_adj_matrix,\
                semantic_adj_matrix

    def __len__(self):
        return len(self.entries)

entries
entries是数据的条目,类型是list,共214354条数据,每条数据是一个字典。
每条数据如下:

| 键 | 含义 | 示例值 |
| --- | --- | --- |
| question_id | 问题 id | 42000 |
| image_id | 图像 id | 42 |
| image | 图像在特征张量中的行索引 | 37244 |
| question | 问题的文字表示 | 'What color are the gym shoes?' |
| answer | 答案:labels 与 scores | {'labels': tensor([4, 1594], dtype=torch.int32), 'scores': tensor([1.0000, 0.3000])} |
| q_token | 问题的索引向量表示 | tensor([0, 10, 68, 11, 2618, 225, 19901, 19901, 19901, 19901, 19901, 19901, 19901, 19901], dtype=torch.int32) |

2. 模型需要传入的getitem数据返回 :如果是固定36个区域

| 变量名 | 来源 |
| --- | --- |
| features | self.features[entry['image']],此处的 image 是行索引位置 |
| normalized_bb | self.normalized_bb[entry['image']] |
| question | entry['q_token'] |
| target | torch.zeros(...).scatter_(0, labels, scores) |
| question_id | entry['question_id'] |
| image_id | entry['image_id'] |
| bb | self.bb[entry['image']] |
| spatial_adj_matrix | self.spatial_adj_matrix[entry['image']] |
| semantic_adj_matrix | self.semantic_adj_matrix[entry['image']] |
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值