A detailed reading of the code of MUTAN: Multimodal Tucker Fusion for Visual Question Answering (beginner-friendly, with two diagrams to understand the Tucker fusion code)

Getting-started tips

1. PyCharm shortcuts (for reference): they make it fast to trace function and parameter call relationships across multiple files
ctrl+f: search for a keyword in the currently open file
ctrl+shift+f: search for a keyword across all files in the project
ctrl+n: find .py files by name
ctrl+shift+n: find files of any type by name
Find Usages (select a function, then right-click): locate every place a function or parameter is used.
2. Common PyTorch functions
PyTorch Chinese API docs: https://pytorch-cn.readthedocs.io/zh/latest/package_references/torch-nn/
That API reference is not complete either; when it falls short, just look up each function as you run into it, and after enough lookups they become second nature (fine, it's really because I'm lazy and haven't written a summary yet /(ㄒoㄒ)/~~). A short demo of a few tensor operations that appear repeatedly in this post follows the links below.
PyTorch Chinese tutorial (you can also refer to the PDF shared in my previous post): http://pytorch123.com/SecondSection/training_a_classifier/
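
The following is a minimal, self-contained demo of a few tensor operations that the MUTAN code below relies on (torch.mul, torch.stack, Tensor.sum, view, transpose). The shapes are made up purely for illustration and are not taken from the repository config.

import torch

# element-wise (Hadamard) product, as used to fuse the visual and question vectors
a = torch.randn(4, 510)
b = torch.randn(4, 510)
prod = torch.mul(a, b)                 # same shape as the inputs: (4, 510)

# stack R tensors along a new dimension, then sum over it, as in the rank-R fusion
parts = [torch.randn(4, 510) for _ in range(5)]
stacked = torch.stack(parts, dim=1)    # (4, 5, 510)
summed = stacked.sum(1)                # (4, 510)

# view/transpose, as used to turn a conv feature map into a list of region vectors
x = torch.randn(4, 310, 14, 14)
x = x.view(4, 310, 14 * 14)            # (4, 310, 196)
x = x.transpose(1, 2)                  # (4, 196, 310)

print(prod.shape, summed.shape, x.shape)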

Reference links for the paper:
Paper: https://arxiv.org/abs/1705.06676
官方代码:https://github.com/Cadene/vqa.pytorch
Paper analysis: https://blog.csdn.net/snow_maple521/article/details/108640202

Analysis of the paper and the code:

[Figure 1: overall processing pipeline of the model (attention in att.py, Tucker fusion in fusion.py)]
The processing pipeline in the figure is the core of the whole model and lives mainly in vqa/models/att.py, with the Tucker fusion itself in vqa/models/fusion.py. It is quite intuitive; an annotated walkthrough of both files is attached below (very detailed, down to the tensor shape changes), and a toy sketch of the forward pipeline follows the mind-map link. The README of the official code explains the setup in detail; the dataset used is the COCO images, which take up a lot of space. The figure below is a mind map explaining the official code **(using the training set as the example, with VQA 2.0, no Visual Genome, and the MutanAtt model)**.
[Figure 2: mind map of the official code (training set, VQA 2.0, no Visual Genome, MutanAtt model)]
Original xmind file (some node notes do not show up in the image above, so downloading it is recommended):
Link: https://pan.baidu.com/s/1IocgwpWEeT2ERIvLZm2TrA
Extraction code: ewq6
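
Before diving into the files, here is a runnable toy sketch of what one MutanAtt forward pass does, condensed from the att.py code listed later in this post. The dimensions (2048-d visual features on a 14x14 grid, a 2400-d question vector, 2 glimpses, a 510-d joint space) follow the comments in the walkthrough; the number of answer classes and the plain element-wise product standing in for the Mutan fusion are my own simplifications.

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, dim_v, dim_q, h, w = 8, 2048, 2400, 14, 14
dim_mm, nb_glimpses, num_classes = 510, 2, 2000   # num_classes is hypothetical

input_v = torch.randn(batch, dim_v, h, w)   # ResNet-152 feature map
x_q_vec = torch.randn(batch, dim_q)         # skip-thoughts question vector

# 1) attention: fuse every image region with the question and get nb_glimpses attention maps
att_logits = torch.randn(batch, nb_glimpses, h * w)            # stands in for conv_att(fusion(x_v, x_q))
att = F.softmax(att_logits, dim=2)                             # one distribution over the 196 regions per glimpse
regions = input_v.view(batch, dim_v, h * w).transpose(1, 2)    # (batch, 196, 2048)
glimpses = [(att[:, g].unsqueeze(2) * regions).sum(1)          # attention-weighted sum of regions
            for g in range(nb_glimpses)]                       # each glimpse: (batch, 2048)

# 2) glimpse fusion: project each glimpse, concatenate, fuse with the question again
x_v = torch.cat([nn.Linear(dim_v, dim_mm // nb_glimpses)(g) for g in glimpses], dim=1)  # (batch, 510)
x_q = nn.Linear(dim_q, dim_mm)(x_q_vec)                                                 # (batch, 510)
x = x_v * x_q           # element-wise product standing in for MutanFusion

# 3) classification over the answer vocabulary
scores = nn.Linear(dim_mm, num_classes)(x)
print(scores.shape)     # torch.Size([8, 2000])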

If the Tucker decomposition is stripped out, only the following key code remains (an attention module with a plain element-wise fusion); a quick shape check follows the class:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import weight_norm


class NewAttention(nn.Module):
    def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
        super(NewAttention, self).__init__()

        self.dropout = nn.Dropout(dropout)
        self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
        self.linear_q_att = nn.Linear(q_dim, num_hid)   # e.g. 1024 -> 1024
        self.linear_v_att = nn.Linear(v_dim, num_hid)   # e.g. 2048 -> 1024

    def forward(self, v, q):
        """
        Inputs:
        v: [512, 36, 2048]  (36 region features per image)
        q: [512, 1024]      (one question vector per image)
        Output:
        x: [512, 36, 1]     (one attention score per region)
        """
        batch, k, _ = v.size()
        v = self.linear_v_att(v)                                # [batch, k, num_hid]
        q = self.linear_q_att(q).unsqueeze(1).repeat(1, k, 1)   # [batch, k, num_hid]
        joint_repr = v * q                                      # element-wise fusion
        x_att = self.dropout(joint_repr)
        x_att = F.softmax(x_att, dim=1)                         # softmax over the k regions

        x_v_att = torch.mul(x_att, v)   # (batch, 36, num_hid)
        x_v_att = self.linear(x_v_att)  # (batch, 36, 1)

        return x_v_att
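
A quick shape check of the class above; the batch size and dimensions are the hypothetical ones from the docstring, not values fixed anywhere in the repository.

import torch

att = NewAttention(v_dim=2048, q_dim=1024, num_hid=1024)
v = torch.randn(512, 36, 2048)   # 36 region features per image
q = torch.randn(512, 1024)       # one question vector per image
out = att(v, q)
print(out.shape)                 # expected: torch.Size([512, 36, 1])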

Code of vqa/models/att.py:

import torch
import torch.nn as nn
import torch.nn.functional as F

import copy

from vqa.lib import utils
from vqa.models import seq2vec
from vqa.models import fusion

class AbstractAtt(nn.Module):

    def __init__(self, opt={}, vocab_words=[], vocab_answers=[]):
        super(AbstractAtt, self).__init__()
        self.opt = opt
        self.vocab_words = vocab_words
        self.vocab_answers = vocab_answers
        self.num_classes = len(self.vocab_answers)
        # Modules
        self.seq2vec = seq2vec.factory(self.vocab_words, self.opt['seq2vec'])  # seq2vec = skipthoughts

        # Modules for attention
        self.conv_v_att = nn.Conv2d(self.opt['dim_v'],                     # Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
                                    self.opt['attention']['dim_v'], 1, 1)  # (2048, 2048, 1, 1)
        self.linear_q_att = nn.Linear(self.opt['dim_q'],
                                      self.opt['attention']['dim_q'])  # (2400,2048, 310)
        self.conv_att = nn.Conv2d(self.opt['attention']['dim_mm'],
                                  self.opt['attention']['nb_glimpses'], 1, 1)  # (510,2,1,1)
        # Modules for classification
        self.list_linear_v_fusion = None
        self.linear_q_fusion = None
        self.linear_classif = None

    def _fusion_att(self, x_v, x_q):
        raise NotImplementedError

    def _fusion_classif(self, x_v, x_q):
        raise NotImplementedError

    # the attention function
    def _attention(self, input_v, x_q_vec):
        batch_size = input_v.size(0)
        width = input_v.size(2)
        height = input_v.size(3)

        # Process the visual features before fusion
        # x_v = input_v.view(batch_size*width*height, dim_features)
        x_v = input_v
        x_v = F.dropout(x_v,
                        p=self.opt['attention']['dropout_v'],
                        training=self.training)
        x_v = self.conv_v_att(x_v)
        if 'activation_v' in self.opt['attention']:
            x_v = getattr(F, self.opt['attention']['activation_v'])(x_v)
        x_v = x_v.view(batch_size,
                       self.opt['attention']['dim_v'],
                       width * height)  # (batch_size, 2048, width*height)
        x_v = x_v.transpose(1,2)  # (batch_size, width*height, 2048)

        # Process question before fusion
        x_q = F.dropout(x_q_vec, p=self.opt['attention']['dropout_q'],
                           training=self.training)
        x_q = self.linear_q_att(x_q)  # 2400
        if 'activation_q' in self.opt['attention']:
            x_q = getattr(F, self.opt['attention']['activation_q'])(x_q)
        x_q = x_q.view(batch_size,
                       1,
                       self.opt['attention']['dim_q'])  # x_q = (batchsize, 1, 2400)
        x_q = x_q.expand(batch_size,
                         width * height,
                         self.opt['attention']['dim_q'])  # x_q = (batch_size, width*height, 2400), expanded to match x_v

        # First multimodal fusion
        # uses MutanFusion2d() from fusion.py
        x_att = self._fusion_att(x_v, x_q)

        if 'activation_mm' in self.opt['attention']:
            x_att = getattr(F, self.opt['attention']['activation_mm'])(x_att)

        # Process the attention vectors
        x_att = F.dropout(x_att,
                          p=self.opt['attention']['dropout_mm'],
                          training=self.training)
        # could be optimized to avoid two views and two transposes
        x_att = x_att.view(batch_size,
                           width,
                           height,
                           self.opt['attention']['dim_mm'])  # (batchsize,14,14,510)
        x_att = x_att.transpose(2,3).transpose(1,2)  # (batchsize, 510, 14, 14)

        # input:  [batch_size, channels=510, height=14, width=14]
        # conv:   [in_channels=510, out_channels=2, 1x1 kernel]
        # output: [batch_size, 2, 14, 14]
        x_att = self.conv_att(x_att)
        x_att = x_att.view(batch_size,
                           self.opt['attention']['nb_glimpses'],
                           width * height) # (batchsize, 2, 14*14)
        list_att_split = torch.split(x_att, 1, dim=1)  # nb_glimpses tensors, each (batch_size, 1, 14*14)
        list_att = []
        for x_att in list_att_split:
            x_att = x_att.contiguous()
            x_att = x_att.view(batch_size, width*height)
            x_att = F.softmax(x_att, dim=1)  # softmax over the width*height regions
            list_att.append(x_att)

        self.list_att = [x_att.data for x_att in list_att]

        # Apply the attention vectors to input_v
        x_v = input_v.view(batch_size, self.opt['dim_v'], width * height)
        x_v = x_v.transpose(1,2)

        list_v_att = []
        for i, x_att in enumerate(list_att):
            x_att = x_att.view(batch_size,
                               width * height,
                               1)
            x_att = x_att.expand(batch_size,
                                 width * height,
                                 self.opt['dim_v'])
            # x_att: (batch_size, 14*14, 2048)
            # x_v:   (batch_size, 14*14, 2048)
            x_v_att = torch.mul(x_att, x_v)   # element-wise product
            x_v_att = x_v_att.sum(1)          # weighted sum over the 14*14 regions
            x_v_att = x_v_att.view(batch_size, self.opt['dim_v'])  # (batch_size, 2048)
            list_v_att.append(x_v_att)

        return list_v_att

    def _fusion_glimpses(self, list_v_att, x_q_vec):
        # Process the visual features for each glimpse
        # list_v_att holds nb_glimpses tensors of shape (batch_size, self.opt['dim_v']); the image side is 2-D here
        list_v = []
        for glimpse_id, x_v_att in enumerate(list_v_att):
            x_v = F.dropout(x_v_att,
                            p=self.opt['fusion']['dropout_v'],
                            training=self.training)
            x_v = self.list_linear_v_fusion[glimpse_id](x_v)  # 2048 -> 155
            if 'activation_v' in self.opt['fusion']:
                x_v = getattr(F, self.opt['fusion']['activation_v'])(x_v)
            list_v.append(x_v)
        x_v = torch.cat(list_v, 1)   # 155 -> 310 after concatenating the glimpses

        # Process the question: still the original 2-D question vector
        x_q = F.dropout(x_q_vec,
                        p=self.opt['fusion']['dropout_q'],
                        training=self.training)
        x_q = self.linear_q_fusion(x_q)
        if 'activation_q' in self.opt['fusion']:
            x_q = getattr(F, self.opt['fusion']['activation_q'])(x_q)

        # Second multimodal fusion: uses MutanFusion
        x = self._fusion_classif(x_v, x_q)
        return x

    def _classif(self, x):

        if 'activation' in self.opt['classif']:
            x = getattr(F, self.opt['classif']['activation'])(x)
        x = F.dropout(x,
                      p=self.opt['classif']['dropout'],
                      training=self.training)
        x = self.linear_classif(x)
        return x

    def forward(self, input_v, input_q):
        # everything in att.py starts from this (input_v, input_q) pair
        # input_v is 4-D: (batch_size, dim_features, width, height); input_q is 2-D
        if input_v.dim() != 4 and input_q.dim() != 2:
            raise ValueError
        x_q_vec = self.seq2vec(input_q)  # 2-D question embedding

        list_v_att = self._attention(input_v, x_q_vec)  # attended visual features
        x = self._fusion_glimpses(list_v_att, x_q_vec)  # fusion
        x = self._classif(x)    # classification
        return x


class MLBAtt(AbstractAtt):

    def __init__(self, opt={}, vocab_words=[], vocab_answers=[]):
        # TODO: deep copy ?
        opt['attention']['dim_v']  = opt['attention']['dim_h']
        opt['attention']['dim_q']  = opt['attention']['dim_h']
        opt['attention']['dim_mm'] = opt['attention']['dim_h']
        super(MLBAtt, self).__init__(opt, vocab_words, vocab_answers)
        # Modules for classification
        self.list_linear_v_fusion = nn.ModuleList([
            nn.Linear(self.opt['dim_v'],
                      self.opt['fusion']['dim_h'])
            for i in range(self.opt['attention']['nb_glimpses'])])
        self.linear_q_fusion = nn.Linear(self.opt['dim_q'],
                                         self.opt['fusion']['dim_h']
                                         * self.opt['attention']['nb_glimpses'])
        self.linear_classif = nn.Linear(self.opt['fusion']['dim_h']
                                        * self.opt['attention']['nb_glimpses'],
                                        self.num_classes)

    def _fusion_att(self, x_v, x_q):
        x_att = torch.mul(x_v, x_q)
        return x_att

    def _fusion_classif(self, x_v, x_q):
        x_mm = torch.mul(x_v, x_q)
        return x_mm

class MutanAtt(AbstractAtt):

    def __init__(self, opt={}, vocab_words=[], vocab_answers=[]):
        # TODO: deep copy ?
        opt['attention']['dim_v'] = opt['attention']['dim_hv']
        opt['attention']['dim_q'] = opt['attention']['dim_hq']
        super(MutanAtt, self).__init__(opt, vocab_words, vocab_answers)
        # Modules for classification
        self.fusion_att = fusion.MutanFusion2d(self.opt['attention'],
                                               visual_embedding=False,    # x_v = (batch_size, width*height, 2048)
                                               question_embedding=False)  # x_q = (batch_size, width*height, 2400)
        self.list_linear_v_fusion = nn.ModuleList([
            nn.Linear(self.opt['dim_v'],
                      int(self.opt['fusion']['dim_hv']
                          / opt['attention']['nb_glimpses']))  # 2048->155
            for i in range(self.opt['attention']['nb_glimpses'])])
        self.linear_q_fusion = nn.Linear(self.opt['dim_q'],
                                         self.opt['fusion']['dim_hq'])  # 310-->510
        self.linear_classif = nn.Linear(self.opt['fusion']['dim_mm'],
                                        self.num_classes)               # 510 -> number of answer classes
        self.fusion_classif = fusion.MutanFusion(self.opt['fusion'],
                                                 visual_embedding=False,
                                                 question_embedding=False)

    def _fusion_att(self, x_v, x_q):
        return self.fusion_att(x_v, x_q)  # MutanFusion2d, the Tucker fusion

    def _fusion_classif(self, x_v, x_q):
        return self.fusion_classif(x_v, x_q)  # MutanFusion
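
One pattern used throughout att.py and fusion.py is choosing the activation by name from the options dict via getattr(F, ...). A minimal illustration of the pattern, with a hypothetical options dict (the real values come from the repository's YAML config):

import torch
import torch.nn.functional as F

opt = {'activation_v': 'tanh'}   # hypothetical; in the repo this comes from the YAML options

x = torch.randn(4, 310)
if 'activation_v' in opt:
    x = getattr(F, opt['activation_v'])(x)   # equivalent to F.tanh(x)

print(x.abs().max().item() <= 1.0)   # tanh output lies in [-1, 1]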

Code of vqa/models/fusion.py:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class AbstractFusion(nn.Module):

    def __init__(self, opt={}):
        super(AbstractFusion, self).__init__()
        self.opt = opt

    def forward(self, input_v, input_q):
        raise NotImplementedError


class MLBFusion(AbstractFusion):

    def __init__(self, opt):
        super(MLBFusion, self).__init__(opt)
        # Modules
        if 'dim_v' in self.opt:
            self.linear_v = nn.Linear(self.opt['dim_v'], self.opt['dim_h'])
        else:
            print('Warning fusion.py: no visual embedding before fusion')

        if 'dim_q' in self.opt:
            self.linear_q = nn.Linear(self.opt['dim_q'], self.opt['dim_h'])
        else:
            print('Warning fusion.py: no question embedding before fusion')
        
    def forward(self, input_v, input_q):
        # visual (cnn features)
        if 'dim_v' in self.opt:
            x_v = F.dropout(input_v, p=self.opt['dropout_v'], training=self.training)
            x_v = self.linear_v(x_v)
            if 'activation_v' in self.opt:
                x_v = getattr(F, self.opt['activation_v'])(x_v)
        else:
            x_v = input_v
        # question (rnn features)
        if 'dim_q' in self.opt:
            x_q = F.dropout(input_q, p=self.opt['dropout_q'], training=self.training)
            x_q = self.linear_q(x_q)
            if 'activation_q' in self.opt:
                x_q = getattr(F, self.opt['activation_q'])(x_q)
        else:
            x_q = input_q
        # hadamard product
        x_mm = torch.mul(x_q, x_v)
        return x_mm


class MutanFusion(AbstractFusion):
    ####################################################################
    # v = ResNet-152 features ------v
    #                         MutanFusion --(attention)--> y,   softmax(y) == out
    # q = GRU question encoding ----^
    # Mutan's main job is to fuse v and q.
    ####################################################################
    def __init__(self, opt, visual_embedding=True, question_embedding=True):
        super(MutanFusion, self).__init__(opt)
        self.visual_embedding = visual_embedding
        self.question_embedding = question_embedding
        # Modules
        if self.visual_embedding:
            self.linear_v = nn.Linear(self.opt['dim_v'], self.opt['dim_hv'])  # 2048-->310
        else:
            print('Warning fusion.py: no visual embedding before fusion')

        if self.question_embedding:
            self.linear_q = nn.Linear(self.opt['dim_q'], self.opt['dim_hq'])  # 2400-->310
        else:
            print('Warning fusion.py: no question embedding before fusion')
        
        self.list_linear_hv = nn.ModuleList([
            nn.Linear(self.opt['dim_hv'], self.opt['dim_mm'])
            for i in range(self.opt['R'])])

        self.list_linear_hq = nn.ModuleList([
            nn.Linear(self.opt['dim_hq'], self.opt['dim_mm'])  # Linear(310, 510)
            for i in range(self.opt['R'])])

    def forward(self, input_v, input_q):
        # input_v and input_q are both 2-D here: (batch_size, dim); image and question enter with the same number of dims
        if input_v.dim() != input_q.dim() and input_v.dim() != 2:
            raise ValueError
        batch_size = input_v.size(0)

        # Embed the image and the question separately:
        # dropout -> linear (2048 for v / 2400 for q -> 310) -> tanh
        if self.visual_embedding:
            x_v = F.dropout(input_v, p=self.opt['dropout_v'], training=self.training)
            x_v = self.linear_v(x_v)
            if 'activation_v' in self.opt:
                    x_v = getattr(F, self.opt['activation_v'])(x_v)
        else:
            x_v = input_v

        if self.question_embedding:
            x_q = F.dropout(input_q, p=self.opt['dropout_q'], training=self.training)
            x_q = self.linear_q(x_q)
            if 'activation_q' in self.opt:
                    x_q = getattr(F, self.opt['activation_q'])(x_q)
        else:
            x_q = input_q

        # Rank-R constraint: in the paper, Z is expressed as the sum of R terms Z_r (Z is then projected onto the prediction space y).
        # The projected image and question vectors are multiplied element-wise,
        # and the R terms are stacked and summed; the resulting x_mm corresponds to Z in the paper.
        x_mm = []
        for i in range(self.opt['R']):  # R independent projections, collected into x_mm

            # Project the image and the question once for each rank r:
            # dropout -> linear (310 -> 510) -> tanh

            x_hv = F.dropout(x_v, p=self.opt['dropout_hv'], training=self.training)
            x_hv = self.list_linear_hv[i](x_hv)  # size becomes 510 after the linear layer
            if 'activation_hv' in self.opt: # tanh
                x_hv = getattr(F, self.opt['activation_hv'])(x_hv)

            x_hq = F.dropout(x_q, p=self.opt['dropout_hq'], training=self.training)
            x_hq = self.list_linear_hq[i](x_hq)
            if 'activation_hq' in self.opt:
                x_hq = getattr(F, self.opt['activation_hq'])(x_hq)

            x_mm.append(torch.mul(x_hq, x_hv))  # fuse by element-wise multiplication; the size stays the same, but there are R such terms

        # x_mm is a list of R tensors, each of shape (batch_size, 510)
        x_mm = torch.stack(x_mm, dim=1)  # stack the R terms along dim 1 -> (batch_size, R, 510)
        x_mm = x_mm.sum(1).view(batch_size, self.opt['dim_mm'])  # sum over dim 1, back to (batch_size, 510)

        if 'activation_mm' in self.opt:
            x_mm = getattr(F, self.opt['activation_mm'])(x_mm)   # activation_mm = softmax

        # this is the output of the fusion module, used to predict the answer.
        return x_mm

class MutanFusion2d(MutanFusion):

    def __init__(self, opt, visual_embedding=True, question_embedding=True):
        super(MutanFusion2d, self).__init__(opt,
                                            visual_embedding,
                                            question_embedding)

    def forward(self, input_v, input_q):
        if input_v.dim() != input_q.dim() and input_v.dim() != 3:  # the inputs here are 3-D
            raise ValueError
        batch_size = input_v.size(0)
        weight_height = input_v.size(1)
        dim_hv = input_v.size(2)
        dim_hq = input_q.size(2)
        if not input_v.is_contiguous():
            input_v = input_v.contiguous()
        if not input_q.is_contiguous():
            input_q = input_q.contiguous()
        # flatten the 3-D inputs (batch, regions, dim) into 2-D (batch*regions, dim)
        x_v = input_v.view(batch_size * weight_height, self.opt['dim_hv'])
        x_q = input_q.view(batch_size * weight_height, self.opt['dim_hq'])
        # MutanFusion
        x_mm = super().forward(x_v, x_q)
        # reshape back to 3-D
        x_mm = x_mm.view(batch_size, weight_height, self.opt['dim_mm'])
        return x_mm
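
To see the rank-R idea in isolation, here is a minimal, self-contained sketch of the core computation of MutanFusion. The sizes (310, 510) and R = 5 are illustrative only, not the repository's actual config values.

import torch
import torch.nn as nn

batch, dim_hv, dim_hq, dim_mm, R = 8, 310, 310, 510, 5   # illustrative sizes

x_v = torch.randn(batch, dim_hv)   # projected visual vector
x_q = torch.randn(batch, dim_hq)   # projected question vector

linear_hv = nn.ModuleList([nn.Linear(dim_hv, dim_mm) for _ in range(R)])
linear_hq = nn.ModuleList([nn.Linear(dim_hq, dim_mm) for _ in range(R)])

# Z = sum over r of tanh(W_v^r x_v) * tanh(W_q^r x_q)  (element-wise product per rank)
terms = []
for r in range(R):
    x_hv = torch.tanh(linear_hv[r](x_v))
    x_hq = torch.tanh(linear_hq[r](x_q))
    terms.append(x_hv * x_hq)

x_mm = torch.stack(terms, dim=1).sum(1)   # (batch, dim_mm)
print(x_mm.shape)                         # torch.Size([8, 510])

MutanFusion2d then applies exactly this computation to every image region by flattening (batch, regions, dim) into (batch*regions, dim), fusing, and reshaping the result back to 3-D.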

If you spot mistakes, corrections in the comments are welcome.
