rpn.py

最新推荐文章于 2022-01-25 14:22:22 发布

bestrivern

最新推荐文章于 2022-01-25 14:22:22 发布

阅读量267

点赞数

分类专栏： Faster-RCNN

本文链接：https://blog.csdn.net/bestrivern/article/details/89298931

版权

Faster-RCNN 专栏收录该内容

9 篇文章 1 订阅

订阅专栏

这部分是region proposal network，首先网络结构如下图所示：

首先对于feature map先做一个3*3的卷积，然后分为两段：

第一段对于bg/fg做一个二分类的softmax回归，对于每一个anchor得到一个分类的score。第一次reshape是为了将用于softmax回归的bg和fg这两个维度单独分离出来，即由[batch_size,2*9,H,W]变为[batch_size,2,9*H,W]；第二次reshape则将维度恢复原状。

第二段是对于每一个anchor的四个坐标值，通过一个1*1的卷积层，得到一个offset的值：[dx,dy,dw,dh]，用于后续的平移和尺度缩放变换。

详细解释参见下述代码和代码的注释：

class _RPN(nn.Module):
    """Region proposal network (RPN) head.

    A 3x3 convolution is applied to the incoming feature map, followed by two
    sibling 1x1 convolutions: one producing 2 scores per anchor (bg/fg) and
    one producing 4 box offsets per anchor. The scores and offsets are handed
    to the proposal layer to produce RoIs; in training mode the anchor target
    layer is additionally used to build the classification and box
    regression losses.
    """

    def __init__(self, din):
        """Build the RPN layers.

        Args:
            din: number of channels of the input feature map.
        """
        super(_RPN, self).__init__()

        self.din = din  # depth (channel count) of the input feature map
        self.anchor_scales = cfg.ANCHOR_SCALES
        self.anchor_ratios = cfg.ANCHOR_RATIOS
        # First entry of cfg.FEAT_STRIDE; presumably the downsampling factor
        # between the input image and the feature map -- TODO confirm.
        self.feat_stride = cfg.FEAT_STRIDE[0]

        # Shared 3x3 convolution over the input feature map:
        # in_channels=din, out_channels=512, kernel=3, stride=1, padding=1
        # (so the spatial size is preserved).
        self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True)

        # k = number of anchors per location = len(scales) * len(ratios)
        # (e.g. 3 * 3 = 9 when three scales and three ratios are configured).
        num_anchors = len(self.anchor_scales) * len(self.anchor_ratios)

        # (cls layer) bg/fg classification scores: 2 * k output channels.
        self.nc_score_out = num_anchors * 2
        self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0)

        # (reg layer) anchor box offsets: 4 * k output channels.
        self.nc_bbox_out = num_anchors * 4
        self.RPN_bbox_pred = nn.Conv2d(512, self.nc_bbox_out, 1, 1, 0)

        # Proposal layer: turns scores + offsets into RoIs.
        self.RPN_proposal = _ProposalLayer(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        # Anchor target layer: produces training targets for the anchors.
        self.RPN_anchor_target = _AnchorTargetLayer(self.feat_stride, self.anchor_scales, self.anchor_ratios)

        # Losses start at zero and are recomputed on every forward pass.
        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0

    @staticmethod
    def reshape(x, d):
        """Reshape a 4-D tensor so that its second dimension has size ``d``.

        A tensor of shape ``[N, C, H, W]`` becomes ``[N, d, C * H / d, W]``.
        With ``d=2`` this isolates the bg/fg pair on dim 1 so that softmax can
        be applied over it; calling it again with the original channel count
        restores the original shape.

        This must be a static method: ``forward`` invokes it as
        ``self.reshape(tensor, d)`` and the signature has no ``self``.

        Args:
            x: 4-D tensor.
            d: desired size of dimension 1.

        Returns:
            A view of ``x`` with shape ``[N, d, C * H / d, W]``.
        """
        shape = x.size()
        channels = int(d)
        return x.view(
            shape[0],
            channels,
            (shape[1] * shape[2]) // channels,
            shape[3],
        )

    def forward(self, base_feat, im_info, gt_boxes, num_boxes):
        """Run the RPN on a feature map.

        Args:
            base_feat: feature map produced by the backbone conv layers.
            im_info: image information, passed through unchanged to the
                proposal and anchor target layers.
            gt_boxes: ground-truth boxes; must not be None in training mode.
            num_boxes: passed through to the anchor target layer.

        Returns:
            Tuple ``(rois, rpn_loss_cls, rpn_loss_box)``. Both losses are 0
            when the module is not in training mode.
        """
        batch_size = base_feat.size(0)

        # Shared 3x3 convolution + ReLU on the feature map.
        rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)

        # Raw bg/fg classification scores, 2 * k channels.
        rpn_cls_score = self.RPN_cls_score(rpn_conv1)

        # Fold to 2 channels on dim 1, softmax over that dim, then restore
        # the original 2 * k channel layout.
        rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
        rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
        rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

        # Box offsets for every anchor, computed from the same 3x3 conv output.
        rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

        # Select which configuration section the proposal layer should use.
        cfg_key = 'TRAIN' if self.training else 'TEST'

        # The proposal layer receives detached data (.data): fg/bg
        # probabilities and box offsets, plus image info and the config key.
        rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key))

        self.rpn_loss_cls = 0
        self.rpn_loss_box = 0

        # Generate training labels and build the RPN losses.
        if self.training:
            assert gt_boxes is not None

            # rpn_data[0] holds the labels; rpn_data[1:] holds the bbox
            # targets, inside weights and outside weights.
            rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes))

            # Classification loss: move the 2-way score dim last and flatten
            # to one (bg, fg) score pair per anchor.
            rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
            rpn_label = rpn_data[0].view(batch_size, -1)

            # Keep only the anchors whose label is not -1.
            rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
            rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
            rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
            rpn_label = Variable(rpn_label.long())
            self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)

            rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

            # Box regression loss.
            rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
            rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
            rpn_bbox_targets = Variable(rpn_bbox_targets)

            self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
                                                rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3])

        return rois, self.rpn_loss_cls, self.rpn_loss_box