faster-rcnn-pytorch重要源码记录->proposal_layer.py

最新推荐文章于 2024-01-13 18:20:16 发布

sakurasakura1996

最新推荐文章于 2024-01-13 18:20:16 发布

阅读量353

点赞数 1

分类专栏：目标检测、计算机视觉学习记录

本文链接：https://blog.csdn.net/sakurasakura1996/article/details/104902512

版权

计算机视觉学习记录同时被 2 个专栏收录

11 篇文章 0 订阅

订阅专栏

目标检测

5 篇文章 0 订阅

订阅专栏

本文详细记录了Faster R-CNN在PyTorch实现中，用于生成候选区域的Proposal Layer的源码分析，探讨了其实现原理及其在目标检测中的作用。

摘要由CSDN通过智能技术生成

	from __future__ import absolute_import
	import torch
	import torch.nn as nn
	import numpy as np
	import math
	import yaml
	from model.utils.config import cfg
	from .generate_anchors import generate_anchors
	from .bbox_transform import bbox_transform_inv, clip_boxes, clip_boxes_batch
	from model.nms.nms_wrapper import nms
	import pdb
	DEBUG = False
	class _ProposalLayer(nn.Module):
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """

    def __init__(self, feat_stride, scales, ratios):
        """Build the base anchor set used by every forward pass.

        Args:
            feat_stride: factor that ``forward`` multiplies feature-map cell
                indices by to obtain the anchor shifts.
            scales: anchor scales, converted to an array and handed to
                ``generate_anchors``.
            ratios: anchor aspect ratios, converted to an array and handed to
                ``generate_anchors``.
        """
        super(_ProposalLayer, self).__init__()

        self._feat_stride = feat_stride

        # Base anchors are kept as a float tensor; one row per anchor.
        base_anchors = generate_anchors(scales=np.array(scales),
                                        ratios=np.array(ratios))
        self._anchors = torch.from_numpy(base_anchors).float()
        self._num_anchors = self._anchors.size(0)

        # The layer's output holds regions of interest as 5-tuples
        # (n, x1, y1, x2, y2): an image batch index n followed by a
        # rectangle (x1, y1, x2, y2).

    def forward(self, input):
        """Turn RPN outputs into a padded batch of region proposals.

        Algorithm:
          1. for each (H, W) feature-map location, shift the base anchors to
             that cell and apply the predicted bbox deltas to them
          2. clip the predicted boxes to the image
          3. sort all (proposal, score) pairs by score, highest first
          4. keep the top ``pre_nms_topN`` proposals before NMS
          5. apply NMS with the configured threshold
          6. keep at most ``post_nms_topN`` proposals after NMS

        NOTE: the minimum-size filter (``_filter_boxes`` / ``RPN_MIN_SIZE``)
        is not applied by this method.

        Args:
            input: indexable holding four items:
                ``input[0]`` -- RPN class scores; only the channels from
                    ``self._num_anchors`` onward (the foreground scores) are
                    used.
                ``input[1]`` -- RPN bbox deltas, laid out as (N, C, H, W).
                ``input[2]`` -- ``im_info``, passed through to ``clip_boxes``
                    (presumably per-image height/width/scale -- confirm
                    against ``clip_boxes``).
                ``input[3]`` -- key selecting the ``cfg`` section that holds
                    the ``RPN_*`` settings.

        Returns:
            Tensor of shape (N, R, 5) created from the scores tensor.  Column
            0 is the image index within the batch and columns 1-4 are the
            proposal coordinates.  R is ``post_nms_topN`` when that setting
            is positive; otherwise it is the largest number of NMS survivors
            over the batch.  Rows beyond an image's surviving proposals keep
            zero coordinates.
        """
        # the first set of _num_anchors channels are bg probs
        # the second set are the fg probs
        scores = input[0][:, self._num_anchors:, :, :]
        bbox_deltas = input[1]
        im_info = input[2]
        cfg_key = input[3]

        pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

        batch_size = bbox_deltas.size(0)

        feat_height, feat_width = scores.size(2), scores.size(3)

        # Offsets of every feature-map cell, scaled by the feature stride.
        shift_x = np.arange(0, feat_width) * self._feat_stride
        shift_y = np.arange(0, feat_height) * self._feat_stride
        # After meshgrid both arrays have shape (feat_height, feat_width).
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)

        # One (x, y, x, y) row per cell, so a shift can be added to both
        # corners of a box: shape (feat_height * feat_width, 4).
        shifts = torch.from_numpy(np.vstack((shift_x.ravel(), shift_y.ravel(),
                                             shift_x.ravel(), shift_y.ravel())).transpose())
        # contiguous() guarantees a dense layout so the view() below is valid
        # after the transpose.
        shifts = shifts.contiguous().type_as(scores).float()

        # Enumerate all shifted anchors:
        #
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = self._num_anchors
        K = shifts.size(0)

        # Cache the converted anchors on the module so later calls reuse them.
        self._anchors = self._anchors.type_as(scores)
        anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
        # expand() shares the same anchors across every image in the batch.
        anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4)

        # Transpose and reshape predicted bbox transformations to get them
        # into the same order as the anchors: channels last, then rows of 4.
        bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
        bbox_deltas = bbox_deltas.view(batch_size, -1, 4)

        # Same story for the scores: channels last, then one row per image.
        scores = scores.permute(0, 2, 3, 1).contiguous()
        scores = scores.view(batch_size, -1)

        # Convert anchors into proposals via bbox transformations.
        proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)

        # Clip predicted boxes to the image.
        proposals = clip_boxes(proposals, im_info, batch_size)

        # Sort every image's scores in descending order.
        _, order = torch.sort(scores, 1, True)

        kept_proposals = []
        for i in range(batch_size):
            # Take the top pre_nms_topN candidates; slicing already copes
            # with a limit larger than the number of candidates.
            order_single = order[i]
            if pre_nms_topN > 0:
                order_single = order_single[:pre_nms_topN]

            proposals_single = proposals[i][order_single, :]
            scores_single = scores[i][order_single].view(-1, 1)

            # Apply NMS on (x1, y1, x2, y2, score) rows.
            keep_idx_i = nms(torch.cat((proposals_single, scores_single), 1),
                             nms_thresh, force_cpu=not cfg.USE_GPU_NMS)
            keep_idx_i = keep_idx_i.long().view(-1)

            # Keep at most post_nms_topN survivors; a non-positive limit
            # keeps all of them.
            if post_nms_topN > 0:
                keep_idx_i = keep_idx_i[:post_nms_topN]
            kept_proposals.append(proposals_single[keep_idx_i, :])

        # Size the output only after NMS so that a non-positive
        # post_nms_topN ("no limit") yields a usable tensor instead of
        # failing on a zero/negative-sized allocation.
        if post_nms_topN > 0:
            num_rows = post_nms_topN
        else:
            num_rows = max([kept.size(0) for kept in kept_proposals] or [0])

        output = scores.new(batch_size, num_rows, 5).zero_()
        for i, kept in enumerate(kept_proposals):
            # Column 0 carries the batch index on every row, padding included.
            output[i, :, 0] = i
            # Remaining rows stay zero-padded.
            output[i, :kept.size(0), 1:] = kept

        return output


    def backward(self, top, propagate_down, bottom):
        """No-op: this layer does not propagate gradients."""
        return None

    def reshape(self, bottom, top):
        """No-op: reshaping happens during the call to forward."""
        return None

    def _filter_boxes(self, boxes, min_size):
        """Remove all boxes with any side smaller than min_size."""
        ws = boxes[:, :, 2] - boxes[:, :, 0] + 1
        hs = boxes[:, :, 3] - boxes[:, :, 1] + 1
        keep = ((ws >= min_size.view(-1,1).expand_as(ws)) & (hs >= min_size.view(-1,1).expand_as(hs)))
        return keep