Faster R-CNN代码解读之anchor_target_layer.py

最新推荐文章于 2024-07-20 17:36:52 发布
hancoder
最新推荐文章于 2024-07-20 17:36:52 发布
阅读量521
点赞数 1
分类专栏：深度学习 python
blog.csdn.net/hancoder 可以打赏后索要离线md文件，打赏码在谷粒笔记的文尾。博主目前就职于阿里云，有需要可内推
本文链接：https://blog.csdn.net/hancoder/article/details/90320721
版权
深度学习同时被 2 个专栏收录
5 篇文章 2 订阅
订阅专栏
python
3 篇文章 0 订阅
订阅专栏

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from model.config import cfg
import numpy as np
import numpy.random as npr
from utils.bbox import bbox_overlaps
from model.bbox_transform import bbox_transform
import torch

# 解读faster rcnn重要的几个文件之anchor_target_layer.py文件
# https://github.com/ruotianluo/pytorch-faster-rcnn
"""
这个程序里的东西都是为训练服务的，而训练好后前向出来后topn，NMS等操作在proposal_layer.py
因为是为训练服务的，所以其中有赋label的过程，还有为了计算loss而提供的rpn_bbox_inside_weights等数据
rpn_bbox_inside_weights即loss中为了让loss只计算前景的框位置回归而设置的tensor
用法类似于 (delta - gt) 点乘 rpn_bbox_inside_weights，即得到了边框loss。
anchor_target_layer是入口，此时有2W个原始anchor
处理步骤为：
-->首先赋label：总的锚点为all_anchors，去除边框外的框
-->只算在边界内的框，这些框的下标inds_inside，框为anchors，分类为label=-1，三者行数都相等。
-->将与gt的最大overlap（IoU）≥0.7的anchor赋label=1，＜0.3的赋label=0。
-->还需要把与gt的overlap最大的框赋label=1。gt_argmax_overlaps是gt依次对应的第几个anchor。
-->前景(label=1)最多保留 RPN_FG_FRACTION×RPN_BATCHSIZE 个（本文按128个计），背景(label=0)最多保留 RPN_BATCHSIZE 减去前景数 个，多出来的随机重新设置label=-1
-->内部权重bbox_inside_weights 为坐标前乘的系数，格式为[[0,0,0,0或1111]，...刚开始为内部框个，后来是全部all_anchors个]
    外部权重bbox_outside_weights 为正负区别的系数，格式同上，不过非0数字为1/N_sample,正负权重可以设置（focal Loss论文中会设置）
-->前面都是在边界内anchor上计算的，
    然后用_unmap函数把label,bbox_targets,  bbox_inside_weights,bbox_outside_weights这几个参数扩展到
    全部anchor的shape尺度，对于边界外的anchor，label填充的是-1，其他三个填充的是0.（列数不变）
-->然后相当于有all_anchors个信息，这些信息的排序方式通过label.reshape((1, height, width, A))可以看出是
    联系特征图，先是A（K=9）个anchor为一组，按行排列，排列完一行另起一行。这样height, width, A就解释通了。
    数字1应该是1张图片
    然后.transpose(0,3,1,2)，可以把k理解为通道（只是假设理解），即先填充了一个通道的W×H吗，然后再填第二个通道
    最后.reshape((1, 1, A * height, width))顺序应该没变，只是括号[]前后位置变了
    其余3个返回值应该还是k-W-H-图的理解方式，只不过元素个数×4，也就是4-k-W-H-图
    .reshape((1, height, width, A * 4))
"""
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride,
                        all_anchors, num_anchors):
    """Assign RPN training labels and box-regression targets to anchors.

    Same as the anchor target layer in original Fast/er RCNN.

    Processing steps:
      1. Keep only anchors that lie inside the image.
      2. Label them from their overlap with the ground-truth boxes
         (1 = foreground, 0 = background, -1 = don't care).
      3. Randomly subsample so that at most ``cfg.TRAIN.RPN_BATCHSIZE``
         anchors keep a 0/1 label.
      4. Compute regression targets plus inside/outside loss weights.
      5. Scatter the results back to the full anchor set with ``_unmap``
         and reshape them to the feature-map layout.

    Args:
        rpn_cls_score: object whose ``shape[1:3]`` is read as the feature
            map ``(height, width)``; its values are not used here.
        gt_boxes: ground-truth boxes, one per row. ``_compute_targets``
            requires 5 columns and uses the first 4 as coordinates.
        im_info: indexable; ``im_info[0]`` bounds anchor column 3 (height)
            and ``im_info[1]`` bounds anchor column 2 (width).
        _feat_stride: unused in this function; kept so the signature stays
            compatible with callers.
        all_anchors: ``(total_anchors, 4)`` array. The border test below
            treats the columns as ``x1, y1, x2, y2``. ``total_anchors``
            must equal ``height * width * num_anchors`` for the final
            reshapes to succeed.
        num_anchors: number of anchors per feature-map cell (``A``).

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)``. ``rpn_labels`` has shape
        ``(1, 1, A * height, width)``; the other three have shape
        ``(1, height, width, A * 4)``.
    """
    A = num_anchors  # anchors per feature-map location
    total_anchors = all_anchors.shape[0]  # anchors over the whole feature map

    # allow boxes to sit over the edge by a small amount
    # (0 means anchors crossing the border are not allowed at all)
    _allowed_border = 0

    # map of shape (..., H, W)
    height, width = rpn_cls_score.shape[1:3]

    # only keep anchors inside the image; np.where returns a tuple of index
    # arrays, [0] takes the row indices
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]

    # keep only inside anchors
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    # Every inside anchor starts as "don't care" (-1).
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt): one row per inside anchor, one column per gt box.
    # np.float64 replaces the removed ``np.float`` alias (same dtype).
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    # For every anchor: index of its best-matching gt box and that overlap.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # For every gt box: the best overlap achieved by any anchor ...
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # ... and the row indices of all anchors reaching that best overlap
    # (ties included, hence the comparison instead of reusing argmax).
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        # first set the negatives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        # Randomly pick the surplus (without replacement) and set it back
        # to "don't care".
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # subsample negative labels if we have too many
    # Background fills whatever the foreground left of the batch.
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets of every inside anchor w.r.t. its best-matching
    # gt box; result has one row per inside anchor.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Inside weights: shape (num inside anchors, 4).
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # only the positive ones have regression targets
    bbox_inside_weights[labels == 1, :] = np.array(
        cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    # Outside weights: same shape as the inside weights.
    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling):
        # every sampled anchor gets 1 / (number of labelled anchors).
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        # Explicit positive/negative balance, each normalised by the size
        # of its own group.
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (
            cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1))
        negative_weights = (
            (1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # map up to original set of anchors
    # _unmap expands each array from "inside anchors" back to all anchors;
    # ``fill`` is the value given to anchors outside the image
    # (-1 for labels, 0 for targets and weights).
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(
        bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(
        bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels: flat (total_anchors,) -> (1, H, W, A) -> (1, A, H, W)
    # -> (1, 1, A * H, W)
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets: (total_anchors, 4) -> (1, H, W, A * 4)
    bbox_targets = bbox_targets \
      .reshape((1, height, width, A * 4))
    rpn_bbox_targets = bbox_targets

    # bbox_inside_weights: (total_anchors, 4) -> (1, H, W, A * 4)
    bbox_inside_weights = bbox_inside_weights \
      .reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights

    # bbox_outside_weights: (total_anchors, 4) -> (1, H, W, A * 4)
    bbox_outside_weights = bbox_outside_weights \
      .reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights


def _unmap(data, count, inds, fill=0):
    """ Unmap a subset of item (data) back to the original set of items (of
  size count) """
    if len(data.shape) == 1: # data（label）是一维的，如[1,2,3,4]
    # 例_unmap(labels, total_anchors, inds_inside, fill=-1)
        ret = np.empty((count, ), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data # ret[inds_inside] = labels
    else:
    #例2： _unmap(data=bbox_targets, count=total_anchors, inds=inds_inside, fill=0)
        ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
        # 单纯第data.shape[1]是一个数，但是data.shape[1:]是一个元组(2,)
        # np.empty((2,)+(3,))是2行3列的空矩阵
        ret.fill(fill)
        ret[inds, :] = data
    return ret


def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    This function only validates the array shapes; the actual conversion
    is delegated to ``bbox_transform``.

    Args:
        ex_rois: ``(N, 4)`` numpy array of anchor boxes.
        gt_rois: ``(N, 5)`` numpy array holding, for each anchor, the
            ground-truth box it was matched with; only the first four
            columns are forwarded (the fifth is presumably the class
            label -- confirm against the data loader).

    Returns:
        Numpy array produced by ``bbox_transform`` for the N pairs
        (per the original author's note: offsets tx, ty, tw, th).
    """
    # One matched gt row per anchor, 4 coordinates per anchor,
    # 5 columns per gt row.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    # bbox_transform is called with torch tensors, so wrap the numpy
    # inputs and convert the result back to numpy for the caller.
    anchors_t = torch.from_numpy(ex_rois)
    gt_coords_t = torch.from_numpy(gt_rois[:, :4])
    deltas = bbox_transform(anchors_t, gt_coords_t)
    return deltas.numpy()