# Faster R-CNN code walkthrough: anchor_target_layer.py


# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from model.config import cfg
import numpy as np
import numpy.random as npr
from utils.bbox import bbox_overlaps
from model.bbox_transform import bbox_transform
import torch

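# Walkthrough of one of the key Faster R-CNN files: anchor_target_layer.py
# https://github.com/ruotianluo/pytorch-faster-rcnn
"""
Everything in this file serves training. The operations applied after the
forward pass of a trained model (top-N selection, NMS, ...) live in
proposal_layer.py.
Because the file serves training, it assigns labels and also produces the
extra data needed to compute the loss, such as rpn_bbox_inside_weights.
rpn_bbox_inside_weights is the tensor that restricts the box-regression loss
to foreground anchors. It is used roughly as
(delta - gt) * rpn_bbox_inside_weights (element-wise), which yields the box loss.
anchor_target_layer is the entry point; at that point there are about 20,000
raw anchors.
Processing steps:
--> Assign labels first: start from all anchors (all_anchors) and drop the
    ones that cross the image boundary.
--> Only in-bounds anchors are considered. Their indices are inds_inside, the
    boxes are anchors, and the labels start at -1; all three have the same
    number of rows.
--> Anchors whose overlap is above the positive threshold (0.7) get label=1;
    those below the negative threshold (0.3) get label=0.
--> The anchor with the largest overlap for each gt box also gets label=1.
    gt_argmax_overlaps holds the indices of those anchors.
--> Subsample: keep at most RPN_FG_FRACTION * RPN_BATCHSIZE positives
    (label 1), fill the rest of the batch with negatives (label 0) - 128 of
    each in the typical case - and reset every other label to -1.
--> bbox_inside_weights are the coefficients multiplied onto the coordinates,
    laid out as [[0,0,0,0] or [1,1,1,1], ...]; there is one row per in-bounds
    anchor at first and one row per anchor in all_anchors later.
    bbox_outside_weights are the coefficients that distinguish positives from
    negatives. The layout is the same, but the non-zero value is 1/N_sample;
    the positive/negative weights are configurable (the Focal Loss paper
    sets them).
--> Everything above is computed on the in-bounds anchors only.
    _unmap then expands labels, bbox_targets, bbox_inside_weights and
    bbox_outside_weights to the shape of the full anchor set. For
    out-of-bounds anchors the label is filled with -1 and the other three are
    filled with 0 (the number of columns does not change).
--> The result holds one entry per anchor in all_anchors. Their ordering can
    be read off label.reshape((1, height, width, A)): relative to the feature
    map, the A (k=9) anchors of one location form a group, groups are laid out
    along a row, and a new row starts when one is finished. That explains
    height, width, A. The leading 1 should be the single image.
    .transpose(0, 3, 1, 2) then lets k be thought of as a channel (only as a
    mental model): the whole W x H map of one channel is filled first, then
    the next channel.
    Finally .reshape((1, 1, A * height, width)) should leave the order
    unchanged and only move the brackets.
    The other three return values are kept in the (height, width, A) layout
    with four values per anchor:
    .reshape((1, height, width, A * 4))
"""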
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride,
                        all_anchors, num_anchors):
    """Assign RPN training labels and box-regression targets to anchors.

    Same as the anchor target layer in original Fast/er RCNN.

    Steps, all computed on the anchors that lie inside the image and then
    scattered back to the full anchor set with ``_unmap``:

    1. Label every inside anchor -1 (don't care), then mark negatives
       (max overlap < ``cfg.TRAIN.RPN_NEGATIVE_OVERLAP``) as 0 and positives
       (best anchor for some gt box, or max overlap >=
       ``cfg.TRAIN.RPN_POSITIVE_OVERLAP``) as 1.
    2. Randomly disable surplus positives and negatives so that at most
       ``cfg.TRAIN.RPN_BATCHSIZE`` anchors keep a 0/1 label.
    3. Compute regression targets from each anchor to its best-matching gt
       box, plus inside weights (non-zero for positives only) and outside
       weights (per-example normalisation).

    Args:
        rpn_cls_score: only its shape is used; dimensions 1 and 2 are read as
            the feature-map height and width.
        gt_boxes: ground-truth boxes, one per row. ``_compute_targets``
            asserts five columns and uses the first four as coordinates.
        im_info: indexable; element 0 is compared against anchor column 3
            (image height) and element 1 against anchor column 2 (width).
        _feat_stride: unused; kept for interface compatibility.
        all_anchors: ``(total_anchors, 4)`` array. Columns 0/1 are checked
            against the top-left image border and columns 2/3 against the
            width/height, i.e. rows are treated as ``(x1, y1, x2, y2)``.
        num_anchors: number of anchors per feature-map location (``A``).
            ``total_anchors`` must equal ``height * width * A`` for the final
            reshapes to succeed.

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)`` with shapes ``(1, 1, A * height, width)``
        for the labels and ``(1, height, width, A * 4)`` for the other three.
    """
    A = num_anchors  # anchors per feature-map location
    total_anchors = all_anchors.shape[0]  # anchors over the whole feature map

    # Allow boxes to sit over the edge by a small amount; 0 means no overhang
    # is tolerated.
    _allowed_border = 0

    # Dimensions 1 and 2 of the score map are taken as (height, width).
    height, width = rpn_cls_score.shape[1:3]

    # Only keep anchors that lie inside the image.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is don't care.
    # Every inside anchor starts as "don't care".
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # Overlaps between the inside anchors (rows) and the gt boxes (columns).
    # np.float64 replaces the np.float alias, which was removed in NumPy 1.24;
    # both name the same dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # For each anchor: index of its best-matching gt box and that overlap.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # For each gt box: the best overlap achieved by any anchor ...
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # ... and then every anchor row that attains such a maximum, so ties are
    # all kept rather than only the first argmax hit.
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Assign bg labels first so that positive labels can clobber them.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, the anchor(s) with the highest overlap.
    labels[gt_argmax_overlaps] = 1

    # fg label: overlap at or above the positive threshold.
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Assign bg labels last so that negative labels can clobber positives.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # Subsample positive labels if we have too many: the surplus, chosen at
    # random without replacement, is reset to "don't care".
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # Subsample negative labels if we have too many: negatives fill whatever
    # part of the batch the remaining positives leave free.
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets from every inside anchor to its best-matching gt box.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Only the positive anchors have regression targets, so only their rows
    # get non-zero inside weights.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(
        cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # Uniform weighting of examples (given non-uniform sampling): every
        # sampled anchor, positive or negative, gets 1 / num_examples.
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert 0 < cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1
        # Split the total weight between positives and negatives, normalised
        # by the size of each group.
        positive_weights = (
            cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1))
        negative_weights = (
            (1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # Map up to the original set of anchors. Anchors outside the image get
    # label -1 and all-zero targets/weights, so the number of rows carrying a
    # 0/1 label is unchanged.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(
        bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(
        bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels: the flat vector is interpreted as (1, height, width, A), moved
    # to (1, A, height, width) and then flattened to (1, 1, A * height, width).
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    rpn_labels = labels.reshape((1, 1, A * height, width))

    # The remaining three outputs keep the (height, width, A) ordering, with
    # the four values of each anchor packed into the last axis.
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape(
        (1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape(
        (1, height, width, A * 4))
    return (rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
            rpn_bbox_outside_weights)


def _unmap(data, count, inds, fill=0):
    """Scatter a subset of rows back into an array covering the full set.

    Args:
        data: values for the selected items; 1-D (e.g. labels) or with extra
            trailing dimensions (e.g. one row of four values per item).
        count: number of items in the full set, i.e. the length of the result
            along axis 0.
        inds: positions in the full set that ``data`` corresponds to.
        fill: value written to every position not listed in ``inds``.

    Returns:
        A float32 array of shape ``(count,) + data.shape[1:]`` holding ``data``
        at ``inds`` and ``fill`` everywhere else.
    """
    if len(data.shape) == 1:
        # Flat case, e.g. _unmap(labels, total_anchors, inds_inside, fill=-1).
        expanded = np.full((count, ), fill, dtype=np.float32)
        expanded[inds] = data
        return expanded

    # Row case, e.g. _unmap(bbox_targets, total_anchors, inds_inside, fill=0):
    # the trailing dimensions of data are carried over unchanged.
    full_shape = (count, ) + data.shape[1:]
    expanded = np.full(full_shape, fill, dtype=np.float32)
    expanded[inds, :] = data
    return expanded


def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    This is a thin shape-checking wrapper: the actual computation is delegated
    to ``bbox_transform``, which is called with torch tensors and whose result
    is converted back to a NumPy array.

    Args:
        ex_rois: NumPy array with one box per row and exactly 4 columns.
        gt_rois: NumPy array with the same number of rows as ``ex_rois`` and
            exactly 5 columns; only the first 4 are passed on (the fifth is
            presumably the class label - confirm against the caller).

    Returns:
        The NumPy array produced by ``bbox_transform`` for the paired boxes
        (presumably per-box regression offsets - see ``bbox_transform``).
    """
    # Row i of gt_rois must be the box matched to row i of ex_rois.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    example_boxes = torch.from_numpy(ex_rois)
    # Drop the fifth column before handing the gt boxes over.
    matched_gt_boxes = torch.from_numpy(gt_rois[:, :4])
    targets = bbox_transform(example_boxes, matched_gt_boxes)
    return targets.numpy()


# (End of article. The trailing comment / payment-widget text scraped from the
# hosting web page was not part of the module and has been removed.)