Faster R-CNN源码阅读之八：Faster R-CNN/lib/rpn_msr/proposal_target_layer

本文链接：https://blog.csdn.net/DaVinciL/article/details/81939966
一、介绍
本demo由Faster R-CNN官方提供，我只是在官方的代码上增加了注释，一方面方便我自己学习，另一方面贴出来和大家一起交流。
该文件中的函数的主要目的是根据所传入的参数rpn rois和gt boxes等信息对rois进行采样，并确定每一个roi的labels标签和bbox回归目标。
二、代码以及注释
# coding=utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import yaml
import numpy as np
import numpy.random as npr
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
import pdb

# When True, proposal_target_layer prints foreground/background sampling
# statistics on every call.
DEBUG = False

# Combines the RPN proposals with the ground-truth boxes, samples a batch of
# RoIs from them, and produces per-RoI class labels, bbox regression targets
# and the inside/outside weights that go with those targets.
def proposal_target_layer(rpn_rois, gt_boxes, _num_classes):
    """
    Assign object detection proposals to ground-truth targets.
    Produces proposal classification labels and bounding-box regression targets.

    :param rpn_rois: proposals, one per row, laid out as
                     (image index, x1, y1, x2, y2). Only single-image batches
                     are supported, so the image index must be 0.
    :param gt_boxes: ground-truth boxes, one per row; every column except the
                     last is a box coordinate and the last column is the class
                     label.
    :param _num_classes: total number of classes; sets the width
                         (4 * _num_classes) of the regression target arrays.
    :returns: tuple (rois, labels, bbox_targets, bbox_inside_weights,
              bbox_outside_weights) where rois is reshaped to (-1, 5), labels
              to (-1, 1) and the three remaining arrays to
              (-1, 4 * _num_classes).
    """

    # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
    # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
    all_rois = rpn_rois
    # TODO(rbg): it's annoying that sometimes I have extra info before
    # and other times after box coordinates -- normalize to one format

    # Include ground-truth boxes in the set of candidate rois.
    # A zero column is prepended to the gt boxes as their image index, and the
    # trailing class-label column is dropped so both sets share one layout.
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    all_rois = np.vstack((all_rois, np.hstack((zeros, gt_boxes[:, :-1]))))

    # Sanity check: single batch only
    assert np.all(all_rois[:, 0] == 0), 'Only single item batches are supported'

    num_images = 1
    # Floor division keeps the per-image RoI count an integer on both
    # Python 2 and Python 3 (it is later used as a sample size).
    rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
    # Number of foreground RoIs wanted per image.
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

    # Sample rois with classification labels and bounding box regression targets
    labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
        all_rois,           # proposals plus ground-truth boxes
        gt_boxes,           # ground-truth boxes (with class-label column)
        fg_rois_per_image,  # foreground RoIs wanted per image
        rois_per_image,     # total RoIs wanted per image
        _num_classes)       # total number of classes

    if DEBUG:
        num_fg = (labels > 0).sum()
        num_bg = (labels == 0).sum()
        print('num fg: {}'.format(num_fg))
        print('num bg: {}'.format(num_bg))
        # Running totals are kept on the function object: the original code
        # incremented local names that were never initialised, which raised
        # UnboundLocalError as soon as DEBUG was switched on.
        stats = proposal_target_layer
        stats._count = getattr(stats, '_count', 0) + 1
        stats._fg_num = getattr(stats, '_fg_num', 0) + num_fg
        stats._bg_num = getattr(stats, '_bg_num', 0) + num_bg
        print('num fg avg: {}'.format(stats._fg_num / stats._count))
        print('num bg avg: {}'.format(stats._bg_num / stats._count))
        # Avoid dividing by zero when no background RoI has been seen yet.
        if stats._bg_num:
            ratio = float(stats._fg_num) / float(stats._bg_num)
        else:
            ratio = float('inf')
        print('ratio: {:.3f}'.format(ratio))

    # Bring the outputs into their final 2-D layouts.
    rois = rois.reshape(-1, 5)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)

    # Outside weights are 1.0 wherever the inside weight is positive, else 0.0.
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

    return rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights


def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """
    Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets), and builds the
    matching inside-weight array.

    :param bbox_target_data: N x 5 array whose column 0 is the class and whose
                             columns 1: are the regression targets.
    :param num_classes: number of classes K.

    Returns:
        bbox_target (ndarray): N x 4K float32 blob of regression targets
        bbox_inside_weights (ndarray): N x 4K float32 blob of loss weights
    """

    # Class of every row, copied out as unsigned integers.
    class_ids = np.array(bbox_target_data[:, 0], dtype=np.uint16, copy=True)

    # Both outputs start as all-zero (N, 4K) arrays; only foreground rows
    # receive non-zero entries below.
    bbox_targets = np.zeros((class_ids.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)

    # Rows whose class is greater than 0 (class 0 is treated as background).
    for row in np.where(class_ids > 0)[0]:
        # Each class owns a run of four consecutive columns.
        first_col = 4 * class_ids[row]
        cols = slice(first_col, first_col + 4)
        bbox_targets[row, cols] = bbox_target_data[row, 1:]
        bbox_inside_weights[row, cols] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS

    return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
    """
    Compute bounding-box regression targets for an image.

    :param ex_rois: sampled RoIs, shape (N, 4).
    :param gt_rois: the ground-truth box paired with each row of ex_rois,
                    shape (N, 4).
    :param labels: 1-D array with the class label of each row of ex_rois.
    :return: float32 array that stacks the labels (as the first column) with
             the regression targets returned by bbox_transform.
    """

    # The two box sets must be paired row by row and hold 4 coordinates each.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    # See lib/fast_rcnn/bbox_transform.py for how the deltas are defined.
    deltas = bbox_transform(ex_rois, gt_rois)

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stddev
        means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        deltas = (deltas - means) / stds

    # Prepend the labels as a column in front of the targets.
    label_column = labels[:, np.newaxis]
    stacked = np.hstack((label_column, deltas))
    return stacked.astype(np.float32, copy=False)


def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """
    Generate a random sample of RoIs comprising foreground and background examples.

    :param all_rois: candidate RoIs, one per row; columns 1:5 hold the box
                     coordinates.
    :param gt_boxes: ground-truth boxes, one per row; columns 0:4 hold the box
                     coordinates and column 4 the class label.
    :param fg_rois_per_image: upper bound on the number of foreground RoIs to
                              sample.
    :param rois_per_image: total number of RoIs wanted (foreground plus
                           background).
    :param num_classes: total number of classes, forwarded to
                        _get_bbox_regression_labels.
    :returns: tuple (labels, rois, bbox_targets, bbox_inside_weights) for the
              sampled RoIs, foreground rows first and background rows after.
    """
    # overlaps: (rois x gt_boxes)
    # overlaps[i, j] is the overlap between RoI i and ground-truth box j.
    # np.float64 is used instead of the np.float alias, which was removed in
    # NumPy 1.24; the resulting dtype is the same.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))

    # For every RoI: index of its best-overlapping gt box, and that overlap.
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    # Provisional label for every RoI: the class of its best-matching gt box.
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement. When every candidate is
    # kept this only shuffles their order.
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired). Cast to int, as is done for the
    # foreground count, so a float rois_per_image cannot reach npr.choice.
    bg_rois_per_this_image = int(min(rois_per_image - fg_rois_per_this_image,
                                     bg_inds.size))
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg); foreground first.
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0. They sit after the first
    # fg_rois_per_this_image entries and still carry the class of their
    # best-matching gt box at this point.
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    # Pair every kept RoI (coordinates only) with the coordinates of its
    # assigned gt box and compute the compact N x 5 (label + targets) array.
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    # Expand the compact targets to the per-class layout and get the weights.
    bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(
        bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights