Faster R-CNN Source Code: A Detailed Analysis of the RPN

This post is just a record of my understanding while reading the source code, shared here for anyone who finds it useful.

Source code: https://github.com/kevinjliang/tf-Faster-RCNN

RPN stands for Region Proposal Network. Put plainly, it supplies RoIs to the Fast R-CNN RoI pooling layer for subsequent training and testing.

The RPN source is spread across the following files:

faster_rcnn_networks.py is the entry point of the RPN

generate_anchors.py / anchor_target_layer.py produce the RPN's training data, used for classification (foreground vs. background) and bounding-box regression

proposal_target_layer.py generates training data for each object proposal, used by the subsequent Fast R-CNN classification and coordinate regression

proposal_layer.py converts the RPN outputs into object proposal regions

class rpn:
    '''
    Region Proposal Network (RPN): From the convolutional feature maps
    (TensorBase Layers object) of the last layer, generate bounding boxes
    relative to anchor boxes and give an "objectness" score to each
    In evaluation mode (eval_mode==True), gt_boxes should be None.
    '''

    def __init__(self, featureMaps, gt_boxes, im_dims, _feat_stride, eval_mode):
        self.featureMaps = featureMaps
        self.gt_boxes = gt_boxes
        self.im_dims = im_dims
        self._feat_stride = _feat_stride
        self.anchor_scales = cfg.RPN_ANCHOR_SCALES
        self.eval_mode = eval_mode

        self._network()

    def _network(self):
        # There shouldn't be any gt_boxes if in evaluation mode
        if self.eval_mode is True:
            assert self.gt_boxes is None, \
                'Evaluation mode should not have ground truth boxes (or else what are you detecting for?)'

        _num_anchors = len(self.anchor_scales) * 3

        rpn_layers = Layers(self.featureMaps)

        with tf.variable_scope('rpn'):
            # Spatial windowing
            for i in range(len(cfg.RPN_OUTPUT_CHANNELS)):
                # A 3x3 conv with padding='same': spatial size and channel count unchanged.
                # (Fusing each point with its 3x3 neighborhood likely makes the features more robust.)
                rpn_layers.conv2d(filter_size=cfg.RPN_FILTER_SIZES[i], output_channels=cfg.RPN_OUTPUT_CHANNELS[i])

            features = rpn_layers.get_output()
            # The feature map keeps its spatial size; only the channel count changes: H * W * 256 -> H * W * 2k.
            # Each feature-map location gets k anchors, and each anchor is scored as
            # foreground or background (positive/negative), so each location predicts 2k scores
            with tf.variable_scope('cls'):
                # Box-classification layer (objectness)
                self.rpn_bbox_cls_layers = Layers(features)
                self.rpn_bbox_cls_layers.conv2d(filter_size=1, output_channels=_num_anchors * 2, activation_fn=None)

            with tf.variable_scope('target'):
                # Only calculate targets in train mode. No ground truth boxes in evaluation mode
                if self.eval_mode is False:
                    # Anchor Target Layer (anchors and deltas)
                    rpn_cls_score = self.rpn_bbox_cls_layers.get_output()
                    # In training mode, generate the labels used for classification and regression
                    self.rpn_labels, self.rpn_bbox_targets, self.rpn_bbox_inside_weights, self.rpn_bbox_outside_weights = \
                        anchor_target_layer(rpn_cls_score=rpn_cls_score, gt_boxes=self.gt_boxes, im_dims=self.im_dims,
                                            _feat_stride=self._feat_stride, anchor_scales=self.anchor_scales)

            with tf.variable_scope('bbox'):
                # The feature map keeps its spatial size; only the channel count changes: H * W * 256 -> H * W * 4k.
                # Each location gets k anchors and each anchor regresses 4 coordinates,
                # so each location predicts 4k coordinate values
                # Bounding-Box regression layer (bounding box predictions)
                self.rpn_bbox_pred_layers = Layers(features)
                self.rpn_bbox_pred_layers.conv2d(filter_size=1, output_channels=_num_anchors * 4, activation_fn=None)

    # Loss accessors. (The tail of this class was truncated in the original post;
    # reconstructed here following the repo's faster_rcnn_networks.py, which
    # delegates to the loss_functions module shown further below.)
    def get_rpn_cls_loss(self):
        assert self.eval_mode is False, 'No RPN cls loss without ground truth boxes'
        rpn_cls_score = self.rpn_bbox_cls_layers.get_output()
        return loss_functions.rpn_cls_loss(rpn_cls_score, self.rpn_labels)

    def get_rpn_bbox_loss(self):
        assert self.eval_mode is False, 'No RPN bbox loss without ground truth boxes'
        rpn_bbox_pred = self.rpn_bbox_pred_layers.get_output()
        return loss_functions.rpn_bbox_loss(rpn_bbox_pred, self.rpn_bbox_targets,
                                            self.rpn_bbox_inside_weights, self.rpn_bbox_outside_weights)
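
To make the output shapes concrete: with 3 scales and 3 ratios, k = 9 anchors per location. For a roughly 600x1000 input and feat_stride = 16 the feature map is about 38x63, so (back-of-the-envelope arithmetic, not repo code):

H, W, k = 38, 63, 9          # feature-map size and anchors per location (illustrative)
num_anchors = H * W * k      # 21546 anchors over the whole image
cls_channels = 2 * k         # 18 channels: a (background, foreground) score pair per anchor
bbox_channels = 4 * k        # 36 channels: a (dx, dy, dw, dh) delta per anchor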

Next, anchor_target_layer.py. This module does exactly two things:

1. Produce the anchors used for training and their labels, for the subsequent classification task (foreground vs. background only).

2. Compute the regression mapping between every in-image anchor and its ground-truth box, for the subsequent bounding-box regression.

"""
Generates training targets/labels for each anchor.
Classification labels are 1 (object), 0 (not object) or -1 (ignore).
Bbox regression targets are specified when the classification label is > 0.

(Generates training labels: foreground = 1, background = 0, ignored = -1, plus the box-regression targets.)
"""
import tensorflow as tf
import numpy as np
from faster_rcnn.tf import config
from faster_rcnn.tf.rpn.generate_anchors import generate_anchors
from faster_rcnn.utils.bbox_overlaps import bbox_overlaps
from faster_rcnn.utils.bbox_transform import bbox_transform


def anchor_target_layer(feature_maps, gt_boxes, im_dims, feat_stride, anchor_scales):
    # A note on tf.py_function:
    # TensorFlow 1.x builds a static graph, so tensors cannot be manipulated freely at run time.
    # py_function lets us process the tensors as NumPy arrays for convenience; note that
    # tensors produced this way do not propagate gradients, so they cannot be trained through
    rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
        tf.py_function(_anchor_target_layer, [feature_maps, gt_boxes, im_dims, feat_stride, anchor_scales],
                       [tf.float32, tf.float32, tf.float32, tf.float32])
    rpn_labels = tf.convert_to_tensor(rpn_labels, dtype=tf.float32)
    rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets, dtype=tf.float32)
    rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights, dtype=tf.float32)
    rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights, dtype=tf.float32)
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights


def _anchor_target_layer(feature_maps, gt_boxes, im_dims, feat_stride, anchor_scales):
    """

    :param feature_maps: shape (None, H, W, channels)
    :param gt_boxes: shape (None, 5) (x1, y1, x2, y2, cls)
    :param im_dims: shape (None, 2)
    :param feat_stride: 16
    :param anchor_scales: [8, 16, 32]
    :return:
    """
    assert feature_maps.shape[0] == 1, 'Only single item batches are supported'
    # This function does exactly two things:
    # 1. Produce the anchors used for training and their labels (foreground vs. background)
    # 2. Compute the mapping between every in-image anchor and its gt box, for bbox regression
    im_dims = im_dims[0]

    anchors = generate_anchors(scales=np.array(anchor_scales))
    num_anchors = anchors.shape[0]
    # feature-map size
    feature_height, feature_width = feature_maps.shape[1:3]
    # each feature-map cell covers feat_stride * feat_stride pixels of the original image
    shift_x = np.arange(0, feature_width) * feat_stride
    shift_y = np.arange(0, feature_height) * feat_stride
    # coordinates of every grid cell on the original image; one step in x or y adds feat_stride.
    # Equivalent to looping over each (H, W) location i
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # shifts has shape [H*W, 4]; each row is an (x1, y1, x2, y2) offset for one grid cell.
    # Adding a cell's offsets to the 9 base anchors yields that cell's 9 anchors;
    # the reshapes below implement exactly this addition
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    # A : 9
    A = num_anchors
    # H * W
    K = shifts.shape[0]
    # slide the 9 base anchors to every grid cell via a broadcast add
    all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
    all_anchors = all_anchors.reshape((K * A, 4))
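    # For example, with a 2x2 feature map and feat_stride = 16, shifts is
    #   [[ 0,  0,  0,  0],
    #    [16,  0, 16,  0],
    #    [ 0, 16,  0, 16],
    #    [16, 16, 16, 16]]
    # and (1, A, 4) + (K, 1, 4) broadcasts to (K, A, 4): the 9 base anchors at each of the K cells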
    # H * W * A
    total_anchors_num = int(K * A)
    # drop anchors that extend beyond the image
    # margin by which an anchor is allowed to exceed the border
    allow_border = 0
    inds_inside = np.where((all_anchors[:, 0] >= -allow_border) &
                           (all_anchors[:, 1] >= -allow_border) &
                           (all_anchors[:, 2] < im_dims[1] + allow_border) &
                           (all_anchors[:, 3] < im_dims[0] + allow_border))[0]
    anchors = all_anchors[inds_inside, :]
    # create the labels array, initialized to -1 (ignore)
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)
    # Now label the anchors:
    # 1. compute the IoU between anchors and gt_boxes
    # 2. IoU > 0.7 -> positive, IoU < 0.3 -> negative; the anchor with the highest
    #    IoU for each gt box is also marked positive
    # 3. finally sample a batch of 128 positives and 128 negatives

    # IoU between anchors and gt_boxes, shape (N, M) with N = len(anchors), M = len(gt_boxes).
    # The original repo implements this in Cython for speed;
    # np.ascontiguousarray lays the array elements out contiguously in memory
    overlaps = bbox_overlaps(np.ascontiguousarray(anchors, dtype=np.float32),
                             np.ascontiguousarray(gt_boxes, dtype=np.float32))
    # index of the gt box with the highest IoU for each anchor, shape (len(anchors),)
    argmax_overlaps = np.argmax(overlaps, axis=1)
    # A neat trick: pairing each row index with its argmax column index picks out the
    # row maxima directly, so overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # is equivalent to max_overlaps = np.max(overlaps, axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # for each gt box, the index of the anchor with the highest IoU, shape (M,)
    gt_argmax_overlaps = np.argmax(overlaps, axis=0)
    # and the corresponding IoU values
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
    # np.where returns a (rows, cols) tuple; we only need the row indices.
    # If two anchors tie for the highest IoU with a gt box, argmax(axis=0) keeps only
    # the first of them; np.where(overlaps == gt_max_overlaps) recovers every anchor
    # that attains a per-gt maximum, so this index set is a superset of the one above
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    # RPN_CLOBBER_POSITIVES controls whether negatives may overwrite positives.
    # If not, assign the negative labels first so that an anchor qualifying as both
    # ends up positive (the later assignment wins)
    if not config.TRAIN_RPN_CLOBBER_POSITIVES:
        labels[max_overlaps < config.TRAIN_RPN_NEGATIVE_OVERLAP] = 0

    labels[gt_argmax_overlaps] = 1
    labels[max_overlaps > config.TRAIN_RPN_POSITIVE_OVERLAP] = 1

    # If positives may be clobbered, assign the negatives last so that an anchor
    # qualifying as both ends up negative
    if config.TRAIN_RPN_CLOBBER_POSITIVES:
        labels[max_overlaps < config.TRAIN_RPN_NEGATIVE_OVERLAP] = 0
    # Cap the amount of training data per image: positives + negatives = 256 samples,
    # split as evenly as possible; if there are fewer than 128 positives,
    # extra negatives make up the difference
    num_fg = int(config.TRAIN_RPN_FG_FRACTION * config.TRAIN_RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        # random sampling; replace=False means no index is drawn twice
        disabled_inds = np.random.choice(fg_inds, size=len(fg_inds) - num_fg, replace=False)
        labels[disabled_inds] = -1
    num_bg = config.TRAIN_RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disabled_inds = np.random.choice(bg_inds, size=len(bg_inds) - num_bg, replace=False)
        labels[disabled_inds] = -1
    """
        overlaps = bbox_overlaps(np.ascontiguousarray(anchors, dtype=np.float),
                                np.ascontiguousarray(gt_boxes, dtype=np.float))
       # 取出每个anchor和 所有gt_boxes中iou最大的索引位置 shape(len(anchors))
       argmax_overlaps = overlaps.argmax(axis=1)
       overlaps的第一个维度和anchors的第一个维度是一致的
       gt_boxes[argmax_overlaps, :]取出来的是 anchors对应的最大gt_boxes
    """
    # On to the second task:
    # len(inds_inside) = len(anchors) = len(labels)
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # Only foreground (positive) anchors take part in box regression.
    # The linear regression model only holds when an anchor is already close to its gt
    # box (otherwise the problem is badly nonlinear), so only anchors with IoU > 0.7,
    # or the best anchor per gt box, are used
    bbox_inside_weights[labels == 1, :] = np.array([1.0, 1.0, 1.0, 1.0])
    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if config.TRAIN_RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling):
        # every sampled anchor gets the same weight, 1 / num_examples
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((config.TRAIN_RPN_POSITIVE_WEIGHT > 0) &
                (config.TRAIN_RPN_POSITIVE_WEIGHT < 1))
        # Non-uniform weighting: scale by p / num_positives and (1 - p) / num_negatives,
        # i.e. loss = p * loss(positive) + (1 - p) * loss(negative), so the more
        # numerous class cannot dominate the total loss
        positive_weights = (config.TRAIN_RPN_POSITIVE_WEIGHT / np.sum(labels == 1))
        negative_weights = ((1.0 - config.TRAIN_RPN_POSITIVE_WEIGHT) / np.sum(labels == 0))
    bbox_outside_weights[labels == 1] = positive_weights
    bbox_outside_weights[labels == 0] = negative_weights

    # map up to original set of anchors
    # expand the labels from the in-image anchors back to all H * W * A anchors
    labels = _unmap(labels, total_anchors_num, inds_inside, fill=-1)
    # map the in-image anchors' bbox_targets back to the full anchor set
    # (out-of-image anchors are filled with 0)
    bbox_targets = _unmap(bbox_targets, total_anchors_num, inds_inside, fill=0)
    # same for the inside weights
    # [H * W * A, 4]
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors_num, inds_inside, fill=0)
    # and for the outside weights
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors_num, inds_inside, fill=0)

    # reshape for the loss function (which does the final reshape)
    # [H * W * A, 1] --> [1, H, W, A] --> [1, A, H, W]
    labels = labels.reshape((1, feature_height, feature_width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * feature_height, feature_width))
    rpn_labels = labels

    # The original Caffe code additionally transposes to (1, A * 4, H, W); TensorFlow can
    # skip that because of the different data layouts: Caffe stores
    # [batch_size, channel, height, width], TensorFlow [batch_size, height, width, channel]
    #  (H * W * A, 4) -> (1, H, W, A * 4)
    rpn_bbox_targets = bbox_targets.reshape((1, feature_height, feature_width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, feature_height, feature_width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, feature_height, feature_width, A * 4))

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights


def _unmap(data, count, inds, fill=0):
    if len(data.shape) == 1:
        ret = np.empty((count, 1), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data
    else:
        ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
        ret.fill(fill)
        ret[inds, :] = data
    return ret


def _compute_targets(ex_rois, gt_rois):
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5
    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
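
As an aside, the fancy-indexing and tie-recovery tricks used above are easiest to see on a toy overlaps matrix (3 anchors, 2 gt boxes; the numbers are made up):

import numpy as np

overlaps = np.array([[0.1, 0.6],
                     [0.8, 0.3],
                     [0.8, 0.5]])                          # (N=3 anchors, M=2 gt boxes)
argmax_overlaps = overlaps.argmax(axis=1)                  # [1, 0, 0]
max_overlaps = overlaps[np.arange(3), argmax_overlaps]     # [0.6, 0.8, 0.8]
assert np.allclose(max_overlaps, overlaps.max(axis=1))     # the fancy-indexing shortcut

gt_max_overlaps = overlaps.max(axis=0)                     # [0.8, 0.6]
# argmax(axis=0) would keep only anchor 1 for gt 0 and drop the tied anchor 2;
# np.where recovers every anchor attaining a per-gt maximum
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]  # [0, 1, 2]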

The IoU computation and the box-regression mapping bbox_transform used above are both simple, so I just paste them here:

"""
The original repo implements these in Cython for speed; I use plain Python here
(PyCharm Community Edition offers no code completion for Cython). See the repo
if you are interested in the Cython version.
"""
import numpy as np


def bbox_overlaps(boxes, query_boxes):
    """

    :param boxes: shape (N, 4)
    :param query_boxes: shape (K, 4) or (K, 5); only the first 4 columns are used
    :return: overlaps of shape (N, K)
    """
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    overlaps = np.zeros((N, K), dtype=np.float32)
    for k in range(K):
        # area of query box k
        box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        for n in range(N):
            # width of the intersection
            iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1
            if iw > 0:
                ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1
                if ih > 0:
                    # iw * ih is the intersection area
                    # ua = area(A) + area(B) - intersection
                    ua = (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1) + box_area - iw * ih
                    overlaps[n, k] = iw * ih / ua
    return overlaps
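
A quick sanity check with two 10x10 boxes (the +1 convention makes them 10 pixels wide) overlapping in a 5x5 corner:

boxes = np.array([[0., 0., 9., 9.]])
query = np.array([[5., 5., 14., 14.]])
print(bbox_overlaps(boxes, query))  # [[~0.1429]] = 25 / (100 + 100 - 25)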




def bbox_transform(ex_rois, gt_rois):
    """
      返回生成的anchor和真实box之间的线性映射
      Gˆx =Pwdx(P)+Px (1)
      Gˆy =Phdy(P)+Py (2)
      Gˆw = Pw exp(dw(P))(3)
      Gˆh = Ph exp(dh(P))
      tx = (Gx − Px)/Pw (6)
      ty=(Gy−Py)/Ph (7)
      tw = log(Gw/Pw) (8)
      th = log(Gh/Ph).
      :param ex_rois:
      :param gt_rois:
      :return:
    """
    # first convert to (center x, center y, width, height) form
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.
    ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1)
    ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1)

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.
    gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1)
    gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1)

    dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    dw = np.log(gt_widths / ex_widths)
    dh = np.log(gt_heights / ex_heights)
    targets = np.vstack((dx, dy, dw, dh)).transpose()
    return targets
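
bbox_transform_inv, imported by proposal_layer.py below but not shown in this post, applies these equations in the opposite direction. A minimal sketch of my own for plain (N, 4) deltas (the repo's version may differ in details), plus a round-trip check:

def bbox_transform_inv(boxes, deltas):
    # decode (dx, dy, dw, dh) deltas back into (x1, y1, x2, y2) boxes
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1)

    pred_ctr_x = deltas[:, 0] * widths + ctr_x
    pred_ctr_y = deltas[:, 1] * heights + ctr_y
    pred_w = np.exp(deltas[:, 2]) * widths
    pred_h = np.exp(deltas[:, 3]) * heights

    pred = np.zeros_like(boxes, dtype=np.float32)
    pred[:, 0] = pred_ctr_x - 0.5 * (pred_w - 1)
    pred[:, 1] = pred_ctr_y - 0.5 * (pred_h - 1)
    pred[:, 2] = pred_ctr_x + 0.5 * (pred_w - 1)
    pred[:, 3] = pred_ctr_y + 0.5 * (pred_h - 1)
    return pred

# round trip: decoding the encoded targets recovers the gt box
anchor = np.array([[0., 0., 15., 15.]])
gt = np.array([[4., 4., 24., 19.]])
deltas = bbox_transform(anchor, gt)
print(bbox_transform_inv(anchor, deltas))  # ~[[ 4.  4. 24. 19.]]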

Next, generate_anchors.py, which generates the 9 base anchors. The only magic number is the size of base_anchor, presumably chosen from experience. Many of the resulting anchors have negative coordinates or extend beyond the image boundary; that is fine, as they are clipped later on the way to RoI pooling.

"""
Generates a regular grid of multi-scale, multi-aspect anchor boxes.
(Generates the initial 9 anchors; the construction is not unique, this follows the original repo.)
"""
import numpy as np


def generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6)):
    # base anchor: [0, 0, 15, 15]
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratios_anchors = _ratios_enum(base_anchor, ratios)
    anchors = np.vstack([_scale_enum(ratios_anchors[i], scales) for i in range(ratios_anchors.shape[0])])
    return anchors


def _ratios_enum(anchor, ratios):
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    # Obtained by solving ws * hs = w * h with hs / ws = ratio, i.e.
    # ws, hs = np.sqrt(w * h / ratios), np.sqrt(w * h / ratios) * ratios
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _whctrs(anchor):
    """
    Convert anchor coordinates: (x1, y1, x2, y2) -> (w, h, x_center, y_center)
    :param anchor:
    :return:
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors


def _scale_enum(anchor, scales):
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors

if __name__ == '__main__':
    import time

    t = time.time()
    a = generate_anchors()
    print(a)
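    # expected output (the canonical 9 anchors from the original implementation):
    # [[ -84.  -40.   99.   55.]
    #  [-176.  -88.  191.  103.]
    #  [-360. -184.  375.  199.]
    #  [ -56.  -56.   71.   71.]
    #  [-120. -120.  135.  135.]
    #  [-248. -248.  263.  263.]
    #  [ -36.  -80.   51.   95.]
    #  [ -80. -168.   95.  183.]
    #  [-168. -344.  183.  359.]]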
    print(time.time() - t)
    # from IPython import embed; embed()

 

proposal_layer.py converts the RPN outputs into object proposals, i.e. the RoIs.

The function does the following:

1. Apply the predicted regression deltas to all H * W * K anchors on the feature map

2. Clip the corrected anchors so that none extends beyond the image boundary

3. Filter out anchors that are too small

4. Sort the remaining (proposal, score) pairs by score in descending order

5. Since there are still too many anchors, keep the top pre_nms_topN before NMS

6. Apply non-maximum suppression (NMS)

7. After NMS, keep the top post_nms_topN to reduce the proposals further

8. Return these post_nms_topN proposals for subsequent training and testing

"""
Converts RPN outputs (per-anchor scores and bbox regression estimates) into object proposals.
(i.e. turns the RPN outputs into the input of the RoI pooling layer)
"""
import tensorflow as tf
from faster_rcnn.tf import config
from faster_rcnn.tf.rpn.generate_anchors import generate_anchors
import numpy as np
from faster_rcnn.utils.bbox_transform import clip_boxes, bbox_transform_inv
from faster_rcnn.tf.nms.nms_cpu import py_cpu_nms as nms



def proposal_layer(rpn_bbox_cls_prob, rpn_bbox_pred, im_dims, cfg_key, feat_stride=16, anchor_scales=[8, 16, 32]):
    return tf.reshape(tf.py_function(_proposal_layer,
                                     [rpn_bbox_cls_prob, rpn_bbox_pred, im_dims, cfg_key, feat_stride, anchor_scales],
                                     [tf.float32]), (-1, 5))


def _proposal_layer(rpn_bbox_cls_prob, rpn_bbox_pred, im_dims, cfg_key, feat_stride, anchor_scales):
    """
    Provide candidate boxes (RoIs) for RoI pooling
    :param rpn_bbox_cls_prob:  (1, H, W, 2K)
    :param rpn_bbox_pred: (1, H, W, 4*K)
    :param im_dims:
    :param cfg_key:
    :param feat_stride:
    :param anchor_scales:
    :return:
    """
    # generate the base anchors
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    assert rpn_bbox_cls_prob.shape[0] == 1, 'Only single item batches are supported'
    
    # training and testing use different parameters
    if cfg_key == 'TRAIN':
        pre_nms_topN = config.TRAIN_RPN_PRE_NMS_TOP_N
        post_nms_topN = config.TRAIN_RPN_POST_NMS_TOP_N
        nms_thresh = config.TRAIN_RPN_NMS_THRESH
        min_size = config.TRAIN_RPN_MIN_SIZE
    else:  # cfg_key == 'TEST':
        pre_nms_topN = config.TEST_RPN_PRE_NMS_TOP_N
        post_nms_topN = config.TEST_RPN_POST_NMS_TOP_N
        nms_thresh = config.TEST_RPN_NMS_THRESH
        min_size = config.TEST_RPN_MIN_SIZE

    # the first 9 channels are background (negative) scores, the last 9 foreground (positive)
    # scores; the reason is discussed with the loss functions at the end of the post
    scores = rpn_bbox_cls_prob[:, :, :, _num_anchors:]
    bbox_deltas = rpn_bbox_pred
    height, width = rpn_bbox_cls_prob.shape[1], rpn_bbox_cls_prob.shape[2]

    # generate all anchors
    shift_x = np.arange(0, width) * feat_stride
    shift_y = np.arange(0, height) * feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()

    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    bbox_deltas = bbox_deltas.reshape((-1, 4))
    scores = scores.reshape((-1, 1))

    # 1. correct all anchor coordinates using the regression deltas predicted by the RPN
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    # clip the proposals, since some anchors extend beyond the image
    proposals = clip_boxes(proposals, im_dims)

    # 3. remove predicted boxes with either height or width < threshold
    # filter out small proposals
    keep = _filter_boxes(proposals, min_size)
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN (e.g. 6000)
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. apply nms (e.g. threshold = 0.7)
    # 7. take after_nms_topN (e.g. 300)
    # 8. return the top proposals (-> RoIs top)
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    # this layout matches the RoI input format expected by Fast R-CNN
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob


def _filter_boxes(boxes, min_size):
    ws = boxes[:, 2] - boxes[:, 0]
    hs = boxes[:, 3] - boxes[:, 1]
    keep = np.where((ws >= min_size) & (hs >= min_size))[0]
    return keep
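
The nms imported at the top is not shown in this post. For reference, the standard pure-NumPy greedy NMS (my sketch of what py_cpu_nms typically looks like; the repo's nms_cpu module may differ in details):

def py_cpu_nms(dets, thresh):
    # dets: (N, 5) rows of (x1, y1, x2, y2, score)
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]          # process boxes from highest score down
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the current best box with all remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        # drop every remaining box that overlaps the kept one too much
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]
    return keep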

Next up, proposal_target_layer.py:

import tensorflow as tf
import numpy as np
from faster_rcnn.tf import config
from faster_rcnn.utils.bbox_overlaps import bbox_overlaps
from faster_rcnn.utils.bbox_transform import bbox_transform


def proposal_target_layer(rpn_rois, gt_boxes, _num_classes):
    rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = \
        tf.py_function(_proposal_target_layer_py,
                       [rpn_rois, gt_boxes, _num_classes],
                       [tf.float32, tf.int32, tf.float32, tf.float32, tf.float32])

    rois = tf.reshape(rois, [-1, 5], name='rois')
    labels = tf.convert_to_tensor(tf.cast(labels, tf.int32), name='labels')
    bbox_targets = tf.convert_to_tensor(bbox_targets, name='bbox_targets')
    bbox_inside_weights = tf.convert_to_tensor(bbox_inside_weights, name='bbox_inside_weights')
    bbox_outside_weights = tf.convert_to_tensor(bbox_outside_weights, name='bbox_outside_weights')

    return rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights


def _proposal_target_layer_py(rpn_rois, gt_boxes, _num_classes):
    all_rois = rpn_rois
    # Include ground-truth boxes in the set of candidate rois
    # include the ground-truth boxes among the candidate RoIs used for training
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    all_rois = np.vstack(
        (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )

    # Sanity check: single batch only
    assert np.all(all_rois[:, 0] == 0), \
        'Only single item batches are supported'
    # number of positive/negative samples per image
    num_images = 1
    rois_per_image = config.TRAIN_BATCH_SIZE // num_images
    # 128 * 0.25 = 32
    fg_rois_per_image = np.round(config.TRAIN_FG_FRACTION * rois_per_image).astype(np.int32)

    # Sample rois with classification labels and bounding box regression
    # targets
    # generate the training labels and the box-regression data
    labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
        all_rois, gt_boxes, fg_rois_per_image,
        rois_per_image, _num_classes)

    rois = rois.reshape(-1, 5)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)

    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

    return np.float32(rois), labels, bbox_targets, bbox_inside_weights, bbox_outside_weights


def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """
    Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    # make the arrays memory-contiguous and compute the IoUs
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))

    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    # assign each RoI the class of the gt box it overlaps most
    labels = gt_boxes[gt_assignment, 4]

    # set up the numbers of positive and negative samples
    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= config.TRAIN_FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        # random sampling without replacement
        fg_inds = np.random.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < config.TRAIN_BG_THRESH_HI) &
                       (max_overlaps >= config.TRAIN_BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = np.random.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
    # The indices that we're selecting (both fg and bg)
    # the indices we keep (foreground first, then background)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    # labels now holds up to rois_per_image (128) entries
    labels = labels[keep_inds]

    # Clamp labels for the background RoIs to 0
    # the first fg_rois_per_this_image entries are foreground; the rest become background (label 0)
    labels[fg_rois_per_this_image:] = 0
    # the corresponding RoIs
    rois = all_rois[keep_inds]

    # convert the kept RoIs into regression targets using the box-transform formulas
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
    # expand the targets for class-specific regression:
    # n * 5 -> n * 4k (k = number of classes)
    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights


def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)
    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets).
    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    # This is the Fast R-CNN box regression: each RoI regresses a 4k vector of coordinates
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    inds = np.where(clss > 0)[0]
    for ind in inds:
        # each class owns 4 regression slots, laid out in class order;
        # fill in the regression values at the RoI's own class slot
        cls = clss[ind]
        start = int(4 * cls)
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = (1, 1, 1, 1)
    return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image."""

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = bbox_transform(ex_rois, gt_rois)
    if config.TRAIN_BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        targets = ((targets - np.array(config.TRAIN_BBOX_NORMALIZE_MEANS))
                   / np.array(config.TRAIN_BBOX_NORMALIZE_STDS))
    # prepend the class label as the first column
    return np.hstack((labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
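
To see the 4-of-4K expansion performed by _get_bbox_regression_labels, a toy example with num_classes = 3 (class 0 is background; the values are made up):

bbox_target_data = np.array([[2., 0.1, 0.2, 0.3, 0.4],   # (class, tx, ty, tw, th)
                             [0., 0.0, 0.0, 0.0, 0.0]], dtype=np.float32)
targets, inside_w = _get_bbox_regression_labels(bbox_target_data, num_classes=3)
print(targets[0])   # zeros except columns 8:12 = [0.1 0.2 0.3 0.4], the slot of class 2
print(inside_w[1])  # all zeros: background rows contribute no regression loss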

Finally, let's look at the loss functions.

Recall that in proposal_layer.py we wrote

scores = rpn_bbox_cls_prob[:, :, :, _num_anchors:], claiming that the first 9 channels are background scores and the last 9 foreground scores.

Here is the reason:

When computing the classification loss, we apply the following transformations:

# (1, H, W, 2K) --> (1, 2K, H, W) --> (1, 2, K * H, W ) --> (1, K * H, W, 2 )
shape = rpn_cls_output.shape
rpn_cls_output = tf.transpose(rpn_cls_output, (0, 3, 1, 2))
rpn_cls_output = tf.reshape(rpn_cls_output, (shape[0], 2, shape[3] // 2 * shape[1], shape[2]))
rpn_cls_output = tf.transpose(rpn_cls_output, [0, 2, 3, 1])
rpn_cls_output = tf.reshape(rpn_cls_output, [-1, 2])
# reshape the labels to line up (the loss function does the final reshape)
# [H * W * A, 1] --> [1, H, W, A] --> [1, A, H, W]
labels = labels.reshape((1, feature_height, feature_width, A)).transpose(0, 3, 1, 2)
labels = labels.reshape((1, 1, A * feature_height, feature_width))
rpn_labels = labels

These reshapes line up rpn_cls_output with the labels so that, per anchor, the first 9 channels correspond to background (label = 0) and the last 9 to foreground (label = 1).
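
This channel ordering can be verified with a tiny NumPy mock (K = 2 anchors on a 2x2 feature map; my own check, not repo code):

import numpy as np

K, H, W = 2, 2, 2
x = np.arange(H * W * 2 * K).reshape(1, H, W, 2 * K)  # NHWC scores; each value encodes its channel
x = x.transpose(0, 3, 1, 2)                           # (1, 2K, H, W)
x = x.reshape(1, 2, K * H, W)                         # split the 2K channels into 2 classes
x = x.transpose(0, 2, 3, 1).reshape(-1, 2)            # (K * H * W, 2) per-anchor logits
# column 0 comes entirely from channels 0..K-1 (background),
# column 1 from channels K..2K-1 (foreground)
assert (x[:, 0] % (2 * K) < K).all() and (x[:, 1] % (2 * K) >= K).all()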

By now it should be clear what the RPN does and how it is implemented.

def rpn_cls_loss(rpn_cls_output, rpn_labels):
    """
    1/N_cls * sum_i(L_cls(p_i,p_i^*))
    :param rpn_cls_output:
    :param rpn_labels:
    :return:
    """
    with tf.variable_scope('rpn_cls_loss'):
        # (1, H, W, 2K) --> (1, 2K, H, W) --> (1, 2, K * H, W ) --> (1, K * H, W, 2 )
        # the same transformation the labels went through; without it the 2K channels
        # would be read as one full background map followed by one full foreground map
        shape = rpn_cls_output.shape
        rpn_cls_output = tf.transpose(rpn_cls_output, (0, 3, 1, 2))
        rpn_cls_output = tf.reshape(rpn_cls_output, (shape[0], 2, shape[3] // 2 * shape[1], shape[2]))
        rpn_cls_output = tf.transpose(rpn_cls_output, [0, 2, 3, 1])
        rpn_cls_output = tf.reshape(rpn_cls_output, [-1, 2])

        #  [H * W * A, 1] --> [1, H, W, A] --> [1, A, H, W] --> [1, A * H, W]
        #  labels = labels.reshape((1, feature_height, feature_width, A)).transpose(0, 3, 1, 2)
        #  labels = labels.reshape((1, 1, A * feature_height, feature_width))
        #  rpn_labels = labels
        rpn_labels = tf.reshape(rpn_labels, [-1])
        # Ignore label=-1 (neither object nor background: IoU between 0.3 and 0.7):
        # drop those positions from both the scores and the labels, and reshape the
        # scores to [-1, 2] for the cross-entropy loss.
        # The scores must be gathered before the labels are overwritten; otherwise the
        # mask would be computed on the already-filtered labels and pick the wrong rows
        rpn_cls_output = tf.reshape(tf.gather(rpn_cls_output, tf.where(tf.not_equal(rpn_labels, -1))), [-1, 2])
        rpn_labels = tf.reshape(tf.gather(rpn_labels, tf.where(tf.not_equal(rpn_labels, -1))), [-1])
        # the labels arrive as float32 from py_function; cast them for the sparse cross entropy
        rpn_labels = tf.cast(rpn_labels, tf.int32)
        rpn_cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_output,
                                                                                          labels=rpn_labels))
        return rpn_cross_entropy


def rpn_bbox_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_inside_weights, rpn_outside_weights):
    """
    Calculate the Region Proposal Network bounding box loss. Measures how well
    the RPN is able to propose regions by the performance of its localization.
    lam/N_reg * sum_i(p_i^* * L_reg(t_i,t_i^*))
    lam: classification vs bbox loss balance parameter
    N_reg: Number of anchor locations (~2500)
    p_i^*: ground truth label for anchor (loss only for positive anchors)
    L_reg: smoothL1 loss
    t_i: Parameterized prediction of bounding box
    t_i^*: Parameterized ground truth of closest bounding box
    :param rpn_bbox_pred: shape (1, H, W, 4 * 9)
    :param rpn_bbox_targets:  (1, H, W, A * 4)
    :param rpn_inside_weights:
    :param rpn_outside_weights:
    :return:
    """
    with tf.variable_scope('rpn_bbox_loss'):
        # Transpose rpn_bbox_targets to the same shape as rpn_bbox_pred:
        # (1, A * 4, H, W) --> (1, H, W, A * 4); A * 4 is the paper's 4k box-regression output.
        # (Unnecessary here, since our targets are already laid out as NHWC.)
        # rpn_bbox_targets = tf.transpose(rpn_bbox_targets, [0, 2, 3, 1])
        # rpn_inside_weights = tf.transpose(rpn_inside_weights, [0, 2, 3, 1])
        # rpn_outside_weights = tf.transpose(rpn_outside_weights, [0, 2, 3, 1])

        # How far off was the prediction?
        # Only foreground anchors contribute to the box-refinement loss;
        # bbox_inside_weights and bbox_outside_weights implement this:
        # anchors that are neither foreground nor background have both weights set to 0.
        diff = tf.multiply(rpn_inside_weights, rpn_bbox_pred - rpn_bbox_targets)
        diff_sl1 = smooth_l1(diff, 3.0)

        # Only count loss for positive anchors. Make sure it's a sum.
        # The upstream comment says sum, but reduce_mean seems right here; with a plain
        # sum, wouldn't lambda fail to balance the two losses?
        rpn_bbox_reg = tf.reduce_mean(tf.multiply(rpn_outside_weights, diff_sl1))

        # Constant for weighting bounding box loss with classification loss
        rpn_bbox_reg = config.TRAIN_RPN_BBOX_LAMBDA * rpn_bbox_reg

    return rpn_bbox_reg


def fast_rcnn_cls_loss(fast_rcnn_cls_score, labels):
    '''
    Calculate the fast RCNN classifier loss. Measures how well the fast RCNN is
    able to classify objects from the RPN.

    Standard cross-entropy loss on logits
    '''
    with tf.variable_scope('fast_rcnn_cls_loss'):
        # Cross entropy error
        fast_rcnn_cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=tf.squeeze(fast_rcnn_cls_score), labels=labels))

    return fast_rcnn_cross_entropy


def fast_rcnn_bbox_loss(fast_rcnn_bbox_pred, bbox_targets, roi_inside_weights, roi_outside_weights):
    '''
    Calculate the fast RCNN bounding box refinement loss. Measures how well
    the fast RCNN is able to refine localization.
    lam/N_reg * sum_i(p_i^* * L_reg(t_i,t_i^*))
    lam: classification vs bbox loss balance parameter
    N_reg: Number of anchor locations (~2500)
    p_i^*: ground truth label for anchor (loss only for positive anchors)
    L_reg: smoothL1 loss
    t_i: Parameterized prediction of bounding box
    t_i^*: Parameterized ground truth of closest bounding box
    '''
    with tf.variable_scope('fast_rcnn_bbox_loss'):
        # How far off was the prediction?
        diff = tf.multiply(roi_inside_weights, fast_rcnn_bbox_pred - bbox_targets)
        diff_sL1 = smooth_l1(diff, 1.0)

        # Only count loss for positive anchors
        roi_bbox_reg = tf.reduce_mean(tf.reduce_sum(tf.multiply(roi_outside_weights, diff_sL1), axis=1))

        # Constant for weighting bounding box loss with classification loss
        roi_bbox_reg = config.TRAIN_RPN_BBOX_LAMBDA * roi_bbox_reg

    return roi_bbox_reg


def smooth_l1(x, sigma):
    '''
                      0.5 * (sigma * x)^2  if |x| < 1/sigma^2
      smoothL1(x) = {
                      |x| - 0.5/sigma^2    otherwise
    '''

    with tf.variable_scope('smooth_l1'):
        conditional = tf.less(tf.abs(x), 1 / sigma ** 2)
        close = 0.5 * (sigma * x) ** 2
        far = tf.abs(x) - 0.5 / sigma ** 2
        return tf.where(conditional, close, far)
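
A quick numeric check of the piecewise definition (sigma = 3 puts the quadratic/linear switch at |x| = 1/9):

import numpy as np

def smooth_l1_np(x, sigma):
    # NumPy mirror of the TF smooth_l1 above, for eyeballing values
    x = np.asarray(x, dtype=np.float64)
    quadratic = 0.5 * (sigma * x) ** 2
    linear = np.abs(x) - 0.5 / sigma ** 2
    return np.where(np.abs(x) < 1.0 / sigma ** 2, quadratic, linear)

print(smooth_l1_np([0.05, 0.2, 1.0], sigma=3.0))
# -> [0.01125 0.14444 0.94444]; only the first value falls in the quadratic region (|x| < 1/9)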

If I have misunderstood or gotten anything wrong, feel free to leave a comment.

 

 

 

 
