功能:
根据GT boxes和topN proposals选择满足要求的128个proposals(包括fg和bg),然后加上物体类别标签和bbox的回归目标(只有在该类别的对应位置上才会有位置信息),并计算权重weights。(采样之前会先把GT boxes并入候选rois,因此这128个proposals中可能包含GT box。)
输入:
bottom[0]: rpn_rois,从proposal_layer提取到的proposals
bottom[1]: gt_boxes: GroundTruth boxes
输出:
top[0]: 'rois':每行为(batch索引, x1, y1, x2, y2),即所选roi的左上角和右下角坐标,batch索引恒为0
top[1]: 'labels':所有提取出的roi的标签,bg = 0
top[2]: 'bbox_targets':所有roi相对于与其有最大IOU的GTboxes的偏移量,是一个[4*classes]的vector,偏移量存放在对应label位置
top[3]: 'bbox_inside_weights':前景roi在其类别对应的4个位置上取cfg.TRAIN.BBOX_INSIDE_WEIGHTS,其余位置为0
top[4]: 'bbox_outside_weights':由(bbox_inside_weights > 0)得到,即inside权重大于0的位置为1,其余位置为0
源码:
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------
import caffe
import yaml
import numpy as np
import numpy.random as npr
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
# When True, ProposalTargetLayer.forward prints foreground/background
# sampling counts for every batch.
DEBUG = False
class ProposalTargetLayer(caffe.Layer):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.

    Bottoms:
        bottom[0]: proposal RoIs, rows of (0, x1, y1, x2, y2).
        bottom[1]: ground-truth boxes, rows of (x1, y1, x2, y2, label).

    Tops:
        top[0]: sampled RoIs, rows of (0, x1, y1, x2, y2).
        top[1]: classification labels; background RoIs are labelled 0.
        top[2]: bbox regression targets with 4 * num_classes columns.
        top[3]: bbox_inside_weights, same shape as top[2].
        top[4]: bbox_outside_weights, 1.0 wherever the inside weight is > 0.
    """

    def setup(self, bottom, top):
        """Read ``num_classes`` from the layer parameters and give every top a
        placeholder shape; the real shapes are assigned in forward()."""
        # safe_load parses the plain key/value parameter string without the
        # arbitrary object construction of yaml.load, and does not need an
        # explicit Loader argument.
        layer_params = yaml.safe_load(self.param_str_)
        # Total number of classes.
        self._num_classes = layer_params['num_classes']

        # Running statistics read by the DEBUG report in forward(); they must
        # exist before the first forward pass.
        self._count = 0
        self._fg_num = 0
        self._bg_num = 0

        # sampled rois (0, x1, y1, x2, y2)
        top[0].reshape(1, 5)
        # labels
        top[1].reshape(1, 1)
        # bbox_targets
        top[2].reshape(1, self._num_classes * 4)
        # bbox_inside_weights
        top[3].reshape(1, self._num_classes * 4)
        # bbox_outside_weights
        top[4].reshape(1, self._num_classes * 4)

    def forward(self, bottom, top):
        """Sample RoIs from proposals plus ground truth and fill all tops."""
        # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
        # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
        all_rois = bottom[0].data
        # GT boxes (x1, y1, x2, y2, label)
        # TODO(rbg): it's annoying that sometimes I have extra info before
        # and other times after box coordinates -- normalize to one format
        gt_boxes = bottom[1].data

        # Include ground-truth boxes in the set of candidate rois.
        # The label column of gt_boxes is dropped and a zero column is
        # prepended, so the appended rows use the same five-column layout as
        # the proposals.
        zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        all_rois = np.vstack(
            (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
        )

        # Sanity check: single batch only
        assert np.all(all_rois[:, 0] == 0), \
            'Only single item batches are supported'

        num_images = 1
        # Floor division keeps the per-image RoI count an integer on both
        # Python 2 and Python 3.
        rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
        # np.round returns a float; the value is later used as a sample size
        # and a slice bound, both of which must be integers.
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

        # Sample rois with classification labels and bounding box regression
        # targets
        labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
            all_rois, gt_boxes, fg_rois_per_image,
            rois_per_image, self._num_classes)

        if DEBUG:
            print('num fg: {}'.format((labels > 0).sum()))
            print('num bg: {}'.format((labels == 0).sum()))
            self._count += 1
            self._fg_num += (labels > 0).sum()
            self._bg_num += (labels == 0).sum()
            print('num fg avg: {}'.format(self._fg_num / self._count))
            print('num bg avg: {}'.format(self._bg_num / self._count))
            # Avoid a division by zero when no background RoI has been
            # sampled so far.
            if self._bg_num:
                ratio = float(self._fg_num) / float(self._bg_num)
            else:
                ratio = float('inf')
            print('ratio: {:.3f}'.format(ratio))

        # sampled rois
        top[0].reshape(*rois.shape)
        top[0].data[...] = rois

        # classification labels
        top[1].reshape(*labels.shape)
        top[1].data[...] = labels

        # bbox_targets
        top[2].reshape(*bbox_targets.shape)
        top[2].data[...] = bbox_targets

        # bbox_inside_weights
        top[3].reshape(*bbox_inside_weights.shape)
        top[3].data[...] = bbox_inside_weights

        # bbox_outside_weights: 1.0 where the inside weight is positive
        top[4].reshape(*bbox_inside_weights.shape)
        top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32)

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass
#计算bbox_target_data向量
def _get_bbox_regression_labels(bbox_target_data, num_classes):
"""Bounding-box regression targets (bbox_target_data) are stored in a
compact form N x (class, tx, ty, tw, th)
This function expands those targets into the 4-of-4*K representation used
by the network (i.e. only one class has non-zero targets).
Returns:
bbox_target (ndarray): N x 4K blob of regression targets
bbox_inside_weights (ndarray): N x 4K blob of loss weights
"""
clss = bbox_target_data[:, 0] #表示一共有clss 个 bbox。
bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)#4 * num_classes = 4*21列
bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
inds = np.where(clss > 0)[0] #只要fg的
for ind in inds: #只在该类别对应的那4个位置放值
cls = clss[ind]
start = 4 * cls
end = start + 4
bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS#前景的inside_weights =1
return bbox_targets, bbox_inside_weights
# Pair every label with the regression offsets towards its ground-truth box.
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois (ndarray): N x 4 sampled RoI coordinates.
        gt_rois (ndarray): N x 4 ground-truth box matched to each RoI.
        labels (ndarray): length-N class labels.

    Returns:
        ndarray: float32 array whose first column holds the labels and whose
        remaining columns hold the output of bbox_transform (optionally
        normalized by the configured means and stds).
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    # Offsets between each RoI and its ground-truth box.
    deltas = bbox_transform(ex_rois, gt_rois)

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        deltas = (deltas - means) / stds

    label_column = labels[:, np.newaxis]
    stacked = np.hstack((label_column, deltas))
    return stacked.astype(np.float32, copy=False)
# Sample RoIs from the candidate RoIs of a single image.
def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        all_rois (ndarray): candidate RoIs, rows of (0, x1, y1, x2, y2).
        gt_boxes (ndarray): ground-truth boxes, rows of
            (x1, y1, x2, y2, label).
        fg_rois_per_image: maximum number of foreground RoIs to sample.
        rois_per_image: total number of RoIs to sample.
        num_classes (int): number of classes, forwarded to
            _get_bbox_regression_labels.

    Returns:
        labels (ndarray): label per sampled RoI, 0 for background.
        rois (ndarray): the sampled rows of ``all_rois``.
        bbox_targets (ndarray): N x 4K regression targets.
        bbox_inside_weights (ndarray): N x 4K loss weights.
    """
    # overlaps: (rois x gt_boxes) IoU matrix.
    # np.float64 is used because the np.float alias no longer exists in
    # current NumPy; it referred to the same 64-bit float type.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    # Index of the ground-truth box with the highest IoU for each RoI.
    gt_assignment = overlaps.argmax(axis=1)
    # The highest IoU itself.
    max_overlaps = overlaps.max(axis=1)
    # Label of the best-matching ground-truth box.
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs. The count is forced to int because the caller may pass
    # a float (e.g. the result of np.round) and it is used below as a sample
    # size and as a slice bound.
    fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = int(min(bg_rois_per_this_image, bg_inds.size))
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg); foreground first.
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    rois = all_rois[keep_inds]

    # Compact targets: one row of (label, four offsets) per sampled RoI.
    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, bbox_targets, bbox_inside_weights