本篇博客将对_anchor_target_layer进行讲解
_anchor_target_layer:其实也是一个中间函数
注意代码注释
def _anchor_target_layer(self, rpn_cls_score, name):
    """Wrap the NumPy ``anchor_target_layer`` as a graph op and store its outputs.

    The four outputs of ``anchor_target_layer`` (labels, box targets, inside
    weights, outside weights) are produced through ``tf.py_func``, given
    static shapes, saved in ``self._anchor_targets`` and merged into
    ``self._score_summaries``.

    Args:
        rpn_cls_score: RPN class-score tensor forwarded to the py_func.
        name: variable-scope name under which the ops are created.

    Returns:
        The ``rpn_labels`` tensor after conversion to int32.
    """
    with tf.variable_scope(name):
        py_func_inputs = [
            rpn_cls_score,
            self._gt_boxes,
            self._im_info,
            self._feat_stride,
            self._anchors,
            self._num_anchors,
        ]
        labels, box_targets, inside_weights, outside_weights = tf.py_func(
            anchor_target_layer,
            py_func_inputs,
            [tf.float32] * 4,
            name="anchor_target")

        # py_func outputs carry no static shape, so declare it explicitly.
        labels.set_shape([1, 1, None, None])
        per_anchor_box_shape = [1, None, None, self._num_anchors * 4]
        for box_tensor in (box_targets, inside_weights, outside_weights):
            box_tensor.set_shape(per_anchor_box_shape)

        labels = tf.to_int32(labels, name="to_int32")

        # Keep the targets around for the later loss computation.
        self._anchor_targets['rpn_labels'] = labels
        self._anchor_targets['rpn_bbox_targets'] = box_targets
        self._anchor_targets['rpn_bbox_inside_weights'] = inside_weights
        self._anchor_targets['rpn_bbox_outside_weights'] = outside_weights

        # Register the same tensors for summaries/visualisation.
        self._score_summaries.update(self._anchor_targets)

    return labels
anchor_target_layer
注意代码注释
输入说明:
rpn_cls_score shape=(1, 60, 40, 18)
gt_boxes表示真实框和类别(?,5)
im_info表示高宽通道数
feat_stride=16 原图与特征图比例
all_anchors:特征图上每个点位对应的所有Anchor
num_anchors=9:每个点位的锚(Anchor)个数
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
    """Build RPN labels, box-regression targets and loss weights for all anchors.

    Same as the anchor target layer in original Fast/er RCNN.

    Args:
        rpn_cls_score: RPN class-score map; only ``shape[1:3]`` is read, as
            (height, width) of the feature map.
        gt_boxes: ground-truth boxes, one per row. ``_compute_targets``
            asserts 5 columns and uses the first 4 as box coordinates
            (the 5th is presumably the class label -- confirm with caller).
        im_info: image info; ``im_info[0]`` is used as the image height and
            ``im_info[1]`` as the image width when filtering anchors.
        _feat_stride: accepted for interface compatibility; not used here.
        all_anchors: (total_anchors, 4) array of anchors over the feature map.
        num_anchors: number of anchors per feature-map location (``A``).

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)``. Labels are reshaped to
        ``(1, 1, A * height, width)`` with 1 = positive, 0 = negative,
        -1 = don't care; the other three are reshaped to
        ``(1, height, width, A * 4)``.
    """
    A = num_anchors
    total_anchors = all_anchors.shape[0]

    # allow boxes to sit over the edge by a small amount
    _allowed_border = 0

    # map of shape (..., H, W): only H and W are needed
    height, width = rpn_cls_score.shape[1:3]

    # only keep anchors that lie completely inside the image
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes: overlaps[ex, gt] = IoU.
    # np.float64 replaces the np.float alias, which newer NumPy no longer
    # provides; both denote the same 64-bit float dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # for every anchor: index and IoU of its best-matching gt box
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # for every gt box: the best IoU any anchor reaches with it
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # np.where with only a condition returns the coordinates of the True
    # entries; [0] keeps the anchor (row) indices, so every anchor that ties
    # a gt box's best IoU is included, not just the first one.
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # classification labels are done; now the regression targets.
    # Each inside anchor is paired with its best-overlapping gt box.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # only the positive ones have regression targets
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # map up to original set of anchors: the arrays above only cover the
    # inside anchors, so scatter them back into total_anchors rows.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels: (total_anchors,) => (1, height, width, A) => (1, A, height, width)
    # => (1, 1, A * height, width)
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets: (total_anchors, 4) => (1, height, width, A * 4)
    bbox_targets = bbox_targets \
        .reshape((1, height, width, A * 4))
    rpn_bbox_targets = bbox_targets

    # bbox_inside_weights: (total_anchors, 4) => (1, height, width, A * 4)
    bbox_inside_weights = bbox_inside_weights \
        .reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights

    # bbox_outside_weights: (total_anchors, 4) => (1, height, width, A * 4)
    bbox_outside_weights = bbox_outside_weights \
        .reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
总结一下:整个函数就是为训练准备分类和回归所需的标签与权重。
代码详细解析
bbox_overlaps
这是一段Cython代码,但并不难,很容易看懂。
input就是锚和真实框
def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Compute the pairwise intersection-over-union of two sets of boxes.

    Box extents are treated as inclusive pixel coordinates, hence the +1
    when turning corner differences into widths and heights.

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    cdef unsigned int num_boxes = boxes.shape[0]
    cdef unsigned int num_queries = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros(
        (num_boxes, num_queries), dtype=DTYPE)
    cdef DTYPE_t inter_w, inter_h, query_area
    cdef DTYPE_t union_area
    cdef unsigned int q, b
    for q in range(num_queries):
        # Area of the query box, computed once per column.
        query_area = (
            (query_boxes[q, 2] - query_boxes[q, 0] + 1) *
            (query_boxes[q, 3] - query_boxes[q, 1] + 1)
        )
        for b in range(num_boxes):
            # Extent of the intersection along columns 0/2.
            inter_w = (
                min(boxes[b, 2], query_boxes[q, 2]) -
                max(boxes[b, 0], query_boxes[q, 0]) + 1
            )
            if not inter_w > 0:
                # No overlap on this axis: leave the entry at 0.
                continue
            # Extent of the intersection along columns 1/3.
            inter_h = (
                min(boxes[b, 3], query_boxes[q, 3]) -
                max(boxes[b, 1], query_boxes[q, 1]) + 1
            )
            if not inter_h > 0:
                continue
            # Union = area(box) + area(query) - intersection.
            union_area = float(
                (boxes[b, 2] - boxes[b, 0] + 1) *
                (boxes[b, 3] - boxes[b, 1] + 1) +
                query_area - inter_w * inter_h
            )
            overlaps[b, q] = inter_w * inter_h / union_area
    return overlaps
_compute_targets
ex_rois:筛选过后的锚(即调用处传入的anchors)
gt_rois:筛选过后的真实框
def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: (N, 4) array of example boxes (the anchors).
        gt_rois: (N, 5) array with the ground-truth row matched to each
            example box; only the first 4 columns are used.

    Returns:
        Result of ``bbox_transform`` on the box coordinates, as float32.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    # Drop the 5th column before computing the deltas.
    gt_coords = gt_rois[:, :4]
    deltas = bbox_transform(ex_rois, gt_coords)
    return deltas.astype(np.float32, copy=False)
def bbox_transform(ex_rois, gt_rois):
    """Return the regression deltas that map ``ex_rois`` onto ``gt_rois``.

    Both inputs hold one box per row with corner coordinates in columns
    0..3. Each box is converted to (width, height, centre x, centre y) and
    the deltas are:

        dx = (gt_ctr_x - ex_ctr_x) / ex_width
        dy = (gt_ctr_y - ex_ctr_y) / ex_height
        dw = log(gt_width / ex_width)
        dh = log(gt_height / ex_height)

    Returns:
        (N, 4) array with columns (dx, dy, dw, dh).
    """
    def _size_and_center(rois):
        # Corner coordinates are inclusive, hence the +1.0 on each extent.
        box_w = rois[:, 2] - rois[:, 0] + 1.0
        box_h = rois[:, 3] - rois[:, 1] + 1.0
        center_x = rois[:, 0] + 0.5 * box_w
        center_y = rois[:, 1] + 0.5 * box_h
        return box_w, box_h, center_x, center_y

    ex_w, ex_h, ex_cx, ex_cy = _size_and_center(ex_rois)
    gt_w, gt_h, gt_cx, gt_cy = _size_and_center(gt_rois)

    # Centre offsets are normalised by the example box size.
    delta_x = (gt_cx - ex_cx) / ex_w
    delta_y = (gt_cy - ex_cy) / ex_h
    # Size changes are expressed in log space.
    delta_w = np.log(gt_w / ex_w)
    delta_h = np.log(gt_h / ex_h)

    stacked = np.vstack((delta_x, delta_y, delta_w, delta_h))
    return stacked.transpose()
最终求出了偏移量。
让我们看最后一个函数_unmap
input:
data:表示各种各样的标签(筛选之后的数据)
count:原来锚的总个数(调用处传入total_anchors)
inds:现有标签在原来锚集合中的index(调用处传入inds_inside)
fill:用什么值填充被筛掉的位置
def _unmap(data, count, inds, fill=0):
""" Unmap a subset of item (data) back to the original set of items (of
size count) """
#labels=(N) total_anchors=21600 inds_inside=N fill=-1
if len(data.shape) == 1:
ret = np.empty((count,), dtype=np.float32)
ret.fill(fill)
ret[inds] = data
else:
ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
ret.fill(fill)
ret[inds, :] = data
return ret
映射回原来的数量上去了。
OK,完工!