# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from model.config import cfg
import numpy as np
import numpy.random as npr
from utils.bbox import bbox_overlaps
from model.bbox_transform import bbox_transform
import torch
# Annotated walkthrough of one of the key Faster R-CNN files: anchor_target_layer.py
# https://github.com/ruotianluo/pytorch-faster-rcnn
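"""
Everything in this file serves training; the operations applied after the forward
pass (top-N selection, NMS, ...) live in proposal_layer.py.
Because it serves training, it assigns labels and also produces the data needed to
compute the loss, such as rpn_bbox_inside_weights.
rpn_bbox_inside_weights is the tensor that makes the loss count box regression only
for foreground anchors.
It is used roughly as (delta - gt) * rpn_bbox_inside_weights (element-wise), which
yields the box loss.
anchor_target_layer is the entry point; at that point there are about 20,000 raw anchors.
Processing steps:
--> First assign labels: start from all anchors (all_anchors) and drop the ones that
    cross the image border.
--> Only anchors inside the border are used. Their indices are inds_inside, the boxes
    are anchors, and the labels start at -1; all three have the same number of rows.
--> Anchors whose overlap reaches the positive threshold (0.7) get label=1; anchors
    whose overlap is below the negative threshold (0.3) get label=0.
--> The anchor with the largest overlap for each gt box is also given label=1.
    gt_argmax_overlaps holds, for each gt box in turn, the index of that anchor.
--> Sample 128 anchors each from foreground (label 1) and background (label 0); the
    rest are reset to label=-1.
--> The inside weights bbox_inside_weights are coefficients multiplied onto the box
    coordinates, formatted as [[0,0,0,0] or [1,1,1,1], ...]; there is initially one row
    per inside anchor, and later one row per anchor in all_anchors.
    The outside weights bbox_outside_weights distinguish positives from negatives. The
    format is the same, but the non-zero entries are 1/N_sample. Positive and negative
    weights can be configured (as is done in the Focal Loss paper).
--> Everything above is computed on the inside anchors only.
    The _unmap function then expands labels, bbox_targets, bbox_inside_weights and
    bbox_outside_weights to the shape of the full anchor set. For anchors outside the
    border, labels are filled with -1 and the other three with 0 (the number of
    columns does not change).
--> There is then one entry per anchor in all_anchors. Their ordering can be read off
    labels.reshape((1, height, width, A)): following the feature map, the A (k=9)
    anchors of one location form a group, groups are laid out along a row, and a new
    row starts when one row is complete. That explains height, width, A.
    The leading 1 is presumably one image.
    Then .transpose(0, 3, 1, 2): think of k as a channel (only as a mental model), i.e.
    the W x H plane of the first channel is filled first, then the second channel.
    Finally .reshape((1, 1, A * height, width)) should not change the element order;
    only the grouping of the brackets changes.
    The other three return values are not transposed; they keep the height-width-A
    ordering with 4 values per anchor:
    .reshape((1, height, width, A * 4))
"""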
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride,
                        all_anchors, num_anchors):
    """Build RPN training targets; same as the layer in original Fast/er RCNN.

    Steps:
      1. Keep only the anchors that lie inside the image.
      2. Label them: 1 = positive, 0 = negative, -1 = don't care.
      3. Randomly subsample positives and negatives down to the configured
         batch size (the surplus is reset to -1).
      4. Compute box-regression targets plus inside/outside loss weights.
      5. Scatter everything back onto the full anchor set and reshape.

    Args:
        rpn_cls_score: array-like; only ``shape[1:3]`` is read, as
            ``(height, width)`` of the feature map.
        gt_boxes: NumPy array of ground-truth boxes. ``_compute_targets``
            asserts 5 columns and uses the first four as box coordinates.
        im_info: indexable; ``im_info[0]`` bounds anchor column 3 (height)
            and ``im_info[1]`` bounds anchor column 2 (width).
        _feat_stride: not used by this function.
        all_anchors: ``(total_anchors, 4)`` array of anchor boxes.
        num_anchors: number of anchors per feature-map location (``A``).

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)`` of float32 arrays with shapes
        ``(1, 1, A * height, width)`` for the labels and
        ``(1, height, width, A * 4)`` for the other three.
    """
    A = num_anchors  # anchors per feature-map location
    total_anchors = all_anchors.shape[0]  # anchors over the whole feature map

    # Allow boxes to sit over the image edge by this many pixels (0 = none).
    _allowed_border = 0

    # Height/width are read from dims 1 and 2 of the score map.
    # NOTE(review): this implies an (N, H, W, C)-style layout - confirm
    # against the caller.
    height, width = rpn_cls_score.shape[1:3]

    # Only keep anchors inside the image.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]
    anchors = all_anchors[inds_inside, :]

    # Label: 1 is positive, 0 is negative, -1 is don't care.
    # Every inside anchor starts as "don't care".
    labels = np.empty((len(inds_inside), ), dtype=np.float32)
    labels.fill(-1)

    # Overlaps between the anchors and the gt boxes; indexed below as
    # (anchor, gt). np.float64 replaces the np.float alias, which was
    # removed in NumPy 1.24 and denoted the same dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))

    # For each anchor: index of its best gt box and that overlap value.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # For each gt box: its best overlap value, then every anchor that ties
    # with that best value (np.where keeps all ties, argmax only the first).
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    # NOTE(review): if a gt box's best overlap is 0, this equality matches
    # every anchor whose overlap with that gt is 0, and all of them are
    # labelled positive below. Left unchanged; confirm whether intended.
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Assign bg labels first so that positive labels can clobber them.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor(s) with the highest overlap.
    labels[gt_argmax_overlaps] = 1

    # fg label: overlap at or above the positive threshold.
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # Assign bg labels last so that negative labels can clobber positives.
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # Subsample positive labels if we have too many; the surplus, chosen
    # at random without replacement, becomes "don't care".
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # Subsample negative labels if we have too many; negatives fill
    # whatever part of the batch the positives left free.
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets: each inside anchor against its best-matching gt.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # Inside weights: only the positive anchors have regression targets,
    # so every other row stays zero.
    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    bbox_inside_weights[labels == 1, :] = np.array(
        cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    # Outside weights: per-example weighting of positives and negatives.
    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # Uniform weighting of examples (given non-uniform sampling).
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (
            cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1))
        negative_weights = (
            (1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # Map up to the original set of anchors. Anchors outside the image get
    # label -1 and all-zero targets/weights.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(
        bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(
        bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels: (total,) -> (1, H, W, A) -> (1, A, H, W) -> (1, 1, A * H, W)
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # The three box arrays are reshaped without a transpose:
    # (total, 4) -> (1, H, W, A * 4).
    bbox_targets = bbox_targets \
        .reshape((1, height, width, A * 4))
    rpn_bbox_targets = bbox_targets

    bbox_inside_weights = bbox_inside_weights \
        .reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights

    bbox_outside_weights = bbox_outside_weights \
        .reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
def _unmap(data, count, inds, fill=0):
    """Unmap a subset of items (data) back to the original set of items.

    Args:
        data: array holding one row (or one scalar, when 1-D) per entry
            of ``inds``.
        count: number of items in the original, full set.
        inds: positions in the full set that ``data`` belongs to.
        fill: value written to every position not listed in ``inds``.

    Returns:
        float32 array of shape ``(count,) + data.shape[1:]`` holding
        ``data`` at ``inds`` and ``fill`` everywhere else.
    """
    # data.shape[1:] is empty for 1-D input, so a single expression gives
    # (count,) for label vectors and (count, 4) for box arrays.
    full_shape = (count, ) + tuple(data.shape[1:])
    ret = np.full(full_shape, fill, dtype=np.float32)
    if len(data.shape) == 1:
        # e.g. _unmap(labels, total_anchors, inds_inside, fill=-1)
        ret[inds] = data
    else:
        # e.g. _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
        ret[inds, :] = data
    return ret
def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    This function only validates the array shapes and converts between
    NumPy and torch; the targets themselves come from ``bbox_transform``.

    Args:
        ex_rois: NumPy array with 4 columns, one row per example box.
        gt_rois: NumPy array with 5 columns and as many rows as
            ``ex_rois``; only the first four columns are used.

    Returns:
        NumPy array with the result of ``bbox_transform`` for each pair.
    """
    # Row i of gt_rois must be the box matched to row i of ex_rois.
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    # The fifth gt column is dropped below (presumably the class label -
    # TODO confirm against the data loader).
    assert gt_rois.shape[1] == 5

    example_boxes = torch.from_numpy(ex_rois)
    matched_gt_boxes = torch.from_numpy(gt_rois[:, :4])
    targets = bbox_transform(example_boxes, matched_gt_boxes)
    return targets.numpy()
# Faster R-CNN code walkthrough: anchor_target_layer.py
# (Blog page footer: "latest recommended article published 2024-07-20 17:36:52")