RegionProposalNetwork
在Faster RCNN中,第一阶段由RegionProposalNetwork生成anchors,并通过筛选得到proposals。代码中对每一部分的过程都做了详细注释。
import torch
import torchvision
from torch import nn, Tensor
from torch.nn import functional as F
import math
from typing import Dict
def smooth_l1_loss(input, target, beta: float = 1. / 9, size_average: bool = True):
    """Smooth-L1 (Huber-like) loss with a configurable transition point.

    Element-wise absolute errors below ``beta`` are penalised quadratically
    (``0.5 * err**2 / beta``); all others linearly (``err - 0.5 * beta``).

    Args:
        input: predicted values.
        target: ground-truth values, broadcastable against ``input``.
        beta: error magnitude at which the loss switches from quadratic to
            linear.
        size_average: return the mean over all elements when True, otherwise
            the sum.

    Returns:
        A scalar tensor holding the reduced loss.
    """
    abs_err = (input - target).abs()
    quadratic = 0.5 * abs_err ** 2 / beta
    linear = abs_err - 0.5 * beta
    per_element = torch.where(abs_err < beta, quadratic, linear)
    return per_element.mean() if size_average else per_element.sum()
def nms(boxes, scores, iou_threshold):
    # type: (Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    This is a thin wrapper that forwards its arguments unchanged to
    ``torchvision.ops.nms``.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes to perform NMS on. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    iou_threshold : float
        discards all overlapping
        boxes with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices
        of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    return torchvision.ops.nms(boxes, scores, iou_threshold)
class RPNHead(nn.Module):
    """RPN head: per-anchor objectness logits and box-regression deltas.

    A shared 3x3 convolution (followed by ReLU) feeds two 1x1 convolutions:
    one emitting ``num_anchors`` objectness logits per location (foreground
    vs. background) and one emitting ``num_anchors * 4`` regression values.

    Args:
        in_channels: channel count of every incoming feature map.
        num_anchors: number of anchors predicted at each spatial location.
    """

    def __init__(self, in_channels, num_anchors):
        super().__init__()
        # Shared 3x3 conv applied to every feature level.
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        # Objectness score per anchor.
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        # Four box-regression parameters per anchor.
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

        # Weights ~ N(0, 0.01), biases zero, for every conv in the head.
        for module in (self.conv, self.cls_logits, self.bbox_pred):
            torch.nn.init.normal_(module.weight, std=0.01)
            torch.nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Run the head on each feature map.

        Args:
            x: iterable of feature-map tensors (one per pyramid level).

        Returns:
            ``(logits, bbox_reg)``: two lists, each with one tensor per input
            feature map, in input order.
        """
        per_level = []
        for feature in x:
            hidden = F.relu(self.conv(feature))
            per_level.append((self.cls_logits(hidden), self.bbox_pred(hidden)))
        logits = [cls_out for cls_out, _ in per_level]
        bbox_reg = [reg_out for _, reg_out in per_level]
        return logits, bbox_reg
class AnchorsGenerator(nn.Module):
    """Generate anchor boxes, in input-image coordinates, for each feature map.

    ``sizes`` and ``aspect_ratios`` are consumed pairwise, one entry per
    feature map, and each entry must itself be a sequence (the anchor sizes
    and the aspect ratios used on that level).  Flat inputs such as the
    defaults are normalised in ``__init__``: every scalar size becomes its own
    one-element tuple and the flat ratio tuple is repeated for every level, so
    ``AnchorsGenerator()`` yields three levels with three anchors each.

    Args:
        sizes: anchor sizes, either flat ``(128, 256, 512)`` or nested
            ``((32,), (64,), ...)``.
        aspect_ratios: height/width ratios, either flat ``(0.5, 1.0, 2.0)`` or
            nested with one tuple per level.

    Raises:
        ValueError: if, after normalisation, the number of size groups and
            ratio groups differ.
    """

    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()

        # The rest of the class zips sizes with aspect_ratios and calls len()
        # on every entry, so both must be sequences of sequences.
        if sizes and not isinstance(sizes[0], (list, tuple)):
            sizes = tuple((s,) for s in sizes)
        if aspect_ratios and not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)
        if len(sizes) != len(aspect_ratios):
            raise ValueError(
                "sizes and aspect_ratios must describe the same number of feature maps, "
                "got {} and {}".format(len(sizes), len(aspect_ratios))
            )

        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        # Per-level anchor templates; built lazily by set_cell_anchors.
        self.cell_anchors = None
        # Cache of anchors generated on the original image, keyed by
        # grid sizes + strides.
        self._cache = {}

    def generate_anchors(self, scale, aspect_ratios, dtype=torch.float32, device='cpu'):
        """Build the zero-centred anchor templates for one feature map.

        Args:
            scale: sequence of anchor sizes for this level.
            aspect_ratios: sequence of height/width ratios for this level.
            dtype: dtype of the returned tensor.
            device: device of the returned tensor.

        Returns:
            Tensor of shape ``[len(aspect_ratios) * len(scale), 4]`` holding
            rounded ``(x1, y1, x2, y2)`` templates centred on ``(0, 0)``.
        """
        scale = torch.as_tensor(scale, dtype=dtype, device=device)
        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
        h_ratios = torch.sqrt(aspect_ratios)
        w_ratios = 1.0 / h_ratios

        # Every ratio is combined with every scale to get anchor widths/heights.
        ws = (w_ratios[:, None] * scale[None, :]).view(-1)
        hs = (h_ratios[:, None] * scale[None, :]).view(-1)

        # Corners of boxes centred on the origin, hence the division by two.
        base_anchor = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
        return base_anchor.round()

    def set_cell_anchors(self, dtype, device):
        """Create the per-level anchor templates unless they already exist on ``device``."""
        if self.cell_anchors is not None:
            # All templates are assumed to live on the same device, so checking
            # the first one is enough.
            if self.cell_anchors[0].device == device:
                return

        # One template tensor per feature map, in feature-map order.
        self.cell_anchors = [
            self.generate_anchors(sizes, aspect_ratios, dtype, device)
            for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)
        ]

    def cached_grid_anchors(self, grid_sizes, strides):
        """Return ``grid_anchors(grid_sizes, strides)``, memoised on the string form of both arguments."""
        key = str(grid_sizes) + str(strides)
        if key in self._cache:
            return self._cache[key]
        anchors = self.grid_anchors(grid_sizes, strides)
        self._cache[key] = anchors
        return anchors

    def grid_anchors(self, grid_sizes, strides):
        """Tile the anchor templates over every cell of every feature map.

        Args:
            grid_sizes: per-level ``(height, width)`` of the feature map.
            strides: per-level ``(stride_height, stride_width)``, i.e. how many
                input pixels one feature-map step covers.

        Returns:
            List with one ``[grid_h * grid_w * anchors_per_cell, 4]`` tensor
            per feature map.
        """
        anchors = []
        cell_anchors = self.cell_anchors

        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
            grid_height, grid_width = size
            stride_height, stride_width = stride
            device = base_anchors.device

            # x / y coordinates on the input image of every feature-map cell.
            shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
            shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height

            # Row-major grid of cell positions, flattened to vectors.
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)

            # [grid_h * grid_w, 4] offsets applied to (x1, y1, x2, y2).
            shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)

            # Broadcast: every cell offset + every template ->
            # [cells, anchors_per_cell, 4], then flatten to [cells * anchors_per_cell, 4].
            shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
            anchors.append(shifts_anchor.reshape(-1, 4))

        return anchors

    def num_anchors_per_location(self):
        """Return, per feature map, how many anchors are placed at each cell."""
        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]

    def forward(self, image_list, feature_maps):
        """Generate the anchors for every image in the batch.

        Args:
            image_list: object exposing ``tensors`` (the batched image tensor,
                whose last two dims are read as height/width) and
                ``image_sizes`` (one entry per image).
            feature_maps: list of feature-map tensors; the last two dims of
                each are read as the grid size.

        Returns:
            List with one ``[total_anchors, 4]`` tensor per image; every image
            receives the same anchors (concatenated over all feature maps).
        """
        # (h, w) of every feature map.
        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
        # (h, w) of the padded batch tensor.
        image_size = image_list.tensors.shape[-2:]
        dtype, device = feature_maps[0].dtype, feature_maps[0].device

        # Input pixels covered by one step on each feature map
        # (image size // feature-map size).
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]

        self.set_cell_anchors(dtype, device)
        anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)

        # Anchors do not depend on image content: each image gets its own
        # concatenation of the per-level anchors.
        anchors = [torch.cat(anchors_over_all_feature_maps) for _ in image_list.image_sizes]

        # The cache is emptied on every call, so it never grows across batches.
        self._cache.clear()
        return anchors
def box_area(boxes):
    """Return the area of each box.

    Arguments:
        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format.

    Returns:
        area (Tensor[N]): ``(x2 - x1) * (y2 - y1)`` for every box.
    """
    x1, y1 = boxes[:, 0], boxes[:, 1]
    x2, y2 = boxes[:, 2], boxes[:, 3]
    widths = x2 - x1
    heights = y2 - y1
    return widths * heights
def box_iou(boxes1, boxes2):
    """Pairwise intersection-over-union between two sets of boxes.

    Arguments:
        boxes1 (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format.
        boxes2 (Tensor[M, 4]): boxes in (x1, y1, x2, y2) format.

    Returns:
        iou (Tensor[N, M]): IoU of every box in ``boxes1`` with every box in
        ``boxes2``.
    """
    areas_a = box_area(boxes1)
    areas_b = box_area(boxes2)

    # Broadcasting [N, 1, 2] against [M, 2] gives the [N, M, 2] corners of
    # every pairwise intersection rectangle.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])

    # Negative extents mean the boxes do not overlap; clamp them to zero.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    intersection = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]

    union = areas_a[:, None] + areas_b - intersection
    return intersection / union
def permute_and_flatten(layer, N, A, C, H, W):
    """Reorder a per-level prediction map to ``[N, H * W * anchors, C]``.

    Args:
        layer: prediction tensor viewable as ``[N, -1, C, H, W]``
            (objectness or box-regression output of one feature map).
        N: batch size.
        A: number of anchors per location (not used in the computation; the
            anchor dimension is inferred).
        C: values per anchor (number of classes, or 4 for box regression).
        H: feature-map height.
        W: feature-map width.

    Returns:
        Tensor of shape ``[N, H * W * anchors, C]``.
    """
    # Split the channel dimension into (anchors, C).
    grouped = layer.view(N, -1, C, H, W)
    # Move the spatial dims ahead of the anchor dim: [N, H, W, anchors, C].
    spatial_first = grouped.permute(0, 3, 4, 1, 2)
    # permute() leaves the tensor non-contiguous, so reshape (not view) is
    # needed to collapse H, W and anchors into a single dimension.
    return spatial_first.reshape(N, -1, C)
def concat_box_pred_layers(box_cls, box_regression):
    """Flatten and concatenate the per-level RPN outputs.

    Args:
        box_cls: list of objectness tensors, one ``[N, A * C, H, W]`` tensor
            per feature map.
        box_regression: list of regression tensors, one ``[N, A * 4, H, W]``
            tensor per feature map.

    Returns:
        ``(box_cls, box_regression)`` where ``box_cls`` has all leading
        dimensions merged (``[-1, C]``) and ``box_regression`` has shape
        ``[-1, 4]``; rows are ordered image by image, then level by level.
    """
    cls_chunks = []
    reg_chunks = []

    for cls_level, reg_level in zip(box_cls, box_regression):
        batch, anchors_x_classes, height, width = cls_level.shape
        # The regression map has 4 channels per anchor, which gives the
        # anchor count; the class count follows from the objectness channels.
        num_anchors = reg_level.shape[1] // 4
        num_classes = anchors_x_classes // num_anchors

        cls_chunks.append(
            permute_and_flatten(cls_level, batch, num_anchors, num_classes, height, width)
        )
        reg_chunks.append(
            permute_and_flatten(reg_level, batch, num_anchors, 4, height, width)
        )

    # Concatenate the levels along the anchor dimension, then merge the batch
    # dimension into it.
    box_cls = torch.cat(cls_chunks, dim=1).flatten(0, -2)
    box_regression = torch.cat(reg_chunks, dim=1).reshape(-1, 4)
    return box_cls, box_regression
def clip_boxes_to_image(boxes, size):
    """Clamp box coordinates so every box lies inside the image.

    Args:
        boxes: tensor of shape ``[..., 4]`` in (x1, y1, x2, y2) format.
        size: ``(height, width)`` of the image.

    Returns:
        Tensor with the same shape and the same (x1, y1, x2, y2) column order
        as ``boxes``, with x clamped to ``[0, width]`` and y to ``[0, height]``.
    """
    boxes_x = boxes[..., 0::2]  # x1, x2
    boxes_y = boxes[..., 1::2]  # y1, y2
    height, width = size

    boxes_x = boxes_x.clamp(min=0, max=width)
    boxes_y = boxes_y.clamp(min=0, max=height)

    # Interleave x and y again.  Concatenating the two halves would yield
    # (x1, x2, y1, y2) and silently corrupt the box layout; stacking on a new
    # trailing axis and reshaping restores (x1, y1, x2, y2).
    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=boxes.dim())
    return clipped_boxes.reshape(boxes.shape)
def remove_small_boxes(boxes, min_size):
    """Return the indices of boxes whose width and height are both >= ``min_size``.

    Args:
        boxes: tensor ``[N, 4]`` in (x1, y1, x2, y2) format.
        min_size: minimum allowed side length.

    Returns:
        1-D int64 tensor with the indices of the boxes to keep.
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    large_enough = (widths >= min_size) & (heights >= min_size)
    # nonzero() yields an [K, 1] index tensor for a 1-D mask; drop that axis.
    return torch.nonzero(large_enough).squeeze(1)
def batched_nms(boxes, scores, level_idxs, iou_threshold):
    """Run NMS independently within each group given by ``level_idxs``.

    Args:
        boxes: tensor ``[N, 4]`` in (x1, y1, x2, y2) format.
        scores: tensor ``[N]`` with one score per box.
        level_idxs: tensor ``[N]`` with the group (feature-map level) index of
            each box.
        iou_threshold: IoU threshold forwarded to ``nms``.

    Returns:
        int64 tensor with the indices of the kept boxes, as returned by
        ``nms``; an empty int64 tensor when there are no boxes.
    """
    if boxes.numel() == 0:
        return torch.empty((0, ), dtype=torch.int64, device=boxes.device)

    # Shift each group by a multiple of (largest coordinate + 1) so boxes from
    # different groups can never overlap; a single NMS pass then behaves like
    # one pass per group.
    shift_unit = boxes.max() + 1
    per_box_shift = level_idxs.to(boxes) * shift_unit
    shifted_boxes = boxes + per_box_shift[:, None]
    return nms(shifted_boxes, scores, iou_threshold)
def encode_boxes(reference_boxes, anchors, weights):
    """Encode ground-truth boxes as regression targets relative to anchors.

    The targets use the (dx, dy, dw, dh) parameterisation:

        dx = wx * (gt_cx - anchor_cx) / anchor_w
        dy = wy * (gt_cy - anchor_cy) / anchor_h
        dw = ww * log(gt_w / anchor_w)
        dh = wh * log(gt_h / anchor_h)

    This is the exact inverse of ``box_Coder.decode_single``, which reads
    columns 0..3 of the regression output as dx, dy, dw, dh.  Returning any
    other layout (e.g. corner-style ``dx -/+ 0.5 * dw``) would train the
    network on targets that the decoder then misinterprets.

    Args:
        reference_boxes: tensor ``[N, 4]``, the ground-truth box matched to
            each anchor, in (x1, y1, x2, y2) format.
        anchors: tensor ``[N, 4]`` of anchors in (x1, y1, x2, y2) format.
        weights: indexable of four scale factors ``(wx, wy, ww, wh)``.

    Returns:
        Tensor ``[N, 4]`` with columns (dx, dy, dw, dh).
    """
    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    # Keep a trailing axis of size 1 so the four targets can be concatenated
    # column-wise at the end.
    anchors_x1 = anchors[:, 0].unsqueeze(1)
    anchors_y1 = anchors[:, 1].unsqueeze(1)
    anchors_x2 = anchors[:, 2].unsqueeze(1)
    anchors_y2 = anchors[:, 3].unsqueeze(1)

    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)

    # Anchor width / height / centre.
    ex_width = anchors_x2 - anchors_x1
    ex_height = anchors_y2 - anchors_y1
    ex_ctr_x = anchors_x1 + 0.5 * ex_width
    ex_ctr_y = anchors_y1 + 0.5 * ex_height

    # Ground-truth width / height / centre.
    gt_widths = reference_boxes_x2 - reference_boxes_x1
    gt_heights = reference_boxes_y2 - reference_boxes_y1
    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights

    # Centre offsets are normalised by the anchor size; sizes are log-ratios.
    target_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_width
    target_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_height
    target_dw = ww * torch.log(gt_widths / ex_width)
    target_dh = wh * torch.log(gt_heights / ex_height)

    targets = torch.cat((target_dx, target_dy, target_dw, target_dh), dim=1)
    return targets
class box_Coder(object):
    """Convert between boxes and regression deltas relative to anchors.

    Args:
        weights: four scale factors ``(wx, wy, ww, wh)`` applied to the
            encoded dx, dy, dw, dh.
        bbox_xform_clip: upper bound applied to dw / dh before
            exponentiation when decoding.
    """

    def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
        self.weights = weights
        self.bbox_xform_clip = bbox_xform_clip

    def encode(self, reference_boxes, anchores):
        """Compute the regression targets for a whole batch.

        Args:
            reference_boxes: list with, per image, the ground-truth box
                matched to each anchor.
            anchores: list with, per image, the anchor boxes.

        Returns:
            Tuple of per-image target tensors, split back using the number of
            reference boxes of each image.
        """
        per_image_counts = [len(b) for b in reference_boxes]
        # Encode the whole batch at once, then split it per image again.
        stacked_refs = torch.cat(reference_boxes, dim=0)
        stacked_anchors = torch.cat(anchores, dim=0)
        stacked_targets = self.encode_single(stacked_refs, stacked_anchors)
        return stacked_targets.split(per_image_counts, 0)

    def encode_single(self, reference_boxes, anchors):
        """Encode one set of boxes against its anchors via ``encode_boxes``."""
        weights = torch.as_tensor(
            self.weights,
            dtype=reference_boxes.dtype,
            device=reference_boxes.device,
        )
        return encode_boxes(reference_boxes, anchors, weights)

    def decode_single(self, rel_codes, boxes):
        """Apply predicted deltas to anchors.

        Args:
            rel_codes: predicted regression parameters; columns are read with
                stride 4 as dx, dy, dw, dh.
            boxes: anchors ``[N, 4]`` in (x1, y1, x2, y2) format.

        Returns:
            Tensor ``[N, 4 * k]`` of decoded (x1, y1, x2, y2) boxes, where k
            is the number of delta groups per row.
        """
        boxes = boxes.to(rel_codes.dtype)

        # Anchor geometry in centre/size form.
        anchor_w = boxes[:, 2] - boxes[:, 0]
        anchor_h = boxes[:, 3] - boxes[:, 1]
        anchor_cx = boxes[:, 0] + 0.5 * anchor_w
        anchor_cy = boxes[:, 1] + 0.5 * anchor_h

        wx, wy, ww, wh = self.weights
        # Strided slicing (rather than plain indexing) keeps the column axis.
        dx = rel_codes[:, 0::4] / wx
        dy = rel_codes[:, 1::4] / wy
        # Cap the log-size deltas so exp() below cannot blow up.
        dw = torch.clamp(rel_codes[:, 2::4] / ww, max=self.bbox_xform_clip)
        dh = torch.clamp(rel_codes[:, 3::4] / wh, max=self.bbox_xform_clip)

        # Inverse of: tx = (x - xa) / wa, ty = (y - ya) / ha,
        #             tw = log(w / wa),   th = log(h / ha).
        ctr_x = dx * anchor_w[:, None] + anchor_cx[:, None]
        ctr_y = dy * anchor_h[:, None] + anchor_cy[:, None]
        out_w = torch.exp(dw) * anchor_w[:, None]
        out_h = torch.exp(dh) * anchor_h[:, None]

        # Centre/size -> corner coordinates.
        half = torch.tensor(0.5, dtype=ctr_x.dtype, device=out_w.device)
        half_w = half * out_w
        half_h = half * out_h
        corners = (ctr_x - half_w, ctr_y - half_h, ctr_x + half_w, ctr_y + half_h)

        # Stack on a new last axis and flatten so each group of four columns
        # is one (x1, y1, x2, y2) box.
        return torch.stack(corners, dim=2).flatten(1)

    def decode(self, rel_codes, boxes):
        """Decode predicted deltas for a whole batch.

        Args:
            rel_codes: predicted regression parameters for all anchors of the
                batch.
            boxes: list with the anchor tensor of every image.

        Returns:
            Tensor of shape ``[total_anchors, -1, 4]``.
        """
        stacked_anchors = torch.cat(boxes, dim=0)
        total = stacked_anchors.shape[0]
        decoded = self.decode_single(rel_codes, stacked_anchors)
        return decoded.reshape(total, -1, 4)
class Matcher(object):
    """Assign to every anchor the index of its best-matching ground-truth box.

    Anchors whose best IoU is below ``low_threshold`` are marked ``-1``,
    those in ``[low_threshold, high_threshold)`` are marked ``-2``, and the
    rest keep the index of the matched ground-truth box.

    Args:
        high_threshold: IoU at or above which a match is kept.
        low_threshold: IoU below which an anchor is marked ``-1``.
        allow_low_quality_matches: when True, the anchor(s) with the highest
            IoU for each ground-truth box keep their match even if that IoU
            is under the thresholds.
    """

    def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
        self.BELOW_LOW_THRESHOLD = -1
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """Match anchors to ground-truth boxes.

        Args:
            match_quality_matrix: IoU tensor ``[num_gt, num_anchors]``.

        Returns:
            int64 tensor ``[num_anchors]`` of ground-truth indices, with
            ``-1`` / ``-2`` for below-threshold / in-between anchors.
        """
        # Best IoU, and the ground-truth row achieving it, for each anchor.
        best_iou, matches_idx = match_quality_matrix.max(dim=0)

        # Remember the raw matches before thresholding overwrites them.
        all_matches = matches_idx.clone() if self.allow_low_quality_matches else None

        too_low = best_iou < self.low_threshold
        in_between = (best_iou >= self.low_threshold) & (best_iou < self.high_threshold)
        matches_idx[too_low] = -1
        matches_idx[in_between] = -2

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(matches_idx, all_matches, match_quality_matrix)
        return matches_idx

    def set_low_quality_matches_(self, matches_idx, all_matches, match_quality_matrix):
        """Restore, in place, the match of each ground-truth box's best anchor(s).

        For every ground-truth row, all anchors whose IoU equals that row's
        maximum get their original match from ``all_matches`` written back
        into ``matches_idx``, regardless of the thresholds.
        """
        best_iou_per_gt = match_quality_matrix.max(dim=1)[0]
        is_best_for_gt = match_quality_matrix == best_iou_per_gt[:, None]
        # nonzero() yields (gt_row, anchor_col) pairs; only the anchor column
        # is needed.
        anchor_cols = torch.nonzero(is_best_for_gt)[:, 1]
        matches_idx[anchor_cols] = all_matches[anchor_cols]
class BalancedPositiveNegativeSampler(object):
    """Randomly pick a bounded set of positive and negative samples per image.

    Args:
        batch_size_per_image: total number of samples to select per image.
        positive_fraction: target share of positives among those samples.
    """

    def __init__(self, batch_size_per_image, positive_fraction):
        self.batch_size_per_image = batch_size_per_image
        self.positive_fraction = positive_fraction

    def __call__(self, matched_idxs):
        """Sample positives and negatives for every image.

        Args:
            matched_idxs: list with one 1-D label tensor per image; entries
                ``>= 1`` are positives, ``== 0`` negatives, anything else is
                ignored.

        Returns:
            ``(pos_idx, neg_idx)``: two lists of uint8 masks (one per image,
            same shape as the labels) with 1 at the selected positions.
        """
        pos_idx = []
        neg_idx = []

        for labels in matched_idxs:
            positive = torch.nonzero(labels >= 1).squeeze(1)
            negative = torch.nonzero(labels == 0).squeeze(1)

            # Positive quota, capped by how many positives exist; negatives
            # fill the remainder, capped the same way.
            pos_quota = int(self.batch_size_per_image * self.positive_fraction)
            num_pos = min(positive.numel(), pos_quota)
            num_neg = min(negative.numel(), self.batch_size_per_image - num_pos)

            # Random subset of each group (positives are drawn first).
            pos_order = torch.randperm(positive.numel(), device=positive.device)
            neg_order = torch.randperm(negative.numel(), device=negative.device)
            chosen_pos = positive[pos_order[:num_pos]]
            chosen_neg = negative[neg_order[:num_neg]]

            # Turn the chosen indices into 0/1 masks over all anchors.
            pos_mask = torch.zeros_like(labels, dtype=torch.uint8)
            neg_mask = torch.zeros_like(labels, dtype=torch.uint8)
            pos_mask[chosen_pos] = 1
            neg_mask[chosen_neg] = 1

            pos_idx.append(pos_mask)
            neg_idx.append(neg_mask)

        return pos_idx, neg_idx
class RegionProposalNetwork(nn.Module):
    """Region Proposal Network: scores anchors, decodes them into proposals,
    filters the proposals, and (in training mode) computes the RPN losses.

    Args:
        anchor_generate: module called as ``anchor_generate(image_list, features)``
            to obtain the per-image anchors.
        rpn_head: module called on the list of feature maps; returns the
            per-level objectness and box-regression outputs.
        fg_iou_thresh: high IoU threshold handed to ``Matcher``.
        bg_iou_thresh: low IoU threshold handed to ``Matcher``.
        batch_size_per_image: number of anchors sampled per image for the loss.
        positive_fraction: share of positives among the sampled anchors.
        pre_nms_top_n: mapping with keys ``'training'`` and ``'testing'``;
            number of top-scoring anchors kept per level before NMS.
        post_nms_top_n: mapping with keys ``'training'`` and ``'testing'``;
            number of proposals kept per image after NMS.
        nms_thresh: IoU threshold used for NMS.
    """
    # batch_size_per_image: total number of positive + negative samples the RPN uses for its loss
    # positive_fraction: share of positives among those samples
    def __init__(self, anchor_generate, rpn_head, fg_iou_thresh, bg_iou_thresh, batch_size_per_image,
                 positive_fraction, pre_nms_top_n, post_nms_top_n, nms_thresh):
        super(RegionProposalNetwork, self).__init__()
        self.anchor_generator = anchor_generate
        self.head = rpn_head
        # Box coder with unit weights for all four regression terms.
        self.box_coder = box_Coder(weights=(1.0, 1.0, 1.0, 1.0))
        # train
        self.box_similarity = box_iou
        # Anchors with IoU >= fg_iou_thresh (e.g. 0.7) become positives; anchors with
        # IoU < bg_iou_thresh (e.g. 0.3) become negatives. Low-quality matches are enabled.
        self.proposal_matcher = Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=True)
        # batch_size_per_image: total number of samples chosen when computing the loss
        # positive_fraction: share of positives among all samples
        self.fg_bg_sampler = BalancedPositiveNegativeSampler(
            batch_size_per_image, positive_fraction # e.g. 256, 0.5
        )
        # test
        self._pre_nms_top_n = pre_nms_top_n
        self._post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        # Minimum side length used when filtering proposals.
        self.min_size = 1e-3
    def pre_nms_top_n(self):
        """Return the pre-NMS top-n for the current mode (training vs. testing)."""
        # Selected by mode; the original notes mention e.g. 2000 for training and
        # 1000 for testing, but the actual values come from the constructor argument.
        if self.training:
            return self._pre_nms_top_n['training']
        return self._pre_nms_top_n['testing']
    def post_nms_top_n(self):
        """Return the post-NMS top-n for the current mode (training vs. testing)."""
        if self.training:
            return self._post_nms_top_n['training']
        return self._post_nms_top_n['testing']
    def _get_top_n_idx(self, objectness, num_anchors_per_level):
        """Return, per image, the indices of the top-scoring anchors of every level.

        Args:
            objectness: tensor ``[batch_size, total_anchors]`` of objectness scores.
            num_anchors_per_level: number of anchors on each feature map.

        Returns:
            int64 tensor ``[batch_size, sum of per-level top-n]`` whose entries
            index into the concatenation of all levels.
        """
        # Collects the top pre_nms_top_n indices of each feature map.
        r = []
        offset = 0
        # split: cut dim 1 into chunks whose lengths are the per-level anchor counts,
        # so each chunk holds the scores of one feature map.
        for ob in objectness.split(num_anchors_per_level, 1):
            # ob shape: [batch_size, anchors on this feature map]
            num_anchors = ob.shape[1]
            # Keep at most pre_nms_top_n anchors per level.
            pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
            # topk sorts in descending order and returns (values, indices);
            # it is applied to each level separately.
            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
            # The offset turns level-local indices into indices over all levels.
            r.append(top_n_idx + offset)
            offset += num_anchors
        # Concatenate the per-level indices; each entry of r is [batch_size, top-n].
        return torch.cat(r, dim=1)
    def assign_targets_to_anchors(self, anchors, targets):
        """Label every anchor of every image and attach its matched ground-truth box.

        Args:
            anchors: list with the anchor tensor of each image.
            targets: list with one dict per image; only its ``'boxes'`` entry
                is read here.

        Returns:
            ``(labels, matched_gt_boxes)``: per-image float32 label tensors
            (1 positive, 0 negative, -1 ignored) and per-image tensors holding
            the ground-truth box assigned to each anchor.
        """
        # Per-image labels and matched ground-truth coordinates.
        labels = []
        matched_gt_boxes = []
        # Iterate over the anchors and target of each image.
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            # The RPN only separates foreground from background, so only the box
            # coordinates are taken from the target dict (class labels are not needed).
            gt_boxes = targets_per_image['boxes']
            # numel(): total number of elements.
            # Image without any ground-truth box: everything is background.
            if gt_boxes.numel() == 0:
                device = anchors_per_image.device
                matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
                labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
            else:
                # IoU between ground-truth boxes and anchors,
                # shape [gt_num, anchor_num].
                match_quality_matrix = box_iou(gt_boxes, anchors_per_image)
                # Assign to each anchor the index of its matched ground-truth box.
                # shape: [anchor_num]; -1 marks negatives, -2 marks discarded anchors,
                # values >= 0 are ground-truth indices (positives).
                matched_idxs = self.proposal_matcher(match_quality_matrix)
                # Ground-truth box per anchor; negative indices are clamped to 0 so the
                # lookup is valid (those rows are not positives).
                matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]
                # Positives get label 1.
                labels_per_image = matched_idxs >= 0
                labels_per_image = labels_per_image.to(dtype=torch.float32)
                # Negatives get label 0.
                bg_indices = matched_idxs == -1
                labels_per_image[bg_indices] = 0.0
                # Discarded anchors get label -1.
                between_indices = matched_idxs == -2
                labels_per_image[between_indices] = -1.0
            labels.append(labels_per_image)
            matched_gt_boxes.append(matched_gt_boxes_per_image)
        return labels, matched_gt_boxes
    def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
        """Select the final proposals for every image.

        Steps: keep the top-scoring anchors per level, clip boxes to the image,
        drop boxes smaller than ``self.min_size``, run per-level NMS, and keep
        at most ``post_nms_top_n`` boxes.

        Args:
            proposals: tensor ``[batch, anchor_num, 4]`` of decoded boxes.
            objectness: objectness scores, viewable as ``[batch, anchor_num]``.
            image_shapes: per-image size passed to ``clip_boxes_to_image``.
            num_anchors_per_level: number of anchors on each feature map.

        Returns:
            ``(final_boxes, final_scores)``: per-image lists of kept boxes and
            their scores.
        """
        # proposal shape: [Batch, anchor_num, 4]
        # batch size
        num_images = proposals.shape[0]
        device = proposals.device
        # detach: the scores are only used for selection here, so no gradient is tracked.
        objectness = objectness.detach()
        # shape: [batch_size, anchor_num]
        objectness = objectness.view(num_images, -1)
        # levels records which feature map every anchor belongs to.
        # idx: index of the feature map, n: number of anchors on it;
        # torch.full builds a length-n tensor filled with idx.
        levels = [torch.full((n,), idx, dtype=torch.int64, device=device) for idx, n in enumerate(num_anchors_per_level)]
        # [anchor_num]
        levels = torch.cat(levels, dim=0)
        # reshape -> [1, anchor_num], expand_as -> [batch, anchor_num]
        levels = levels.reshape(1, -1).expand_as(objectness)
        # Indices of the top pre_nms_top_n anchors of each feature map.
        # top_n_idx shape: [batch_size, proposal_num]
        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
        # One index per image in the batch.
        image_range = torch.arange(num_images, device=device)
        # shape: [batch_size, 1]
        batch_idx = image_range[:, None]
        # Gather scores, level ids and boxes of the selected anchors for every image.
        # objectness shape: [batch_size, proposal_num]
        objectness = objectness[batch_idx, top_n_idx]
        # Feature-map level of each selected anchor.
        levels = levels[batch_idx, top_n_idx]
        # Box coordinates of each selected anchor.
        proposals = proposals[batch_idx, top_n_idx]
        final_boxes = []
        final_scores = []
        # Process the predictions of each image separately.
        for boxes, scores, level, img_shape in zip(proposals, objectness, levels, image_shapes):
            # Clamp out-of-range coordinates to the image boundary.
            boxes = clip_boxes_to_image(boxes, img_shape)
            # Indices of boxes whose width and height are both >= self.min_size.
            keep = remove_small_boxes(boxes, self.min_size)
            boxes, scores, level = boxes[keep], scores[keep], level[keep]
            # NMS applied independently per feature-map level; the returned indices
            # are ordered by decreasing score.
            keep = batched_nms(boxes, scores, level, self.nms_thresh)
            # Keep only the first post_nms_top_n proposals.
            keep = keep[: self.post_nms_top_n()]
            boxes, scores = boxes[keep], scores[keep]
            final_boxes.append(boxes)
            final_scores.append(scores)
        return final_boxes, final_scores
    def compute_loss(self, objectness, pred_bbox_deltas, labels, regression_targets):
        """Compute the RPN objectness and box-regression losses.

        Args:
            objectness: predicted objectness logits for all anchors of the batch.
            pred_bbox_deltas: predicted box deltas for all anchors of the batch.
            labels: per-image label tensors (1 positive, 0 negative, -1 ignored).
            regression_targets: per-image regression targets.

        Returns:
            ``(objectness_loss, box_loss)``.
        """
        # Choose the positives/negatives that take part in the loss according to
        # batch_size_per_image and positive_fraction; not every labelled anchor is used.
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        # Concatenate the per-image masks over the batch and take the non-zero positions.
        sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
        sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
        # All sampled indices (positives followed by negatives).
        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
        objectness = objectness.flatten()
        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)
        # Box-regression loss: computed on positives only (summed), then normalised
        # by the total number of sampled anchors.
        box_loss = smooth_l1_loss(pred_bbox_deltas[sampled_pos_inds], regression_targets[sampled_pos_inds], beta=1 / 9, size_average=False) / (sampled_inds.numel())
        # Objectness loss over all sampled anchors.
        objectness_loss = F.binary_cross_entropy_with_logits(
            objectness[sampled_inds], labels[sampled_inds]
        )
        return objectness_loss, box_loss
    def forward(self, image_list, features, targets=None):
        """Run the RPN.

        Args:
            image_list: batched images; passed to the anchor generator, and its
                ``image_sizes`` attribute is used when clipping proposals.
            features: mapping of feature maps; only its values are used.
            targets: per-image target dicts (with a ``'boxes'`` entry); read
                only in training mode.

        Returns:
            ``(boxes, losses)``: the per-image proposals, and a dict that holds
            ``'loss_objectness'`` and ``'loss_rpn_box_reg'`` in training mode
            and is empty otherwise.
        """
        # Only the feature tensors are needed, not their keys.
        features = list(features.values())
        # Feed the feature maps to the head to get, per level, the objectness
        # predictions and the box-regression parameters (both are lists).
        # Per the original notes: objectness is [B, anchors per location, H, W] and
        # pred_bbox_deltas is [B, anchors per location * 4, H, W].
        objectness, pred_bbox_deltas = self.head(features)
        # Anchors generated for every image of the batch.
        anchors = self.anchor_generator(image_list, features)
        # batch_size
        num_images = len(anchors)
        # Shape of one image's objectness map on each level: [C, H, W].
        num_anchors_per_level_shape_tensor = [o[0].shape for o in objectness]
        # Number of anchors on each feature map.
        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensor]
        # Reorder and flatten the per-level predictions:
        # objectness -> [Batch*anchor_num*H*W, 1]
        # pred_bbox_deltas -> [Batch*anchor_num*H*W, 4]
        objectness, pred_bbox_deltas = concat_box_pred_layers(objectness, pred_bbox_deltas)
        # Apply the predicted deltas to the anchors to obtain proposals.
        # detach(): no gradient flows through the proposals.
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
        # shape: [Batch*anchor_num, 4] -> [Batch, anchor_num, 4]
        proposals = proposals.view(num_images, -1, 4)
        # Filter proposals: drop tiny boxes, run NMS, keep the top post_nms_top_n by score.
        # proposal shape: [Batch, anchor_num, 4]
        boxes, scores = self.filter_proposals(proposals, objectness, image_list.image_sizes, num_anchors_per_level)
        # Losses are only computed in training mode.
        losses = {}
        if self.training:
            # Match every anchor to a ground-truth box and label it as
            # foreground / background / ignored (1 / 0 / -1).
            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
            # Regression targets of the matched ground-truth boxes relative to the anchors.
            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
            # objectness: predicted scores, pred_bbox_deltas: predicted box deltas,
            # labels: 1 / 0 / -1 per anchor, regression_targets: encoded ground truth.
            loss_objectness, loss_rpn_box_reg = self.compute_loss(objectness, pred_bbox_deltas, labels, regression_targets)
            losses = {
                'loss_objectness': loss_objectness,
                'loss_rpn_box_reg': loss_rpn_box_reg
            }
        return boxes, losses