这一部分对应框架中的两个部分:一个是 GeneralizedRCNNTransform(Normalize、Resize),另一个是 GeneralizedRCNNTransform 的 Postprocess 部分(将预测的 bbox 映射回原始图像尺度)。
对代码的理解注释如下。
import math
from typing import List, Tuple, Dict, Optional
import torch
from torch import nn, Tensor
import torchvision
from .image_list import ImageList
# Image preprocessing: normalization and resizing.
@torch.jit.unused
def _resize_image_onnx(image, self_min_size, self_max_size):
# type: (Tensor, float, float) -> Tensor
from torch.onnx import operators
im_shape = operators.shape_as_tensor(image)[-2:]
min_size = torch.min(im_shape).to(dtype=torch.float32)
max_size = torch.max(im_shape).to(dtype=torch.float32)
scale_factor = torch.min(self_min_size / min_size, self_max_size / max_size)
image = torch.nn.functional.interpolate(
image[None], scale_factor=scale_factor, mode="bilinear", recompute_scale_factor=True,
align_corners=False)[0]
return image
def _resize_image(image, self_min_size, self_max_size):
# type: (Tensor, float, float) -> Tensor
im_shape = torch.tensor(image.shape[-2:])
min_size = float(torch.min(im_shape)) # 获取高宽中的最小值
max_size = float(torch.max(im_shape)) # 获取高宽中的最大值
scale_factor = self_min_size / min_size # 根据指定最小边长和图片最小边长计算缩放比例
# 如果使用该缩放比例计算的图片最大边长大于指定的最大边长
if max_size * scale_factor > self_max_size:
scale_factor = self_max_size / max_size # 将缩放比例设为指定最大边长和图片最大边长之比
# 如果重新计算缩放比例,scale_factor 设为 self_max_size / max_size,可能会导致缩放后的图像最小边长小于指定的 self_min_size.
# 如果将 scale_factor 再次设为 self_min_size / min_size,可能会导致缩放后的图像最大边长又超过指定的 self_max_size,从而陷入死循环。
# 为了避免这种情况的发生,可以在判断缩放比例时,同时考虑最小边长和最大边长,具体实现是比较缩放后的图像的最小边长和最大边长是否都满足指定的 self_min_size 和 self_max_size,
# 如果都满足,则返回缩放后的图像张量;如果不满足,则选择更靠近指定范围的边长进行缩放,比如取最小边长和最大边长的平均值,或者对最小边长进行放大,对最大边长进行缩小,然后重新计算缩放比例,并进行插值操作。这样就可以避免死循环的问题。
# 在这里作者也考虑到这个死循环过程,他就判断如果出现上面的这种情况,就不考虑输入图片的边长的下限,只考虑最大边长的情况
# interpolate利用插值的方法缩放图片
# image[None]操作是在最前面添加batch维度[C, H, W] -> [1, C, H, W],,,用完之后通过切片的方式又给他转回来了[0]
# bilinear只支持4D Tensor(bilinear:双线性插值)
image = torch.nn.functional.interpolate(
image[None], scale_factor=scale_factor, mode="bilinear", recompute_scale_factor=True,
align_corners=False)[0]
return image
class GeneralizedRCNNTransform(nn.Module):
    """
    Performs input / target transformation before feeding the data to a GeneralizedRCNN
    model.

    The transformations it performs are:
        - input normalization (mean subtraction and std division)
        - input / target resizing to match min_size / max_size

    It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets.
    """

    def __init__(self, min_size, max_size, image_mean, image_std):
        super(GeneralizedRCNNTransform, self).__init__()
        # A scalar min_size is wrapped so that self.min_size is always a sequence.
        if not isinstance(min_size, (list, tuple)):
            min_size = (min_size,)
        self.min_size = min_size      # candidate lengths for the shorter image side
        self.max_size = max_size      # upper bound for the longer image side
        self.image_mean = image_mean  # per-channel mean used by normalize()
        self.image_std = image_std    # per-channel standard deviation used by normalize()

    def normalize(self, image):
        """Subtract the per-channel mean and divide by the per-channel std.

        Args:
            image: Floating-point tensor indexed as [channel, height, width].

        Returns:
            The normalized image, same shape as the input.

        Raises:
            TypeError: If ``image`` is not a floating-point tensor. Casting
                fractional mean/std values to an integer dtype would truncate
                them and silently produce meaningless results.
        """
        if not image.is_floating_point():
            raise TypeError(
                "Expected input images to be of floating type, "
                "but found type {} instead".format(image.dtype))
        dtype, device = image.dtype, image.device
        mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
        std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
        # [:, None, None] turns shape [C] into [C, 1, 1] so that it broadcasts
        # against an image of shape [C, H, W].
        return (image - mean[:, None, None]) / std[:, None, None]

    def torch_choice(self, k):
        # type: (List[int]) -> int
        """
        Implements `random.choice` via torch ops so it can be compiled with
        TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
        is fixed.
        """
        index = int(torch.empty(1).uniform_(0., float(len(k))).item())
        return k[index]

    def resize(self, image, target):
        # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
        """Resize an image into the configured size range and rescale its boxes.

        Args:
            image: Input image; its last two dimensions are height and width.
            target: Optional annotation dict. When given, its "boxes" entry is
                replaced in place by the rescaled boxes.

        Returns:
            The resized image and the (possibly updated) target.
        """
        # Remember the size before resizing; it is needed to rescale the boxes.
        h, w = image.shape[-2:]

        if self.training:
            # During training, pick one of the configured min sizes at random.
            size = float(self.torch_choice(self.min_size))
        else:
            # FIXME assume for now that testing uses the largest scale
            size = float(self.min_size[-1])

        if torchvision._is_tracing():
            image = _resize_image_onnx(image, size, float(self.max_size))
        else:
            image = _resize_image(image, size, float(self.max_size))

        # No annotations were supplied, so there is nothing to rescale.
        if target is None:
            return image, target

        bbox = target["boxes"]
        # Scale the boxes by the same ratio the image was scaled by.
        bbox = resize_boxes(bbox, [h, w], image.shape[-2:])
        target["boxes"] = bbox

        return image, target

    # _onnx_batch_images() is an implementation of
    # batch_images() that is supported by ONNX tracing.
    @torch.jit.unused
    def _onnx_batch_images(self, images, size_divisible=32):
        # type: (List[Tensor], int) -> Tensor
        """Pad all images to a common size and stack them, using tensor-valued sizes.

        Dimensions 1 and 2 of the common size are rounded up to a multiple of
        ``size_divisible``. Each image is padded at the end of every dimension.
        """
        max_size = []
        for i in range(images[0].dim()):
            max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64)
            max_size.append(max_size_i)
        stride = size_divisible
        max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64)
        max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64)
        max_size = tuple(max_size)

        # work around for
        # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
        # which is not yet supported in onnx
        padded_imgs = []
        for img in images:
            padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
            padded_img = torch.nn.functional.pad(img, [0, padding[2], 0, padding[1], 0, padding[0]])
            padded_imgs.append(padded_img)

        return torch.stack(padded_imgs)

    def max_by_axis(self, the_list):
        # type: (List[List[int]]) -> List[int]
        """Return the element-wise maximum over a list of integer lists.

        The result is a new list; ``the_list`` and its sub-lists are left
        unmodified.
        """
        # Copy the first entry so the caller's data is not overwritten below.
        maxes = list(the_list[0])
        for sublist in the_list[1:]:
            for index, item in enumerate(sublist):
                maxes[index] = max(maxes[index], item)
        return maxes

    def batch_images(self, images, size_divisible=32):
        # type: (List[Tensor], int) -> Tensor
        """Pack a list of images into one zero-padded batch tensor.

        Args:
            images: Images to batch; each is indexed as [channel, height, width].
            size_divisible: Height and width of the batch are rounded up to a
                multiple of this value.

        Returns:
            Tensor of shape [len(images), channel, height, width] holding every
            image in its top-left corner, with zeros elsewhere.
        """
        if torchvision._is_tracing():
            # batch_images() does not export well to ONNX
            # call _onnx_batch_images() instead
            return self._onnx_batch_images(images, size_divisible)

        # Largest channel count, height and width over the whole batch.
        max_size = self.max_by_axis([list(img.shape) for img in images])

        stride = float(size_divisible)
        # Round the height up to a multiple of stride.
        max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
        # Round the width up to a multiple of stride.
        max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

        # [batch, channel, height, width]
        batch_shape = [len(images)] + max_size

        # Zero-filled tensor with the dtype and device of the first image.
        batched_imgs = images[0].new_full(batch_shape, 0)
        for img, pad_img in zip(images, batched_imgs):
            # Copy each image into the top-left corner of its slot, so pixel
            # coordinates in the padded image match those of the resized image.
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)

        return batched_imgs

    def postprocess(self,
                    result,                # type: List[Dict[str, Tensor]]
                    image_shapes,          # type: List[Tuple[int, int]]
                    original_image_sizes   # type: List[Tuple[int, int]]
                    ):
        # type: (...) -> List[Dict[str, Tensor]]
        """Map predicted boxes back onto the original image scale.

        Args:
            result: Per-image predictions; each dict has a "boxes" entry.
            image_shapes: Per-image sizes that the boxes currently refer to.
            original_image_sizes: Per-image sizes to map the boxes back to.

        Returns:
            ``result`` itself. In training mode it is returned untouched;
            otherwise each "boxes" entry is replaced by the rescaled boxes.
        """
        if self.training:
            return result

        # Walk the predictions together with their two sizes and rescale the
        # boxes of each image.
        for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
            boxes = pred["boxes"]
            boxes = resize_boxes(boxes, im_s, o_im_s)
            result[i]["boxes"] = boxes
        return result

    def __repr__(self):
        """Return a multi-line description of the normalize and resize settings."""
        format_string = self.__class__.__name__ + '('
        _indent = '\n '
        format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std)
        format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size,
                                                                                         self.max_size)
        format_string += '\n)'
        return format_string

    def forward(self,
                images,       # type: List[Tensor]
                targets=None  # type: Optional[List[Dict[str, Tensor]]]
                ):
        # type: (...) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]
        """Normalize, resize and batch the images; rescale the target boxes.

        Args:
            images: List of 3D image tensors.
            targets: Optional list with one annotation dict per image. The
                caller's list and dicts are not modified.

        Returns:
            An ImageList built from the batched images and their sizes after
            resizing, and the transformed targets (None if none were given).

        Raises:
            ValueError: If an image is not a 3D tensor.
        """
        images = [img for img in images]
        if targets is not None:
            # Work on a shallow copy of the list and of every dict, so that
            # rescaling the boxes below does not alter the caller's targets.
            targets_copy = torch.jit.annotate(List[Dict[str, Tensor]], [])
            for t in targets:
                data = torch.jit.annotate(Dict[str, Tensor], {})
                for k, v in t.items():
                    data[k] = v
                targets_copy.append(data)
            targets = targets_copy

        for i in range(len(images)):
            image = images[i]
            target_index = targets[i] if targets is not None else None

            if image.dim() != 3:
                raise ValueError("images is expected to be a list of 3d tensors "
                                 "of shape [C, H, W], got {}".format(image.shape))
            image = self.normalize(image)
            image, target_index = self.resize(image, target_index)
            images[i] = image
            if targets is not None and target_index is not None:
                targets[i] = target_index

        # Record the sizes after resizing, before padding changes them.
        image_sizes = [img.shape[-2:] for img in images]
        images = self.batch_images(images)
        image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], [])

        for image_size in image_sizes:
            assert len(image_size) == 2
            image_sizes_list.append((image_size[0], image_size[1]))

        image_list = ImageList(images, image_sizes_list)
        return image_list, targets
def resize_boxes(boxes, original_size, new_size):
    # type: (Tensor, List[int], List[int]) -> Tensor
    """Rescale boxes to follow a change in image size.

    Arguments:
        boxes: Tensor with four entries along dimension 1, read as
            (xmin, ymin, xmax, ymax).
        original_size: Image size (height, width) the boxes currently refer to.
        new_size: Image size (height, width) the boxes should refer to.

    Returns:
        Tensor shaped like ``boxes``, with x coordinates multiplied by the
        width ratio and y coordinates multiplied by the height ratio.
    """
    # One new/old ratio per spatial dimension, in (height, width) order.
    ratios = []
    for new_len, old_len in zip(new_size, original_size):
        numerator = torch.tensor(new_len, dtype=torch.float32, device=boxes.device)
        denominator = torch.tensor(old_len, dtype=torch.float32, device=boxes.device)
        ratios.append(numerator / denominator)
    ratios_height, ratios_width = ratios

    # unbind(1) splits dimension 1 into four 1D coordinate tensors.
    x_lo, y_lo, x_hi, y_hi = boxes.unbind(1)
    scaled_coords = (
        x_lo * ratios_width,
        y_lo * ratios_height,
        x_hi * ratios_width,
        y_hi * ratios_height,
    )
    return torch.stack(scaled_coords, dim=1)