该API主要就是将 img 的下侧和右侧填充0,将同一个 batch 的照片填充为相同大小的图片
PadBatch
代码:
@register_op
class PadBatch(BaseOperator):
    """
    Zero-pad every image of a batch to one common spatial size.

    The target height and width are the per-axis maxima over the batch,
    optionally rounded up to a multiple of `pad_to_stride`. Each image is
    copied into the top-left corner of a zero array, so the padding ends up
    on the bottom and right sides. Images are indexed as
    (channel, height, width), i.e. 'CHW' layout.

    Args:
        pad_to_stride (int): If `pad_to_stride > 0`, the padded height and
            width are rounded up to the next multiple of this value.
            Otherwise the batch maxima are used unchanged.
    """

    def __init__(self, pad_to_stride=0):
        super(PadBatch, self).__init__()
        self.pad_to_stride = pad_to_stride

    def __call__(self, samples, context=None):
        """
        Pad the samples in place and return the same `samples` object.

        Args:
            samples (list): a batch of samples, each one a dict holding at
                least an 'image' array. For multi-scale input `samples` is
                a nested list and only `samples[0]` is padded.
            context: unused by this operator.

        Returns:
            The `samples` argument. The 'image' entry of each processed
            sample (and 'semantic' / 'gt_segm' when present and not None)
            is replaced by its zero-padded version.
        """
        coarsest_stride = self.pad_to_stride

        # multi scale input is a nested list: work on its first element
        is_nested = (isinstance(samples, typing.Sequence) and
                     len(samples) > 0 and
                     isinstance(samples[0], typing.Sequence))
        inner_samples = samples[0] if is_nested else samples

        # element-wise maximum of all (C, H, W) shapes in the batch
        all_shapes = [data['image'].shape for data in inner_samples]
        max_shape = np.array(all_shapes).max(axis=0)

        if coarsest_stride > 0:
            # round height (axis 1) and width (axis 2) up to a stride multiple
            for axis in (1, 2):
                max_shape[axis] = int(
                    np.ceil(max_shape[axis] / coarsest_stride) *
                    coarsest_stride)
        pad_h, pad_w = max_shape[1], max_shape[2]

        for data in inner_samples:
            im = data['image']
            im_c, im_h, im_w = im.shape
            padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
            # original pixels stay in the top-left corner
            padding_im[:, :im_h, :im_w] = im
            data['image'] = padding_im

            semantic = data.get('semantic')
            if semantic is not None:
                # semantic map is padded as a single-channel float32 array
                padding_sem = np.zeros((1, pad_h, pad_w), dtype=np.float32)
                padding_sem[:, :im_h, :im_w] = semantic
                data['semantic'] = padding_sem

            gt_segm = data.get('gt_segm')
            if gt_segm is not None:
                # one uint8 plane per entry along gt_segm's first axis
                padding_segm = np.zeros(
                    (gt_segm.shape[0], pad_h, pad_w), dtype=np.uint8)
                padding_segm[:, :im_h, :im_w] = gt_segm
                data['gt_segm'] = padding_segm

        return samples
传入的 sample 数据:
>>> samples[0].keys()
dict_keys(['im_id', 'h', 'w', 'is_crowd', 'gt_class', 'gt_bbox', 'curr_iter', 'image', 'im_shape', 'scale_factor', 'flipped'])
>>> samples[0]
{'curr_iter': 0,
'flipped': True,
'gt_bbox': array([[139.37924, 31.7056 , 437.1072 , 720.53754]], dtype=float32),
'gt_class': array([[14]], dtype=int32),
'h': 640.0,
'w': 475.0,
'im_id': array([270705]),
'im_shape': array([819.2, 608. ], dtype=float32),
'image': array([[[-1.5699117 , -1.5527871 , -1.5870365 , ..., 2.2489083 ,
......
2.1171246 , 2.1345537 ]]], dtype=float32),
'is_crowd': array([[0]], dtype=int32),
'scale_factor': array([1.28, 1.28], dtype=float32),
}
该 transform 的作用是:取同一个 batch 中所有图片的最大高和最大宽;当 pad_to_stride > 0 时,再把这两个值向上取整到能被 pad_to_stride 整除的大小;最后在每张图片的下侧和右侧填充 0,使同一个 batch 内的图片尺寸一致。
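该 transform 的作用和 PadMaskBatch 的作用类似:
https://blog.csdn.net/HaoZiHuang/article/details/128421348
区别在于 PadMaskBatch 还会额外返回 mask,而 PadBatch 不返回 mask:它只把 image(以及存在时的 semantic、gt_segm)填充到统一尺寸后返回 samples,填充前的宽高信息仍由样本中已有的字段(如 h、w、im_shape)携带,PadBatch 不会修改这些字段。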
# 找到图片最大的尺度
max_shape = np.array(
[data['image'].shape for data in inner_samples]).max(axis=0)
# 把 max_shape 换成可以整除的
if coarsest_stride > 0:
max_shape[1] = int(
np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) # np.ceil 是向上取整的意思
max_shape[2] = int(
np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
for data in inner_samples: # 迭代每一张图片的信息
# --------- 将图片放到全部填充为0的 padding_im ---------
im = data['image']
im_c, im_h, im_w = im.shape[:]
padding_im = np.zeros(
(im_c, max_shape[1], max_shape[2]), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
data['image'] = padding_im
# 以下是对分割信息的处理
if 'semantic' in data and data['semantic'] is not None:
semantic = data['semantic']
padding_sem = np.zeros(
(1, max_shape[1], max_shape[2]), dtype=np.float32)
padding_sem[:, :im_h, :im_w] = semantic
data['semantic'] = padding_sem
if 'gt_segm' in data and data['gt_segm'] is not None:
gt_segm = data['gt_segm']
padding_segm = np.zeros(
(gt_segm.shape[0], max_shape[1], max_shape[2]),
dtype=np.uint8)
padding_segm[:, :im_h, :im_w] = gt_segm
data['gt_segm'] = padding_segm
return samples