gluoncv之yolov3训练源码解析
文章目录
前言
本文为gluoncv官方源码解析
一、训练集预处理
训练集预处理在YOLO3DefaultTrainTransform类中实现,关于图像的预处理操作在__call__函数中,
方便直接调用。__init__初始化函数中的YOLOV3PrefetchTargetGenerator类主要是为了标注训练集中的标签,生成符合yolo格式的标签(objectness,bounding_box,cls)。
class YOLO3DefaultTrainTransform(object):
    """Default training-time transform for YOLOv3 image/label pairs.

    Calling an instance applies, in order: random color distortion, random
    expansion (with probability 0.5), constrained random cropping, resizing
    to ``(width, height)`` with a randomly chosen interpolation, random
    horizontal flip, tensor conversion and mean/std normalization.

    When ``net`` is given, a target generator is also built so that the
    training targets are produced inside the transform.

    Parameters
    ----------
    width : int
        Output image width.
    height : int
        Output image height.
    net : optional
        Network used to pre-compute anchors, offsets and feature maps.
        When ``None`` no training targets are generated.
    mean : sequence of float
        Per-channel mean used for normalization and, scaled by 255, as the
        fill value of the random expansion.
    std : sequence of float
        Per-channel standard deviation used for normalization.
    mixup : bool
        When true, the last label column is passed to the target generator
        as the mixup ratio.
    **kwargs
        Forwarded to ``YOLOV3PrefetchTargetGenerator``.
    """

    def __init__(self, width, height, net=None, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), mixup=False, **kwargs):
        self._width = width
        self._height = height
        self._mean = mean
        self._std = std
        self._mixup = mixup
        self._target_generator = None
        if net is None:
            return

        # in case network has reset_ctx to gpu
        self._fake_x = mx.nd.zeros((1, 3, height, width))
        net = copy.deepcopy(net)
        net.collect_params().reset_ctx(None)
        with autograd.train_mode():
            # Only the 2nd-4th of the eight outputs are kept.
            _, self._anchors, self._offsets, self._feat_maps, _, _, _, _ = net(self._fake_x)
        from ....model_zoo.yolo.yolo_target import YOLOV3PrefetchTargetGenerator
        self._target_generator = YOLOV3PrefetchTargetGenerator(
            num_class=len(net.classes), **kwargs)

    def __call__(self, src, label):
        """Apply transform to training image/label.

        Returns ``(img, bbox)`` when no target generator was built, otherwise
        ``(img, objectness, center_targets, scale_targets, weights,
        class_targets, gt_bboxes)`` with the leading batch axis removed from
        every generated target.
        """
        # random color jittering
        img = experimental.image.random_color_distort(src)

        # random expansion with prob 0.5
        if np.random.uniform(0, 1) > 0.5:
            img, expand = timage.random_expand(img, fill=[m * 255 for m in self._mean])
            bbox = tbbox.translate(label, x_offset=expand[0], y_offset=expand[1])
        else:
            img, bbox = img, label

        # random cropping
        h, w, _ = img.shape
        bbox, crop = experimental.bbox.random_crop_with_constraints(bbox, (w, h))
        x0, y0, w, h = crop
        # crop the image with the region selected for the boxes
        img = mx.image.fixed_crop(img, x0, y0, w, h)

        # resize with random interpolation
        h, w, _ = img.shape
        interp = np.random.randint(0, 5)  # interpolation method, one of 0..4
        img = timage.imresize(img, self._width, self._height, interp=interp)
        bbox = tbbox.resize(bbox, (w, h), (self._width, self._height))

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=0.5)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])

        # to tensor: converts to an NDArray with values in the 0-1 range
        img = mx.nd.image.to_tensor(img)
        # standardize with the configured mean/std
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)

        if self._target_generator is None:
            return img, bbox.astype(img.dtype)

        # generate training target so cpu workers can help reduce the workload on gpu
        # Label columns 0-3 are used as box coordinates and column 4 as the class id.
        gt_bboxes = mx.nd.array(bbox[np.newaxis, :, :4])
        gt_ids = mx.nd.array(bbox[np.newaxis, :, 4:5])
        if self._mixup:
            # the last label column carries the mixup ratio
            gt_mixratio = mx.nd.array(bbox[np.newaxis, :, -1:])
        else:
            gt_mixratio = None
        objectness, center_targets, scale_targets, weights, class_targets = self._target_generator(
            self._fake_x, self._feat_maps, self._anchors, self._offsets,
            gt_bboxes, gt_ids, gt_mixratio)
        return (img, objectness[0], center_targets[0], scale_targets[0], weights[0],
                class_targets[0], gt_bboxes[0])
1.random_color_distort函数
其实就是对图像的亮度、对比度、饱和度、色调(hue)进行随机调整。
2.random_expand函数
先贴一张图片
就是简单的生成一张类似于画布的背景,然后将原始图片随机的放在上面。
3.random_crop_with_constraints函数
生成裁剪之后的边界框,然后根据生成的边界框来裁剪图像。这里的裁剪方法是参考了Single Shot Multibox Detector,也就是SSD。
def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1,
                                 max_aspect_ratio=2, constraints=None,
                                 max_trial=50):
    """Pick a random crop region subject to IoU constraints on the boxes.

    For every ``(min_iou, max_iou)`` constraint up to ``max_trial`` random
    crops are sampled; the first one whose IoU with all boxes lies inside the
    constraint becomes a candidate. One candidate is then drawn at random and
    the boxes are cropped to it. The full image is always a candidate.

    Parameters
    ----------
    bbox : array-like
        Bounding boxes; an empty input returns the first sampled crop as is.
    size : tuple
        Image size as ``(width, height)``.
    min_scale, max_scale : float
        Range the crop scale is drawn from.
    max_aspect_ratio : float
        Upper bound on the crop aspect ratio (its reciprocal is the lower bound).
    constraints : iterable of (min_iou, max_iou), optional
        ``None`` entries mean "unbounded". Defaults to the SSD settings.
    max_trial : int
        Maximum number of samples per constraint.

    Returns
    -------
    tuple
        ``(new_bbox, (x, y, w, h))`` where ``(x, y)`` is the top-left corner
        of the crop.
    """
    # default params in paper
    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    w, h = size

    # The uncropped image is always available as a fallback candidate.
    candidates = [(0, 0, w, h)]
    for min_iou, max_iou in constraints:
        min_iou = -np.inf if min_iou is None else min_iou
        max_iou = np.inf if max_iou is None else max_iou

        # Try at most max_trial samples; stop at the first that satisfies the constraint.
        for _ in range(max_trial):
            scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            # Crop size follows the SSD sampling formula; clamp to the image
            # so rounding can never produce a crop larger than the image.
            crop_h = min(int(h * scale / np.sqrt(aspect_ratio)), h)  # crop height
            crop_w = min(int(w * scale * np.sqrt(aspect_ratio)), w)  # crop width

            # random.randrange(0) raises ValueError, so a crop that spans the
            # whole side is pinned to offset 0 instead of being sampled.
            crop_t = random.randrange(h - crop_h) if crop_h < h else 0
            crop_l = random.randrange(w - crop_w) if crop_w < w else 0
            crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h))
            # stored as (x, y, w, h) with (x, y) the top-left corner
            crop = (crop_l, crop_t, crop_w, crop_h)

            if len(bbox) == 0:
                # nothing to constrain against: accept the first sample
                return bbox, crop

            # IoU between every box and the sampled crop
            iou = tbbox.bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou <= iou.min() and iou.max() <= max_iou:
                candidates.append(crop)
                break

    # random select one
    while candidates:
        crop = candidates.pop(np.random.randint(0, len(candidates)))
        # keep only boxes whose center stays inside the crop
        new_bbox = tbbox.crop(bbox, crop, allow_outside_center=False)
        if new_bbox.size < 1:
            continue
        return new_bbox, crop
    return bbox, (0, 0, w, h)
4.imresize函数
简单调用opencv的resize函数,interp为插值选项
5.random_flip函数
随机翻转,可以选择水平,垂直翻转,主要调用mxnet.nd.flip函数。
6.to_tensor,normalize函数
to_tensor 主要是转换为ndarray格式,数值分布在0-1之间的矩阵
normalize 再进行标准化处理
二、验证集预处理
验证集预处理实现在YOLO3DefaultValTransform类中,
只是简单地resize和标准化,原理同上。
class YOLO3DefaultValTransform(object):
    """Default validation-time transform for YOLOv3 image/label pairs.

    Only resizes the image (and its boxes) to ``(width, height)`` and then
    converts it to a normalized tensor; no random augmentation is applied.

    Parameters
    ----------
    width : int
        Output image width.
    height : int
        Output image height.
    mean : sequence of float
        Per-channel mean used for normalization.
    std : sequence of float
        Per-channel standard deviation used for normalization.
    """

    def __init__(self, width, height, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._width = width
        self._height = height
        self._mean = mean
        self._std = std

    def __call__(self, src, label):
        """Apply transform to validation image/label.

        Returns ``(img, bbox)`` with ``bbox`` cast to the dtype of ``img``.
        """
        # resize
        h, w, _ = src.shape
        # the interpolation is resolved from the source and target sizes
        interp = _get_interp(9, (h, w, self._height, self._width))
        img = image.imresize(src, self._width, self._height, interp=interp)
        bbox = tbbox.resize(label, in_size=(w, h), out_size=(self._width, self._height))

        # to tensor, then standardize with the configured mean/std
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)
        return img, bbox.astype(img.dtype)