SSD系列代码解读:(一) Prior Box
SSD系列代码解读:(二) Data Augmentation
SSD系列代码解读:(三) MultiboxLoss
本部分代码是PyTorch版本的实现,而非官方的Caffe实现;在解读代码的同时会与Caffe实现进行比较。先贴代码:
import torch
from torchvision import transforms
import cv2
import numpy as np
import random
import math
from utils.box_utils import matrix_iou
def _crop(image, boxes, labels):
    """Randomly crop ``image`` and keep the ground-truth boxes centred in the crop.

    Args:
        image: array indexed as ``image[y, x, channel]`` (height is axis 0).
        boxes: ``(n, 4)`` array of ``(x1, y1, x2, y2)`` boxes in pixel units.
        labels: length-``n`` array of labels, one per box.

    Returns:
        ``(image, boxes, labels)``. Either the untouched inputs (no boxes, or
        the "no crop" mode was drawn), or a view of the cropped region together
        with copies of the surviving boxes, clipped to the crop and expressed
        in crop coordinates, and their labels.
    """
    height, width, _ = image.shape

    if len(boxes) == 0:
        return image, boxes, labels

    while True:
        # Seven equally likely modes: "no crop", five minimum-IoU thresholds,
        # and one unconstrained crop.
        # NOTE(review): the original author remarks that the Caffe reference
        # has an additional min_iou of 1.0 -- not verifiable from this file.
        mode = random.choice((
            None,
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, None),
        ))

        if mode is None:
            # Drawn with probability 1/7: return the sample unchanged.
            return image, boxes, labels

        min_iou, max_iou = mode
        if min_iou is None:
            min_iou = float('-inf')
        if max_iou is None:
            max_iou = float('inf')

        # Up to 50 attempts per mode; if none qualifies a new mode is drawn.
        for _ in range(50):
            # The crop covers scale^2 of the image area; ``ratio`` then skews
            # the aspect ratio, bounded so that neither side exceeds the image.
            scale = random.uniform(0.3, 1.)
            min_ratio = max(0.5, scale * scale)
            max_ratio = min(2, 1. / scale / scale)
            ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
            # Clamp against floating-point overshoot of scale * ratio.
            w = min(int(scale * ratio * width), width)
            h = min(int((scale / ratio) * height), height)

            # Top-left corner of the crop. The range is inclusive of
            # ``width - w`` / ``height - h`` so that a full-size crop yields
            # offset 0 instead of raising ValueError on an empty range, and so
            # that crops may touch the right/bottom image border.
            l = random.randrange(width - w + 1)
            t = random.randrange(height - h + 1)
            roi = np.array((l, t, l + w, t + h))

            # presumably matrix_iou returns the pairwise IoU between every box
            # and the single roi -- helper is defined in utils.box_utils.
            iou = matrix_iou(boxes, roi[np.newaxis])

            # Retry unless *every* box has an IoU with the crop inside
            # [min_iou, max_iou] (the smallest IoU is compared to min_iou).
            if not (min_iou <= iou.min() and iou.max() <= max_iou):
                continue

            # Rows are y and columns are x, hence the index order.
            image_t = image[roi[1]:roi[3], roi[0]:roi[2]]

            # Keep only boxes whose centre lies strictly inside the crop;
            # retry when none survives.
            centers = (boxes[:, :2] + boxes[:, 2:]) / 2
            mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \
                .all(axis=1)
            boxes_t = boxes[mask].copy()
            labels_t = labels[mask].copy()
            if len(boxes_t) == 0:
                continue

            # Clip the surviving boxes to the crop, then shift them into the
            # crop's coordinate frame.
            boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2])
            boxes_t[:, :2] -= roi[:2]
            boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:])
            boxes_t[:, 2:] -= roi[:2]

            return image_t, boxes_t, labels_t
# Brightness and contrast are adjusted in the image's native colour space
# (BGR, judging by the cv2.COLOR_BGR2HSV conversion below); hue and saturation
# are adjusted in HSV space. Each of the four steps fires with probability 0.5.
def _distort(image):
    """Return a photometrically distorted copy of ``image``.

    The input array is left untouched. In order, and each independently with
    probability 0.5: add a brightness offset in [-32, 32], multiply by a
    contrast factor in [0.5, 1.5], shift the hue channel by an integer in
    [-18, 18] (modulo 180), and multiply the saturation channel by a factor
    in [0.5, 1.5].
    """

    def _scale_shift(target, alpha=1, beta=0):
        """Apply ``target * alpha + beta`` in place, saturating to [0, 255]."""
        adjusted = target.astype(float) * alpha + beta
        target[:] = np.clip(adjusted, 0, 255)

    distorted = image.copy()

    # Brightness offset.
    if random.randrange(2):
        _scale_shift(distorted, beta=random.uniform(-32, 32))

    # Contrast gain.
    if random.randrange(2):
        _scale_shift(distorted, alpha=random.uniform(0.5, 1.5))

    hsv = cv2.cvtColor(distorted, cv2.COLOR_BGR2HSV)

    # Hue rotation; the channel is wrapped modulo 180.
    if random.randrange(2):
        hue = hsv[:, :, 0].astype(int)
        hue += random.randint(-18, 18)
        hsv[:, :, 0] = hue % 180

    # Saturation gain, written through a view of channel 1.
    if random.randrange(2):
        _scale_shift(hsv[:, :, 1], alpha=random.uniform(0.5, 1.5))

    return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
# Expand the image onto a larger canvas with probability p.
# NOTE(review): the original author states that Caffe uses p=0.5 and the
# PyTorch port p=0.6; the value actually used is chosen by the caller.
def _expand(image, boxes, fill, p):
    """Place ``image`` at a random position on a larger, ``fill``-coloured canvas.

    Args:
        image: ``(height, width, depth)`` array.
        boxes: ``(n, 4)`` array of ``(x1, y1, x2, y2)`` boxes in pixel units.
        fill: value broadcast over every canvas pixel outside the pasted image
            (the caller passes the dataset mean, so the border becomes zero
            once the mean is subtracted later).
        p: probability of performing the expansion.

    Returns:
        ``(image, boxes)``. The inputs themselves when the expansion is
        skipped or no valid canvas size was found; otherwise the new canvas
        and a shifted copy of ``boxes``.
    """
    if random.random() > p:
        return image, boxes

    height, width, depth = image.shape
    for _ in range(50):  # at most 50 attempts
        scale = random.uniform(1, 4)

        min_ratio = max(0.5, 1. / scale / scale)
        max_ratio = min(2, scale * scale)
        ratio = math.sqrt(random.uniform(min_ratio, max_ratio))
        ws = scale * ratio
        hs = scale / ratio
        # Both canvas sides must be at least as large as the original image.
        if ws < 1 or hs < 1:
            continue
        w = int(ws * width)
        h = int(hs * height)

        # Random top-left corner of the pasted image inside the canvas.
        left = random.randint(0, w - width)
        top = random.randint(0, h - height)

        # Shift the boxes by the paste offset; the input array is not mutated.
        boxes_t = boxes.copy()
        boxes_t[:, :2] += (left, top)
        boxes_t[:, 2:] += (left, top)

        # Fill the whole canvas, then paste the original pixels on top.
        expand_image = np.empty((h, w, depth), dtype=image.dtype)
        expand_image[:, :] = fill
        expand_image[top:top + height, left:left + width] = image

        return expand_image, boxes_t

    # Every attempt was rejected: hand back the inputs unchanged instead of
    # implicitly returning None, which callers cannot unpack.
    return image, boxes
# Flip horizontally with probability 0.5 and return the image and boxes.
def _mirror(image, boxes):
    """Randomly mirror ``image`` left-to-right and adjust ``boxes`` to match.

    Returns the inputs themselves when no flip is drawn. Otherwise returns a
    reversed view of the image along axis 1 and a copy of ``boxes`` whose x
    coordinates are reflected about the image width (x1 and x2 swap roles so
    that x1 stays the smaller one).
    """
    _rows, width, _channels = image.shape

    if not random.randrange(2):
        return image, boxes

    flipped_boxes = boxes.copy()
    # Columns (2, 0) are read and written to columns (0, 2):
    # new x1 = width - old x2, new x2 = width - old x1.
    flipped_boxes[:, 0::2] = width - boxes[:, 2::-2]
    return image[:, ::-1], flipped_boxes
# Resize with a randomly chosen interpolation method, subtract the mean and
# move the channel axis to the front.
def preproc_for_test(image, insize, mean):
    """Resize ``image`` to ``insize`` x ``insize`` and convert it to CHW float32.

    Args:
        image: image array accepted by ``cv2.resize``.
        insize: side length of the square output.
        mean: value subtracted in place from the float32 image.

    Returns:
        The mean-subtracted image with axes reordered to ``(2, 0, 1)``.
    """
    # One of five OpenCV interpolation flags is drawn uniformly per call.
    candidates = (
        cv2.INTER_LINEAR,
        cv2.INTER_CUBIC,
        cv2.INTER_AREA,
        cv2.INTER_NEAREST,
        cv2.INTER_LANCZOS4,
    )
    chosen = candidates[random.randrange(5)]

    resized = cv2.resize(image, (insize, insize), interpolation=chosen)
    resized = resized.astype(np.float32)
    resized -= mean
    return np.transpose(resized, (2, 0, 1))
# Data-augmentation transform applied to (image, targets) training samples.
class preproc(object):
    """Callable that augments an image and its ground-truth targets.

    Attributes are set from the constructor arguments:
        resize: side length passed to ``preproc_for_test``.
        means: mean value used both as expansion fill and for subtraction.
        p: expansion probability passed to ``_expand``.
    """

    def __init__(self, resize, rgb_means, p):
        self.means = rgb_means
        self.resize = resize
        self.p = p

    def __call__(self, image, targets):
        """Augment one sample.

        Args:
            image: the original image array (height, width, channels).
            targets: ``(n, 5)`` array of ``(x1, y1, x2, y2, label)`` rows.

        Returns:
            ``(tensor, targets)`` where ``tensor`` is the preprocessed image
            and ``targets`` holds boxes as fractions of the image size plus
            the label column. Without ground truth a ``(1, 5)`` zero array is
            returned as targets.
        """
        gt_boxes = targets[:, :-1].copy()
        gt_labels = targets[:, -1].copy()

        # No ground truth: only resize / mean-subtract and return a dummy row.
        if len(gt_boxes) == 0:
            placeholder = np.zeros((1, 5))
            resized = preproc_for_test(image, self.resize, self.means)
            return torch.from_numpy(resized), placeholder

        # Backup of the un-augmented sample with boxes normalised by the
        # original image size; used when augmentation leaves no usable box.
        backup_image = image.copy()
        backup_rows = targets.copy()
        backup_h, backup_w, _ = backup_image.shape
        backup_boxes = backup_rows[:, :-1]
        backup_label_col = backup_rows[:, -1]
        backup_boxes[:, 0::2] /= backup_w
        backup_boxes[:, 1::2] /= backup_h
        backup_targets = np.hstack(
            (backup_boxes, backup_label_col[:, np.newaxis]))

        # Augmentation chain: crop -> photometric distortion -> expand ->
        # horizontal mirror.
        aug_image, gt_boxes, gt_labels = _crop(image, gt_boxes, gt_labels)
        aug_image = _distort(aug_image)
        aug_image, gt_boxes = _expand(aug_image, gt_boxes, self.means, self.p)
        aug_image, gt_boxes = _mirror(aug_image, gt_boxes)

        # Size of the augmented image *before* resizing: boxes are expressed
        # as fractions of it, so the resize itself never touches them.
        aug_h, aug_w, _ = aug_image.shape
        aug_image = preproc_for_test(aug_image, self.resize, self.means)

        norm_boxes = gt_boxes.copy()
        norm_boxes[:, 0::2] /= aug_w
        norm_boxes[:, 1::2] /= aug_h

        # Drop boxes whose shorter side is at most 1% of the image.
        box_w = norm_boxes[:, 2] - norm_boxes[:, 0]
        box_h = norm_boxes[:, 3] - norm_boxes[:, 1]
        keep = np.minimum(box_w, box_h) > 0.01
        kept_boxes = norm_boxes[keep]
        kept_labels = gt_labels[keep].copy()

        # Nothing usable survived: discard the augmentation and fall back to
        # the backed-up original sample.
        if kept_boxes.shape[0] == 0:
            fallback = preproc_for_test(backup_image, self.resize, self.means)
            return torch.from_numpy(fallback), backup_targets

        aug_targets = np.hstack((kept_boxes, kept_labels[:, np.newaxis]))
        return torch.from_numpy(aug_image), aug_targets
class BaseTransform(object):
    """Deterministic preprocessing applied to test/validation images.

    Pipeline: resize -> float32 -> mean subtraction -> axis reorder -> tensor.

    Arguments:
        resize (int): input dimension to SSD
        rgb_means ((int,int,int)): per-channel mean of the dataset,
            e.g. (104,117,123)
        swap ((int,int,int)): final order of the axes
    Returns:
        transform (transform) : callable transform to be applied to test/val
        data
    """

    def __init__(self, resize, rgb_means, swap=(2, 0, 1)):
        self.means = rgb_means
        self.resize = resize
        self.swap = swap

    def __call__(self, img):
        """Return ``img`` as a resized, mean-subtracted, axis-swapped tensor.

        ``img`` is converted with ``np.array`` first, so anything that call
        accepts can be passed in.
        """
        target_size = (self.resize, self.resize)
        # Linear interpolation is always used here.
        resized = cv2.resize(
            np.array(img), target_size, interpolation=cv2.INTER_LINEAR)
        pixels = resized.astype(np.float32)
        pixels -= self.means
        return torch.from_numpy(pixels.transpose(self.swap))
整个Aug的流程是:crop(随机裁剪)→ distort(亮度、对比度、色相、饱和度扰动)→ expand(随机扩展)→ mirror(水平翻转)→ resize并减去均值。
crop的操作最为复杂:
1. 首先随机选取iou_min;
2. 随机选择scale(<1),进行面积上的缩放,再随机出aspect_ratio, 进行长宽比缩放,进一步随机出crop区域的左上角坐标,从而确定crop区域;
3. 判断所有GT与crop区域的iou是否都不小于iou_min(代码中的条件是 min_iou <= iou.min()),若不满足则重新裁剪,最多重复实验50次;
4. 满足3后,再筛选出中心点落在crop区域内的GT;若一个都没有,同样重新裁剪(与3共用50次的上限);
5. 针对crop区域,修改GT信息,主要是边界的调整。
expand的操作如下:
1. 以一定的概率进行扩展;
2. 随机生成scale(>1)和aspect_ratio, 判断扩展后的width和height是否都大于未扩展前的,若不满足,最大重复实验50次;
3. 随机生成扩展图像的左上角坐标,并修改GT信息,主要是进行一个平移;
4. 扩展后的图像像素填充。未扩展前图像的那部分原像素填充,其他部分则均值填充,因为后续还需要减去均值,所以等价于0值填充。