简介
Single Shot MultiBox Detector (SSD) 是一种单阶段的目标检测器。与两阶段的检测方法不同,单阶段目标检测并不进行区域推荐,而是直接从特征图预测目标的分类概率并回归出目标的边界框。SSD 运用了这种单阶段检测的思想,并且对其进行改进:在不同尺度的特征图上检测对应尺度的目标。如下图所示,SSD 在六个尺度的特征图上进行了不同层级的预测。每个层级由两个 3x3 卷积分别预测目标类别置信度和回归边界框偏移。因此对于每个类别,SSD 的六个层级一共会产生 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732 个检测结果。
SSD 可以方便地插入到任何一种标准卷积网络中,比如 VGG、ResNet 或者 MobileNet,这些网络被称作检测器的基网络。在这个示例中我们使用 MobileNet。
在训练时还会对图片进行数据增强,包括随机扰动、扩张、翻转和裁剪:
- 扰动: 扰动图片亮度、对比度、饱和度和色相。
- 扩张: 将原始图片放进一张使用像素均值填充(随后会在减均值操作中减掉)的扩张图中,再对此图进行裁剪、缩放和翻转。
- 翻转: 水平翻转。
- 裁剪: 根据缩放比例、长宽比例两个参数生成若干候选框,再依据这些候选框和标注框的面积交并比(IoU)挑选出符合要求的裁剪结果。
参考链接
- 参考论文:https://arxiv.org/pdf/1512.02325.pdf
- 推荐博客:https://blog.csdn.net/xiaohu2022/article/details/79833786
引入 pascal-voc 数据集,解压,然后删除不必要的图片
In[2]
# Unzip the Pascal VOC archive inside the mounted dataset directory
!cd /home/aistudio/data/data4379 && unzip -q pascalvoc.zip
处理预训练的模型,创建pretrained-model目录,解压模型后移动到该目录下
In[3]
# Stage the ImageNet-pretrained MobileNet v1 weights under ./pretrained-model.
# The target directory must exist before `cp` can copy into it; `-p` makes the
# cell safe to re-run when the directory is already there.
!mkdir -p pretrained-model
!cp data/data5389/mobilenet_v1_imagenet.zip pretrained-model/
# -o overwrites files from a previous run without prompting.
!cd pretrained-model && unzip -qo mobilenet_v1_imagenet.zip
# Flatten the extracted folder into pretrained-model/ and remove the leftovers.
!cd pretrained-model && mv mobilenet_v1_imagenet/* . && rm -r mobilenet_v1_imagenet && rm mobilenet_v1_imagenet.zip
定义训练ssd相关的配置
In[1]
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import uuid
import numpy as np
import time
import six
import math
import paddle
import paddle.fluid as fluid
import logging
import xml.etree.ElementTree
import codecs
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from PIL import Image, ImageEnhance, ImageDraw
# Module-level logger handle; starts unset (presumably assigned by a later
# setup step that is not part of this cell).
logger = None

# Central configuration for SSD training. Keys are looked up by their exact
# spelling, so misspelt-looking ones (e.g. "apply_corp", "rsm_strategy") must
# stay as they are.
train_parameters = {
    # --- model input / dataset description -------------------------------
    # presumably [channels, height, width] -- TODO confirm against the reader
    "input_size": [3, 300, 300],
    # -1 / {} look like placeholders to be filled in at runtime -- TODO confirm
    "class_dim": -1,
    "label_dict": {},
    "image_count": -1,
    "log_feed_image": False,

    # --- checkpoints -------------------------------------------------------
    "pretrained": True,
    "pretrained_model_dir": "./pretrained-model",
    "continue_train": False,
    "save_model_dir": "./ssd-model",
    "model_prefix": "mobilenet-ssd",

    # --- data source -------------------------------------------------------
    "data_dir": "/home/aistudio/data/data4379/pascalvoc",
    "mean_rgb": [127.5, 127.5, 127.5],
    "file_list": "train.txt",
    "mode": "train",
    "multi_data_reader_count": 5,

    # --- training loop -----------------------------------------------------
    "num_epochs": 120,
    "train_batch_size": 64,
    "use_gpu": True,

    # --- data augmentation switches ---------------------------------------
    "apply_distort": True,
    "apply_expand": True,
    "apply_corp": True,  # sic: presumably "crop"
    "image_distort_strategy": {
        "expand_prob": 0.5,
        "expand_max_ratio": 4,
        "hue_prob": 0.5,
        "hue_delta": 18,
        "contrast_prob": 0.5,
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,
        "brightness_delta": 0.125,
    },

    # --- optimizer schedules ----------------------------------------------
    # Five epoch boundaries paired with six decay factors (one per interval).
    "rsm_strategy": {
        "learning_rate": 0.001,
        "lr_epochs": [20, 40, 60, 80, 100],
        "lr_decay": [1, 0.5, 0.25, 0.1, 0.05, 0.01],
    },
    "momentum_strategy": {
        "learning_rate": 0.1,
        "decay_steps": 2 ** 7,
        "decay_rate": 0.8,
    },

    # --- early stopping thresholds ----------------------------------------
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_loss": 1.28,
        "min_curr_map": 0.86,
    },
}
定义基于 mobile-net 的SSD网络结构
In[2]
class MobileNetSSD:
def __init__(self):
pass
def conv_bn(self,
            input,
            filter_size,
            num_filters,
            stride,
            padding,
            num_groups=1,
            act='relu',
            use_cudnn=True):
    """Apply a bias-free 2-D convolution followed by batch normalization.

    The convolution is created without an activation; ``act`` is handed to
    the batch-norm layer instead.

    Args:
        input: Input variable fed to ``fluid.layers.conv2d``.
        filter_size: Convolution kernel size.
        num_filters: Number of output filters.
        stride: Convolution stride.
        padding: Convolution padding.
        num_groups: Passed to ``conv2d`` as ``groups``.
        act: Activation name given to ``fluid.layers.batch_norm``.
        use_cudnn: Forwarded to ``conv2d``.

    Returns:
        The output of ``fluid.layers.batch_norm``.
    """
    # Convolution weights use MSRA initialization with a ParamAttr
    # learning_rate of 0.1.
    weight_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
    conv_out = fluid.layers.conv2d(
        input=input,
        filter_size=filter_size,
        num_filters=num_filters,
        stride=stride,
        padding=padding,
        groups=num_groups,
        use_cudnn=use_cudnn,
        param_attr=weight_attr,
        bias_attr=False,
        act=None)
    normalized = fluid.layers.batch_norm(input=conv_out, act=act)
    return normalized
def depthwise_separable(self, input, num_filters1, num_filters2, num_groups, stride, scale):
    """Build a 3x3 grouped conv-BN followed by a 1x1 conv-BN.

    Channel and group counts are multiplied by ``scale`` and truncated with
    ``int()``. The 3x3 stage is depthwise when ``num_groups`` equals the
    number of input channels.

    Args:
        input: Input variable.
        num_filters1: Unscaled output channels of the 3x3 stage.
        num_filters2: Unscaled output channels of the 1x1 stage.
        num_groups: Unscaled group count of the 3x3 stage.
        stride: Stride of the 3x3 stage (the 1x1 stage always uses 1).
        scale: Width multiplier applied to channel and group counts.

    Returns:
        The output of the 1x1 conv-BN stage.
    """
    dw_channels = int(num_filters1 * scale)
    dw_groups = int(num_groups * scale)
    # The grouped 3x3 stage is built with cuDNN disabled.
    dw_out = self.conv_bn(
        input=input,
        filter_size=3,
        num_filters=dw_channels,
        stride=stride,
        padding=1,
        num_groups=dw_groups,
        use_cudnn=False)

    pw_channels = int(num_filters2 * scale)
    return self.conv_bn(
        input=dw_out,
        filter_size=1,
        num_filters=pw_channels,
        stride=1,
        padding=0)
def extra_block(self, input, num_filters1, num_filters2, num_groups, stride, scale):
    """Build an extra SSD feature block: 1x1 conv-BN then 3x3 conv-BN.

    Args:
        input: Input variable.
        num_filters1: Unscaled output channels of the 1x1 stage.
        num_filters2: Unscaled output channels of the 3x3 stage.
        num_groups: Unscaled group count used by both stages.
        stride: Stride of the 3x3 stage (the 1x1 stage always uses 1).
        scale: Width multiplier applied to channel and group counts.

    Returns:
        The output of the 3x3 conv-BN stage.
    """
    # int() truncates, so e.g. num_groups=1 with scale=0.5 would give 0,
    # which is not a usable convolution group count; never go below 1.
    groups = max(1, int(num_groups * scale))

    # 1x1 conv
    pointwise_conv = self.conv_bn(
        input=input,
        filter_size=1,
        num_filters=int(num_filters1 * scale),
        stride=1,
        num_groups=groups,
        padding=0)

    # 3x3 conv; honours the caller-supplied stride instead of a fixed 2.
    normal_conv = self.conv_bn(
        input=pointwise_conv,
        filter_size=3,
        num_filters=int(num_filters2 * scale),
        stride=stride,
        num_groups=groups,
        padding=1)
    return normal_conv
def net(self, num_classes, img, img_shape, scale=1.0):
# 300x300
tmp = self.conv_bn(img, 3, int(32 * scale), 2, 1)
# 150x150
tmp = self.depthwise_separable(tmp, 32, 64, 32, 1, scale)
tmp = self.depthwise_separable(tmp, 64, 128, 64, 2, scale)
# 75x75
tmp = self.depthwise_separable(tmp, 128, 128, 128, 1, scale)
tmp = self.depthwise_separable(tmp, 128, 256, 128, 2, scale)
# 38x38
tmp = self.depthwise_separable(tmp, 256, 256, 256, 1, scale)
tmp = self.depthwise_separable(tmp, 256, 512, 256, 2, scale)
# 19x19
for i in range(5):
tmp = self.depthwise_separable(tmp, 512, 512, 512, 1, scale)
module11 = tmp
tmp = self.depthwise_separable(tmp, 512, 1024, 512, 2, scale)
# 10x10
module13 = self.depthwise_separable(tmp, 1024, 1024, 1024, 1, scale)
module14 = self.extra_block(module13, 256, 512, 1, 2, scale)