-
SSD是一个利用多尺度特性的目标检测网络,特征提取部分使用VGG backbone,后面作者又添加了多尺度feature maps,并从这些feature maps上的固定位置处划分bounding box,对这些box进行分类和边框尺寸回归达到目标检测目的。
-
总体架构:
- VGG16特征提取。注意:这里对最后的全连接层进行了修改
- 多尺度特征层拓展
- 边框分类和回归
-
预备知识:
- VGG16的创建方式参考Pytorch VGG16源码解读
- 边框分类和回归的一些基础知识还需参考CNN based目标检测始祖 R-CNN
-
下文代码来源:https://github.com/amdegroot/ssd.pytorch 。我将在代码中适当位置加入注解(中文),帮助分析SSD。
1. SSD网络结构
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from layers import *
from data import voc, coco
import os
# SSD类是所有功能的集合体
class SSD(nn.Module):
    """Single Shot Multibox Architecture.

    The network is composed of a base VGG network followed by the
    added multibox conv layers.  Each multibox layer branches into
        1) conv2d for class conf scores
        2) conv2d for localization predictions
        3) associated priorbox layer to produce default bounding
           boxes specific to the layer's feature map size.
    See: https://arxiv.org/pdf/1512.02325.pdf for more details.

    Args:
        phase: (string) Can be "test" or "train"
        size: input image size
        base: VGG16 layers for input, size of either 300 or 500
        extras: extra layers that feed to multibox loc and conf layers
        head: "multibox head" consists of loc and conf conv layers
        num_classes: number of object classes, background included
    """
    def __init__(self, phase, size, base, extras, head, num_classes):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # 21 classes -> VOC config, otherwise COCO.
        self.cfg = (coco, voc)[num_classes == 21]
        self.priorbox = PriorBox(self.cfg)
        # FIX: `volatile=True` was removed in PyTorch >= 0.4 (this post
        # targets PyTorch 1.5); torch.no_grad() gives the same "no autograd
        # history" effect for the fixed prior boxes.
        with torch.no_grad():
            self.priors = Variable(self.priorbox.forward())
        self.size = size

        # SSD network
        self.vgg = nn.ModuleList(base)
        # Layer learns to scale the l2 normalized features from conv4_3.
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        # `head` holds plain Python lists of conv layers (see multibox()
        # below); wrapping them in nn.ModuleList registers their parameters
        # so they take part in backpropagation.
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        """Applies network layers and ops on input image(s) x.

        Args:
            x: input image or batch of images. Shape: [batch,3,300,300].

        Return:
            Depending on phase:
            test:
                Variable(tensor) of output class label predictions,
                confidence score, and corresponding location predictions for
                each object detected. Shape: [batch,topk,7]
            train:
                list of concat outputs from:
                    1: confidence layers, Shape: [batch*num_priors,num_classes]
                    2: localization layers, Shape: [batch,num_priors*4]
                    3: priorbox layers, Shape: [2,num_priors*4]
        """
        sources = list()  # feature maps used for classification/regression
        loc = list()
        conf = list()

        # apply vgg up to conv4_3 relu
        for k in range(23):
            x = self.vgg[k](x)
        # first source map: L2-normalized conv4_3 output
        s = self.L2Norm(x)
        sources.append(s)

        # apply vgg up to fc7 (conv7) -- second source map
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache every second output as a source map
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers; permute moves channels last
        # so per-location predictions are contiguous before the view() below
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        # view works like reshape: flatten every head output per batch item
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),  # loc preds
                self.softmax(conf.view(conf.size(0), -1,
                             self.num_classes)),  # conf preds
                self.priors.type(type(x.data))  # default boxes
            )
        else:
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors
            )
        return output

    def load_weights(self, base_file):
        """Load a serialized state dict (.pth or .pkl) from `base_file`."""
        other, ext = os.path.splitext(base_file)
        # FIX: the original condition `ext == '.pkl' or '.pth'` was always
        # truthy ('.pth' is a non-empty string), so the unsupported-extension
        # branch below was unreachable.
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(torch.load(base_file,
                                 map_location=lambda storage, loc: storage))
            print('Finished!')
        else:
            print('Sorry only .pth and .pkl files supported.')
# This function is derived from torchvision VGG make_layers()
# https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py
'''
搭建VGG网络
最后输出1024
'''
def vgg(cfg, i, batch_norm=False):
    """Build the (modified) VGG-16 feature extractor as a flat layer list.

    cfg:        list mixing output-channel counts with pooling markers:
                'M' = plain 2x2 max-pool (even feature sizes),
                'C' = ceil-mode 2x2 max-pool (copes with odd feature sizes).
    i:          number of input channels (3 for RGB).
    batch_norm: insert BatchNorm2d after every convolution when True.

    Unlike the original VGG-16, the fully connected layers are replaced by
    conv6 (3x3, dilation 6) and conv7 (1x1), both with 1024 channels.
    """
    modules = []
    prev_channels = i
    for item in cfg:
        if item == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        elif item == 'C':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2,
                                        ceil_mode=True))
        else:
            conv = nn.Conv2d(prev_channels, item, kernel_size=3, padding=1)
            if batch_norm:
                modules.extend([conv, nn.BatchNorm2d(item),
                                nn.ReLU(inplace=True)])
            else:
                modules.extend([conv, nn.ReLU(inplace=True)])
            prev_channels = item
    # fc6/fc7 of the original VGG become (dilated) convolutions here.
    modules.extend([
        nn.MaxPool2d(kernel_size=3, stride=1, padding=1),            # pool5
        nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),  # conv6
        nn.ReLU(inplace=True),
        nn.Conv2d(1024, 1024, kernel_size=1),                        # conv7
        nn.ReLU(inplace=True),
    ])
    return modules
'''
VGG之后额外的特征提取网络
这一部分需要额外注意的是for循环部分!
'''
def add_extras(cfg, i, batch_norm=False):
    """Build the extra feature-scaling convolutions appended after VGG.

    cfg uses 'S' as a marker meaning "this conv has stride 2"; when 'S' is
    seen, the conv's output channels come from the *next* cfg entry.
    Kernel sizes alternate between 1x1 and 3x3 via `use_3x3`.
    """
    extra_layers = []
    channels = i
    use_3x3 = False  # toggles the (1, 3)[...] kernel-size trick
    for idx, entry in enumerate(cfg):
        # When the previous entry was 'S', `channels` still holds 'S': that
        # conv was already emitted together with the marker, so this
        # iteration only refreshes `channels` with the real count.
        if channels != 'S':
            if entry == 'S':
                extra_layers.append(
                    nn.Conv2d(channels, cfg[idx + 1],
                              kernel_size=(1, 3)[use_3x3],
                              stride=2, padding=1))
            else:
                extra_layers.append(
                    nn.Conv2d(channels, entry,
                              kernel_size=(1, 3)[use_3x3]))
            use_3x3 = not use_3x3
        channels = entry
    return extra_layers
def multibox(vgg, extra_layers, cfg, num_classes):
    """Attach the localization / classification heads to the source layers.

    cfg[k] is the number of default boxes per location on source map k; each
    head predicts 4 offsets (loc) or num_classes scores (conf) per box.
    Returns (vgg, extra_layers, (loc_layers, conf_layers)).
    """
    loc_layers = []
    conf_layers = []
    # Backbone source maps: conv4_3 (index 21) and conv7 (second-to-last).
    vgg_source = [21, -2]
    for k, idx in enumerate(vgg_source):
        channels = vgg[idx].out_channels
        loc_layers.append(nn.Conv2d(channels, cfg[k] * 4,
                                    kernel_size=3, padding=1))
        conf_layers.append(nn.Conv2d(channels, cfg[k] * num_classes,
                                     kernel_size=3, padding=1))
    # Every second extra layer is a source map (matches the `k % 2 == 1`
    # selection in SSD.forward); enumerate starts at 2 to continue cfg.
    for k, layer in enumerate(extra_layers[1::2], 2):
        loc_layers.append(nn.Conv2d(layer.out_channels, cfg[k] * 4,
                                    kernel_size=3, padding=1))
        conf_layers.append(nn.Conv2d(layer.out_channels,
                                     cfg[k] * num_classes,
                                     kernel_size=3, padding=1))
    return vgg, extra_layers, (loc_layers, conf_layers)
# cfg: per-input-size configuration tables, keyed by image size ('300'/'512';
# only the 300x300 variant is filled in).
base = {
    # VGG-16 channel plan; 'M' = 2x2 max-pool, 'C' = ceil-mode max-pool
    # (handles odd feature-map sizes) -- consumed by vgg().
    '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
            512, 512, 512],
    '512': [],
}
extras = {
    # Extra-layer channel plan; 'S' marks a stride-2 conv whose output
    # channels come from the next entry -- consumed by add_extras().
    '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256],
    '512': [],
}
mbox = {
    '300': [4, 6, 6, 6, 4, 4],  # number of boxes per feature map location
    '512': [],
}
"""
phase - test/train
size - inputImg size
num_class - 输出类别
"""
def build_ssd(phase, size=300, num_classes=21):
    """Construct a complete SSD model.

    phase:       'train' or 'test'
    size:        input image size (only 300 is currently supported)
    num_classes: number of output classes (21 = VOC's 20 + background)

    Returns the SSD module, or None (after printing an error) on bad args.
    """
    if phase not in ("test", "train"):
        print("ERROR: Phase: " + phase + " not recognized")
        return
    if size != 300:
        print("ERROR: You specified size " + repr(size) + ". However, " +
              "currently only SSD300 (size=300) is supported!")
        return
    key = str(size)
    base_, extras_, head_ = multibox(vgg(base[key], 3),
                                     add_extras(extras[key], 1024),
                                     mbox[key], num_classes)
    return SSD(phase, size, base_, extras_, head_, num_classes)
2. train.py – 从 GitHub 下载下来的原始代码有很多报错!这里我把修改好、可以直接运行的版本贴出来(但数据集的路径需要各位对应改一下,其他都 OK 了)。注意:此处使用 1.5 版 PyTorch。
from data import *
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
from ssd import build_ssd
import os
import sys
import time
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import torch.utils.data as data
import numpy as np
import argparse
# Convert a CLI string into a bool (argparse's type=bool is a trap:
# any non-empty string is truthy).
def str2bool(v):
    """Return True iff `v` is one of the accepted truthy spellings."""
    truthy = ("yes", "true", "t", "1")
    return v.lower() in truthy
# Default paths / hyper-parameters used as argparse defaults below.
root_path = os.path.abspath("")
data_path = root_path + "/data"
vgg16_reducedfc_path = data_path + "/weights/vgg16_reducedfc.pth"
check_pointing_weights = root_path + "/weights/ssd300_COCO_45000.pth"
have_cuda = True
batch_size = 32
parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training With Pytorch')
# NOTE(review): this mutually-exclusive group is created but never used --
# every argument below is added directly to `parser`.
train_set = parser.add_mutually_exclusive_group()
parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
                    type=str, help='VOC or COCO')
parser.add_argument('--dataset_root', default=VOC_ROOT,
                    help='Dataset root directory path')
parser.add_argument('--basenet', default=vgg16_reducedfc_path,
                    help='Pretrained base model')
parser.add_argument('--batch_size', default=batch_size, type=int,
                    help='Batch size for training')
parser.add_argument('--resume', default=check_pointing_weights, type=str,
                    help='Checkpoint state_dict file to resume training from')
parser.add_argument('--start_iter', default=45000, type=int,
                    help='Resume training at this iter')  # resume mid-run from this iteration
parser.add_argument('--num_workers', default=8, type=int,
                    help='Number of workers used in dataloading')
parser.add_argument('--cuda', default=have_cuda, type=str2bool,
                    help='Use CUDA to train model')
parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float,
                    help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float,
                    help='Momentum value for optim')
parser.add_argument('--weight_decay', default=5e-4, type=float,
                    help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1, type=float,
                    help='Gamma update for SGD')
parser.add_argument('--visdom', default=False, type=str2bool,
                    help='Use visdom for loss visualization')
parser.add_argument('--save_folder', default='weights/',
                    help='Directory for saving checkpoint models')
args = parser.parse_args()
# Choose the default tensor type: CUDA floats when a GPU is present and
# --cuda is requested, CPU floats otherwise.
if torch.cuda.is_available():
    if args.cuda:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    if not args.cuda:
        print("WARNING: It looks like you have a CUDA device, but aren't " +
              "using CUDA.\nRun with --cuda for optimal training speed.")
        torch.set_default_tensor_type('torch.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')
# Make sure the checkpoint directory exists before training starts.
if not os.path.exists(args.save_folder):
    os.mkdir(args.save_folder)
# Weight-initialization helper.
def xavier(param):
    """Fill `param` in-place with Xavier/Glorot uniform values."""
    init.xavier_uniform_(param)
def weights_init(m):
    """Xavier-init conv weights and zero conv biases (for Module.apply)."""
    if not isinstance(m, nn.Conv2d):
        return
    init.xavier_uniform_(m.weight.data)
    m.bias.data.zero_()
# Visdom visualization helper.
def create_vis_plot(_xlabel, _ylabel, _title, _legend):
    """Create an empty 3-line visdom plot (loc / conf / total loss).

    NOTE(review): relies on a module-level `viz`; in this file `viz` is
    assigned inside train(), so this raises NameError unless train()
    publishes it globally -- confirm before enabling --visdom.
    """
    plot_opts = dict(
        xlabel=_xlabel,
        ylabel=_ylabel,
        title=_title,
        legend=_legend
    )
    x0 = torch.zeros((1,)).cpu()
    y0 = torch.zeros((1, 3)).cpu()
    return viz.line(X=x0, Y=y0, opts=plot_opts)
# Visdom update helper.
def update_vis_plot(iteration, loc, conf, window1, window2, update_type, epoch_size=1):
    """Append (loc, conf, loc+conf) to visdom window1; on iteration 0 also
    seed window2 with the same (undivided) values.

    `epoch_size` divides the plotted values so per-epoch curves show means.
    """
    y_vals = torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu()
    viz.line(
        X=torch.ones((1, 3)).cpu() * iteration,
        Y=y_vals / epoch_size,
        win=window1,
        update=update_type
    )
    # initialize epoch plot on first iteration
    if iteration == 0:
        viz.line(
            X=torch.zeros((1, 3)).cpu(),
            Y=y_vals,
            win=window2,
            update=True
        )
def adjust_learning_rate(optimizer, gamma, step):
    """Apply stepped LR decay: lr = args.lr * gamma**step on every group.

    Adapted from the PyTorch ImageNet example:
    https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    new_lr = args.lr * (gamma ** step)
    for group in optimizer.param_groups:
        group['lr'] = new_lr
def train():
    """Full training loop: dataset prep, network/optimizer setup, and the
    iteration loop with periodic checkpointing and optional visdom plots."""
    # FIX: create_vis_plot/update_vis_plot read a module-level `viz`.
    # Without `global`, the assignment below would create a function local
    # and those helpers would raise NameError whenever --visdom is enabled.
    global viz

    # --- dataset selection ---
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        if args.dataset_root == COCO_ROOT:
            parser.error('Must specify dataset if specifying dataset_root')
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(cfg['min_dim'], MEANS))

    # --- optional visdom server connection ---
    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    # --- network ---
    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net
    if args.cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    # Resume from a checkpoint, or start from the pretrained VGG base
    # (shows how to load partial pretrained weights into a submodule).
    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.basenet)  # vgg16_reducedfc
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('Initializing weights...')
        # initialize newly added layers' weights with xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    # --- optimizer and loss ---
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading the dataset...')
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0
    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    # pin_memory keeps batches in page-locked RAM so host->GPU copies are
    # faster (at the cost of non-swappable memory).
    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)
    data_lenth = str(len(data_loader))
    # create batch iterator
    batch_iterator = iter(data_loader)
    for iteration in range(args.start_iter, cfg['max_iter']):
        # visdom: flush per-epoch loss averages
        if args.visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        # stepped learning-rate decay
        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

        # load train data; the loader is exhausted once per epoch, so
        # recreate the iterator instead of letting StopIteration kill the
        # (iteration-count based) training loop.
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(data_loader)
            images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            with torch.no_grad():
                targets = [Variable(ann.cuda()) for ann in targets]
        else:
            images = Variable(images)
            with torch.no_grad():
                targets = [Variable(ann) for ann in targets]

        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)  # predictions, targets
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        # .item() replaces the pre-0.4 `.data[0]` idiom, which raises
        # IndexError on 0-dim tensors in modern PyTorch.
        loc_loss += loss_l.item()
        conf_loss += loss_c.item()

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + "/" + data_lenth +
                  ' || Loss: %.4f ||' % (loss.item()), end=' ')

        if args.visdom:
            # FIX: this call still used `.data[0]`, which errors on 0-dim
            # tensors in PyTorch >= 0.5 -- use .item() like the counters above.
            update_vis_plot(iteration, loss_l.item(), loss_c.item(),
                            iter_plot, epoch_plot, 'append')

        # periodic checkpoint every 5000 iterations
        # NOTE(review): the path is hard-coded and ignores args.save_folder /
        # args.dataset; kept as-is to preserve existing checkpoint names.
        if iteration != 0 and iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(), 'weights/ssd300_COCO_' +
                       repr(iteration) + '.pth')

    # training finished: save the final weights
    torch.save(ssd_net.state_dict(), args.save_folder + '' + args.dataset + '.pth')
# Run training only when executed as a script (not on import).
if __name__ == '__main__':
    train()
- Reference
[1] PyTorch 中的 ModuleList 和 Sequential: 区别和使用场景