代码地址:RefineDet
相关链接:
- 《RefineDet:Single-Shot Refinement Neural Network for Object Detection》论文笔记
- RefineDet:(1)训练脚本解析
- RefineDet:(3)C++测试代码
这部分讲解的是从TCB模块构建之后创建ARM与ODM模块的代码。这里涉及到的RefineDet源自于SSD,但是设计的灵感来自于FPN、Faster RCNN。这里的ARM模块就是相当于RPN网络在基础anchor的基础上对默认检测框进行优化,之后再送到ODM模块进行进一步边界框回归,这部分回归的算法就和SSD的方法类似了,下面的代码就是ARM与ODM模块的构建部分,生成之后就是在训练脚本中构建Loss了。
'''
所用文件位于:RefineDet-master\python\caffe\model_libs.py
'''
# CreateRefineDetHead函数是本文关于网络结构构造的重点,这部分代码也是在原来SSD的
# CreateMultiBoxHead函数 基础上修改得到的,可以看作是将原来SSD的CreateMultiBoxHead函数
# 内容实现了两遍,一遍用来实现ARM部分(类似于Faster RCNN中的RPN网络),另一遍用来实现ODM部分。
# from_layers和from_layers2是两个重点输入, 分别对应论文中Figure1的ARM和ODM两部分输出。
# 因此这两遍实现除了输入不同外,另一个不同是ARM部分 是类似RPN网络的bbox回归和二分类,
# 而ODM部分是类似SSD检测网络的bbox回归和object分类。这个函数的返回值:
# mbox_layers[0]是"arm_loc",表示bbox的回归输出;
# mbox_layers[1]是"arm_conf",表示bbox的分类输出(是否是object的二分类);
# mbox_layers[2]是"arm_priorbox",表示priorbox(anchor)的信息;
# mbox_layers[3]是"odm_loc",表示bbox的回归输出;
# mbox_layers[4]是"odm_conf",表示bbox的分类输出(类别数是所有object的类别数+背景)。
def CreateRefineDetHead(net, data_layer="data", num_classes=[], from_layers=[], from_layers2=[],
        normalizations=[], use_batchnorm=True, lr_mult=1, min_sizes=[], max_sizes=[], prior_variance=[0.1],
        aspect_ratios=[], steps=[], img_height=0, img_width=0, share_location=True,
        flip=True, clip=True, offset=0.5, inter_layer_depth=[], kernel_size=1, pad=0,
        conf_postfix='', loc_postfix='', **bn_param):
    """Add the RefineDet ARM and ODM prediction heads to ``net``.

    Two passes are made. The first ("arm") walks ``from_layers`` and adds, per
    source layer, a location branch, a 2-class confidence branch and a PriorBox
    layer. The second ("odm") walks ``from_layers2`` and adds a location branch
    and a ``num_classes``-way confidence branch; it creates no PriorBox layers.
    The per-layer outputs of each kind are concatenated into ``arm_loc``,
    ``arm_conf``, ``arm_priorbox``, ``odm_loc`` and ``odm_conf``.

    Args:
        net: Net specification object. It must support ``keys()``, item
            get/set by layer name and ``update(name, params)``.
        data_layer: Name of the layer passed as second bottom to every PriorBox.
        num_classes: Number of classes predicted per prior by the ODM
            confidence branch. Must be truthy and greater than zero.
        from_layers: Source layer names for the ARM pass.
        from_layers2: Source layer names for the ODM pass. Must have the same
            length as ``from_layers`` because every per-layer setting below is
            looked up by position for both passes.
        normalizations: Optional per-layer constant used to initialise an
            ``L.Normalize`` layer inserted before the branches; ``-1`` skips
            the layer at that position.
        use_batchnorm: Forwarded to ``ConvBNLayer`` as ``use_bn``.
        lr_mult: Forwarded to ``ConvBNLayer``.
        min_sizes: Per-layer PriorBox ``min_size`` (scalar or list).
        max_sizes: Optional per-layer PriorBox ``max_size`` (scalar or list).
        prior_variance: Forwarded to PriorBox as ``variance``.
        aspect_ratios: Optional per-layer PriorBox ``aspect_ratio``
            (scalar or list).
        steps: Optional per-layer PriorBox ``step``.
        img_height: With ``img_width``, sets PriorBox ``img_size`` (when both
            are equal) or ``img_h``/``img_w``; ignored if either is 0.
        img_width: See ``img_height``.
        share_location: When false, location outputs are multiplied by the
            number of classes of the respective pass (2 for ARM,
            ``num_classes`` for ODM).
        flip: Forwarded to PriorBox when aspect ratios are given; also adds
            one extra prior per aspect ratio to the per-location count.
        clip: Forwarded to PriorBox.
        offset: Forwarded to PriorBox.
        inter_layer_depth: Optional per-layer switch; a value > 0 inserts a
            ``ResBody`` block before the branches. Only the sign is used: the
            ResBody widths are fixed at 256/256/1024.
        kernel_size: Kernel size of the prediction convolutions.
        pad: Padding of the prediction convolutions.
        conf_postfix: Suffix appended to confidence layer names.
        loc_postfix: Suffix appended to location layer names.
        **bn_param: Extra keyword arguments forwarded to ``ConvBNLayer``.

    Returns:
        list: ``[arm_loc, arm_conf, arm_priorbox, odm_loc, odm_conf]`` tops.

    Raises:
        AssertionError: If a required argument is missing or the per-layer
            lists do not line up with ``from_layers``.
    """
    # NOTE: the list defaults in the signature are never mutated in here, so
    # they are kept as-is for interface compatibility.
    assert num_classes, "must provide num_classes"
    assert num_classes > 0, "num_classes must be positive number"
    if normalizations:
        assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length"
    assert len(from_layers) == len(min_sizes), "from_layers and min_sizes should have same length"
    if max_sizes:
        assert len(from_layers) == len(max_sizes), "from_layers and max_sizes should have same length"
    if aspect_ratios:
        assert len(from_layers) == len(aspect_ratios), "from_layers and aspect_ratios should have same length"
    if steps:
        assert len(from_layers) == len(steps), "from_layers and steps should have same length"
    net_layers = net.keys()
    assert data_layer in net_layers, "data_layer is not in net's layers"
    if inter_layer_depth:
        assert len(from_layers) == len(inter_layer_depth), "from_layers and inter_layer_depth should have same length"
    # The ODM pass reuses every per-layer setting by position, so a length
    # mismatch would either raise a bare IndexError or silently build a head
    # that does not line up with the ARM priors.
    assert len(from_layers2) == len(from_layers), "from_layers2 and from_layers should have same length"

    # ---- ARM pass: location + binary confidence + PriorBox per source layer.
    prefix = 'arm'
    num_classes_rpn = 2  # ARM confidence is a two-way score per prior.
    priorbox_layers = []
    loc_layers = []
    conf_layers = []
    priors_per_layer = []  # reused by the ODM pass, which shares the priors.
    for i, source_name in enumerate(from_layers):
        from_layer = _ResolveSourceLayer(net, source_name, i, normalizations, inter_layer_depth)
        min_size, max_size, aspect_ratio, num_priors_per_location = _PriorBoxSettings(
            i, min_sizes, max_sizes, aspect_ratios, flip)
        priors_per_layer.append(num_priors_per_location)
        step = steps[i] if len(steps) > i else []

        # Location branch: 4 values per prior (per class if not shared).
        name = "{}_mbox_loc{}".format(from_layer, loc_postfix)
        num_loc_output = num_priors_per_location * 4
        if not share_location:
            num_loc_output *= num_classes_rpn
        loc_layers.append(_AddFlatPrediction(net, from_layer, name, num_loc_output,
            use_batchnorm, lr_mult, kernel_size, pad, bn_param))

        # Confidence branch: num_classes_rpn scores per prior.
        name = "{}_mbox_conf{}".format(from_layer, conf_postfix)
        num_conf_output = num_priors_per_location * num_classes_rpn
        conf_layers.append(_AddFlatPrediction(net, from_layer, name, num_conf_output,
            use_batchnorm, lr_mult, kernel_size, pad, bn_param))

        # PriorBox layer; optional parameters are only set when provided.
        name = "{}_mbox_priorbox".format(from_layer)
        net[name] = L.PriorBox(net[from_layer], net[data_layer], min_size=min_size,
            clip=clip, variance=prior_variance, offset=offset)
        if max_size:
            net.update(name, {'max_size': max_size})
        if aspect_ratio:
            net.update(name, {'aspect_ratio': aspect_ratio, 'flip': flip})
        if step:
            net.update(name, {'step': step})
        if img_height != 0 and img_width != 0:
            if img_height == img_width:
                net.update(name, {'img_size': img_height})
            else:
                net.update(name, {'img_h': img_height, 'img_w': img_width})
        priorbox_layers.append(net[name])

    # Merge the per-layer ARM outputs.
    mbox_layers = []
    name = '{}{}'.format(prefix, "_loc")
    net[name] = L.Concat(*loc_layers, axis=1)
    mbox_layers.append(net[name])
    name = '{}{}'.format(prefix, "_conf")
    net[name] = L.Concat(*conf_layers, axis=1)
    mbox_layers.append(net[name])
    name = '{}{}'.format(prefix, "_priorbox")
    net[name] = L.Concat(*priorbox_layers, axis=2)
    mbox_layers.append(net[name])

    # ---- ODM pass: location + multi-class confidence per source layer.
    # No PriorBox layers are created in this pass.
    prefix = 'odm'
    loc_layers = []
    conf_layers = []
    for i, source_name in enumerate(from_layers2):
        from_layer = _ResolveSourceLayer(net, source_name, i, normalizations, inter_layer_depth)
        num_priors_per_location = priors_per_layer[i]

        name = "{}_mbox_loc{}".format(from_layer, loc_postfix)
        num_loc_output = num_priors_per_location * 4
        if not share_location:
            num_loc_output *= num_classes
        loc_layers.append(_AddFlatPrediction(net, from_layer, name, num_loc_output,
            use_batchnorm, lr_mult, kernel_size, pad, bn_param))

        name = "{}_mbox_conf{}".format(from_layer, conf_postfix)
        num_conf_output = num_priors_per_location * num_classes
        conf_layers.append(_AddFlatPrediction(net, from_layer, name, num_conf_output,
            use_batchnorm, lr_mult, kernel_size, pad, bn_param))

    # Merge the per-layer ODM outputs.
    name = '{}{}'.format(prefix, "_loc")
    net[name] = L.Concat(*loc_layers, axis=1)
    mbox_layers.append(net[name])
    name = '{}{}'.format(prefix, "_conf")
    net[name] = L.Concat(*conf_layers, axis=1)
    mbox_layers.append(net[name])
    return mbox_layers


def _AsList(value):
    """Return ``value`` unchanged if it is a list, else wrap it in one."""
    return value if isinstance(value, list) else [value]


def _ResolveSourceLayer(net, from_layer, index, normalizations, inter_layer_depth):
    """Insert the optional Normalize / ResBody layers and return the name to branch from."""
    if normalizations and normalizations[index] != -1:
        norm_name = "{}_norm".format(from_layer)
        net[norm_name] = L.Normalize(net[from_layer], scale_filler=dict(type="constant", value=normalizations[index]),
            across_spatial=False, channel_shared=False)
        from_layer = norm_name
    if inter_layer_depth and inter_layer_depth[index] > 0:
        inter_name = "{}_inter".format(from_layer)
        ResBody(net, from_layer, inter_name, out2a=256, out2b=256, out2c=1024, stride=1, use_branch1=True)
        # Assumes ResBody names its output "res<block name>" - confirm against ResBody.
        from_layer = "res{}".format(inter_name)
    return from_layer


def _PriorBoxSettings(index, min_sizes, max_sizes, aspect_ratios, flip):
    """Return ``(min_size, max_size, aspect_ratio, num_priors_per_location)`` for one layer."""
    min_size = _AsList(min_sizes[index])
    aspect_ratio = []
    if len(aspect_ratios) > index:
        aspect_ratio = _AsList(aspect_ratios[index])
    max_size = []
    if len(max_sizes) > index:
        max_size = _AsList(max_sizes[index])
        if max_size:
            assert len(max_size) == len(min_size), "max_size and min_size should have same length."
    # One prior per min_size, one more when a max_size is given, plus one per
    # aspect ratio (doubled below when flipped ratios are enabled).
    if max_size:
        num_priors_per_location = (2 + len(aspect_ratio)) * len(min_size)
    else:
        num_priors_per_location = (1 + len(aspect_ratio)) * len(min_size)
    if flip:
        num_priors_per_location += len(aspect_ratio) * len(min_size)
    return min_size, max_size, aspect_ratio, num_priors_per_location


def _AddFlatPrediction(net, from_layer, name, num_output, use_batchnorm, lr_mult,
        kernel_size, pad, bn_param):
    """Add a conv -> permute -> flatten prediction branch and return its flattened top."""
    ConvBNLayer(net, from_layer, name, use_bn=use_batchnorm, use_relu=False, lr_mult=lr_mult,
        num_output=num_output, kernel_size=kernel_size, pad=pad, stride=1, **bn_param)
    # Move axis 1 (presumably channels) to the end so that, after flattening,
    # the values belonging to one spatial location stay contiguous.
    permute_name = "{}_perm".format(name)
    net[permute_name] = L.Permute(net[name], order=[0, 2, 3, 1])
    flatten_name = "{}_flat".format(name)
    net[flatten_name] = L.Flatten(net[permute_name], axis=1)
    return net[flatten_name]