结构
实现
在vgg16的基础上修改,使用slim实现,比较简单。
- 超参数定义:
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Return the slim arg_scope holding the default layer arguments for SSD.

    Args:
      weight_decay: coefficient handed to `slim.l2_regularizer` for the
        weights of conv and fully-connected layers.
      data_format: layout string forwarded to the conv/pool layers and to
        the `custom_layers` helpers.

    Returns:
      The innermost arg_scope object, which carries the defaults of all
      three nested scopes.
    """
    # Layers that own weights: ReLU activation, L2 regularisation,
    # Xavier-initialised weights and zero-initialised biases.
    weighted_layers = [slim.conv2d, slim.fully_connected]
    weighted_defaults = dict(
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=tf.zeros_initializer())

    # Conv and pool both use SAME padding, so a stride-1 op keeps the
    # feature map size (per the original note: conv2d defaults to stride 1,
    # max_pool2d defaults to stride 2).
    spatial_layers = [slim.conv2d, slim.max_pool2d]

    # Project-specific helpers only need to know the data layout.
    layout_layers = [custom_layers.pad2d,
                     custom_layers.l2_normalization,
                     custom_layers.channel_to_last]

    with slim.arg_scope(weighted_layers, **weighted_defaults):
        with slim.arg_scope(spatial_layers,
                            padding='SAME',
                            data_format=data_format):
            with slim.arg_scope(layout_layers,
                                data_format=data_format) as sc:
                return sc
- 网络实现
predictions、localisations、logits都以list的形式组织, 每个feat_layer一个元素; end_points是保存各block输出的dict
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """Build the SSD network: VGG-16 style blocks plus extra feature layers.

    The shape comments below assume a 300x300 input and SAME padding for
    conv/pool layers (as set by the surrounding arg_scope) -- TODO confirm
    against the caller.

    Args:
      inputs: input tensor fed to the first convolution block.
      num_classes: number of classes, forwarded to `ssd_multibox_layer`.
      feat_layers: names of the `end_points` entries to predict from.
      anchor_sizes: per-feature-layer anchor sizes, indexed like
        `feat_layers`.
      anchor_ratios: per-feature-layer anchor ratios, indexed like
        `feat_layers`.
      normalizations: per-feature-layer normalization flags, indexed like
        `feat_layers`.
      is_training: forwarded as `training` to the dropout layers.
      dropout_keep_prob: probability of KEEPING a unit in the dropout
        layers after block6 and block7.
      prediction_fn: function applied to the class logits of every feature
        layer to produce the predictions.
      reuse: forwarded to `tf.variable_scope`.
      scope: variable scope name (default name is 'ssd_300_vgg').

    Returns:
      A tuple `(predictions, localisations, logits, end_points)`. The first
      three are lists with one element per entry of `feat_layers`;
      `end_points` is a dict mapping block names to their output tensors.
    """
    # `tf.layers.dropout` expects the fraction of units to DROP, so the keep
    # probability has to be converted (identical only when it equals 0.5).
    drop_rate = 1.0 - dropout_keep_prob

    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        # Block 1.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')  # (300, 300) * 64
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')  # (150, 150) * 64
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')  # (150, 150) * 128
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')  # (75, 75) * 128
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')  # (75, 75) * 256
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')  # (38, 38) * 256
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')  # (38, 38) * 512
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')  # (19, 19) * 512
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')  # (19, 19) * 512
        end_points['block5'] = net
        # Stride 1, so the spatial size is unchanged: (19, 19) * 512
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Layers added on top of VGG-16.
        # Block 6: dilated (atrous) convolution to enlarge the receptive
        # field; stride 1 with SAME padding keeps the size: (19, 19) * 1024
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        net = tf.layers.dropout(net, rate=drop_rate, training=is_training)
        # Block 7: 1x1 conv.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')  # (19, 19) * 1024
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=drop_rate, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')  # (19, 19) * 256
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')  # (10, 10) * 512
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')  # (10, 10) * 128
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')  # (5, 5) * 256
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')  # (3, 3) * 256
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')  # (1, 1) * 256
        end_points[end_point] = net

        # Prediction part: one multibox head per selected feature layer,
        # e.g. ['block4', 'block7', 'block8', 'block9', 'block10', 'block11'].
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # cls_logits: (n, h, w, num_anchors, num_classes)
                # loc_offsets: (n, h, w, num_anchors, 4)
                cls_logits, loc_offsets = ssd_multibox_layer(
                    end_points[layer],
                    num_classes,
                    anchor_sizes[i],
                    anchor_ratios[i],
                    normalizations[i])
            # Class probabilities: (n, h, w, num_anchors, num_classes)
            predictions.append(prediction_fn(cls_logits))
            logits.append(cls_logits)
            localisations.append(loc_offsets)

        return predictions, localisations, logits, end_points
关键函数
- ssd_multibox_layer: 从feat_layer计算class和location的预测值
输入是单个feat_layer的输出(每个feat_layer调用一次)
输出两个值:
- 分类的logit,shape为(n, h, w, num_anchors_i, num_classes)
- 坐标偏移,shape为(n, h, w, num_anchors_i, 4)
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=(1,),
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return class and localization predictions.

    Args:
      inputs: feature map tensor of one feature layer.
      num_classes: number of classes predicted per anchor.
      sizes: anchor sizes for this layer; only its length is used here.
      ratios: anchor ratios for this layer; only its length is used here.
        The default is an immutable tuple so it cannot be shared and
        mutated across calls.
      normalization: when greater than 0, the input is passed through
        `custom_layers.l2_normalization` with `scaling=True` first.
      bn_normalization: not used by this function body.

    Returns:
      A tuple `(cls_pred, loc_pred)`: class logits reshaped to
      `(..., num_anchors, num_classes)` and location offsets reshaped to
      `(..., num_anchors, 4)`, where `...` are the leading dimensions
      reported by `tensor_shape` (presumably `(n, h, w)` -- TODO confirm).
    """
    net = inputs
    if normalization > 0:
        net = custom_layers.l2_normalization(net, scaling=True)

    # Number of default boxes at every feature map location
    # (the original note says 4 or 6).
    num_anchors = len(sizes) + len(ratios)

    # Location offsets: convolve to num_anchors * 4 channels, move the
    # channel axis last, then split it into (num_anchors, 4).
    num_loc_pred = num_anchors * 4  # (cy, cx, h, w) * num_anchors
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')  # (n, h, w, num_anchors * 4)
    loc_pred = custom_layers.channel_to_last(loc_pred)
    loc_shape = tensor_shape(loc_pred, 4)[:-1] + [num_anchors, 4]
    loc_pred = tf.reshape(loc_pred, loc_shape)  # (n, h, w, num_anchors, 4)

    # Classification: convolve to num_anchors * num_classes channels, move
    # the channel axis last, then split it into (num_anchors, num_classes).
    num_cls_pred = num_anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')  # (n, h, w, num_anchors * num_classes)
    cls_pred = custom_layers.channel_to_last(cls_pred)
    cls_shape = tensor_shape(cls_pred, 4)[:-1] + [num_anchors, num_classes]
    cls_pred = tf.reshape(cls_pred, cls_shape)  # (n, h, w, num_anchors, num_classes)

    return cls_pred, loc_pred