SSD代码解读之三——net

结构

在这里插入图片描述

实现

在vgg16的基础上修改,使用slim实现,比较简单。

  • 超参数定义:
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Build the slim arg_scope that carries the SSD network's layer defaults.

    Three scopes are stacked, in this order:
      1. conv2d / fully_connected: ReLU activation, L2 weight regularisation,
         Xavier weight initialisation and zero bias initialisation.
      2. conv2d / max_pool2d: 'SAME' padding and the requested data format.
      3. the custom pad2d / l2_normalization / channel_to_last layers: the
         requested data format.

    Args:
        weight_decay: factor handed to `slim.l2_regularizer`.
        data_format: data layout string forwarded to the layers above.

    Returns:
        The scope object produced by the innermost `slim.arg_scope`.
    """
    weighted_layer_defaults = dict(
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        biases_initializer=tf.zeros_initializer())
    # Both conv and pool use 'SAME' padding, so with stride 1 the feature-map
    # size is unchanged (slim.conv2d defaults to stride 1, slim.max_pool2d to
    # stride 2).
    spatial_layer_defaults = dict(padding='SAME', data_format=data_format)
    format_aware_layers = [custom_layers.pad2d,
                           custom_layers.l2_normalization,
                           custom_layers.channel_to_last]

    # A multi-item `with` enters the scopes left to right, exactly like
    # nesting them one inside the other.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        **weighted_layer_defaults), \
            slim.arg_scope([slim.conv2d, slim.max_pool2d],
                           **spatial_layer_defaults), \
            slim.arg_scope(format_aware_layers,
                           data_format=data_format) as sc:
        return sc
  • 网络实现
    predictions、localisations、logits三个输出都以list的形式组织, 每个feat_layer对应一个元素; end_points则是以block名为key的dict
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition: a VGG-16 backbone plus extra feature layers and
    one multibox prediction head per selected feature layer.

    The spatial sizes quoted in the inline comments are the original author's
    figures; they assume a 300x300 input and the 'SAME' padding default for
    conv/pool layers (TODO confirm against the arg_scope in use).

    Args:
        inputs: input image tensor fed to the first convolution.
        num_classes: number of classes predicted per anchor.
        feat_layers: names of the `end_points` entries that get a prediction
            head (e.g. 'block4', 'block7', ..., 'block11').
        anchor_sizes: per-feature-layer anchor sizes, indexed like
            `feat_layers`.
        anchor_ratios: per-feature-layer anchor ratios, indexed like
            `feat_layers`.
        normalizations: per-feature-layer normalisation flag passed to
            `ssd_multibox_layer` (L2 normalisation is applied when > 0).
        is_training: forwarded to the dropout layers as `training`.
        dropout_keep_prob: probability of KEEPING a unit in the two dropout
            layers after block6 and block7.
        prediction_fn: function applied to each head's class logits to get
            the entries of `predictions`.
        reuse: forwarded to `tf.variable_scope`.
        scope: variable scope name for the whole network.

    Returns:
        A tuple `(predictions, localisations, logits, end_points)`.
        The first three are lists with one entry per feature layer:
        `prediction_fn(logits)`, the localisation output and the raw class
        output of `ssd_multibox_layer`. `end_points` is a dict mapping
        'block1' ... 'block11' to the corresponding activations.
    """
    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        # Block 1.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')  # (300, 300) * 64
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')  # (150, 150) * 64
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')  # (150, 150) * 128
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')  # (75, 75) * 128
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')  # (75, 75) * 256
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')  # (38, 38) * 256
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')  # (38, 38) * 512
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')  # (19, 19) * 512
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')  # (19, 19) * 512
        end_points['block5'] = net
        # Stride 1, so the spatial size is unchanged: (19, 19) * 512
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Layers added on top of VGG-16.
        # Block 6: dilated (atrous) convolution to enlarge the receptive
        # field; stride 1 keeps the size: (19, 19) * 1024
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        # tf.layers.dropout expects the DROP rate, while this function's
        # parameter is a KEEP probability, so convert. (Identical at the
        # default 0.5; the previous code passed the keep probability as-is.)
        net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob, training=is_training)
        # Block 7: 1x1 conv.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')  # (19, 19) * 1024
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')  # (19, 19) * 256
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')  # (10, 10) * 512
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')  # (10, 10) * 128
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')  # (5, 5) * 256
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')  # (3, 3) * 256
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')  # (1, 1) * 256
        end_points[end_point] = net

        # Prediction heads: one multibox layer per selected feature layer.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # p: class output, reshaped to (..., num_anchors, num_classes)
                # l: box output, reshaped to (..., num_anchors, 4)
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))  # class probabilities
            logits.append(p)
            localisations.append(l)

        return predictions, localisations, logits, end_points

关键函数

  • ssd_multibox_layer: 从feat_layer计算class和location的预测值
    输入是单个feat_layer的特征图(ssd_net中对每个feat_layer各调用一次)
    输出两个值:
    • 分类的logit,shape为(n, h, w, num_anchors_i, num_classes)
    • 坐标偏移,shape为(n, h, w, num_anchors_i, 4)
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Build the class and box-offset prediction heads for one feature map.

    Each head is a 3x3 convolution without activation whose channel axis is
    moved last (`custom_layers.channel_to_last`) and then split into
    `(num_anchors, values_per_anchor)`.

    Args:
        inputs: feature map to predict from.
        num_classes: number of class scores predicted per anchor.
        sizes: anchor sizes for this feature map; only its length is used here.
        ratios: anchor ratios for this feature map; only its length is used
            here.
        normalization: when > 0, the input is L2-normalised (with scaling)
            before the heads are applied.
        bn_normalization: not used by this function body.

    Returns:
        A tuple `(cls_pred, loc_pred)`: the class output reshaped to
        (..., num_anchors, num_classes) and the box output reshaped to
        (..., num_anchors, 4).
    """
    features = inputs
    if normalization > 0:
        features = custom_layers.l2_normalization(features, scaling=True)

    # Default boxes per feature-map location (4 or 6 per the original notes).
    boxes_per_cell = len(sizes) + len(ratios)

    def _prediction_head(scope_name, values_per_box):
        # Conv to boxes_per_cell * values_per_box channels, move channels to
        # the last axis, then split that axis into (boxes, values).
        raw = slim.conv2d(features, boxes_per_cell * values_per_box, [3, 3],
                          activation_fn=None, scope=scope_name)
        raw = custom_layers.channel_to_last(raw)
        split_shape = tensor_shape(raw, 4)[:-1] + [boxes_per_cell, values_per_box]
        return tf.reshape(raw, split_shape)

    # Box offsets: 4 values per box, (cy, cx, h, w) per the original notes.
    loc_pred = _prediction_head('conv_loc', 4)
    # Class scores: num_classes values per box.
    cls_pred = _prediction_head('conv_cls', num_classes)
    return cls_pred, loc_pred

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值