Self-Study Notes on Object Detection: SSD

This post documents my self-study of the SSD object detection algorithm, covering the theory, a walkthrough of a TensorFlow implementation, and hands-on training and evaluation. It focuses on the SSD network structure and my understanding from reading and running the source code, and shares the problems I ran into while training together with their solutions, as a reference path for anyone learning SSD.

Preface

As a soon-to-enroll master's student, I think it is worth recording the problems I run into while studying and working on projects, so that I can organize and review them later. Since my knowledge is limited, this blog may contain mistaken interpretations; corrections and discussion are welcome. This is my first post, and its formatting, content, and writing all have room to improve, which I will work on over time. Here I record the materials I consulted while learning SSD, along with some of my own understanding.

SSD Theory

There are plenty of references on the theory behind SSD; I mainly studied from this post: https://blog.csdn.net/xiaohu2022/article/details/79833786
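For reference, the SSD paper computes the default-box scale of the k-th of m feature maps as s_k = s_min + (s_max - s_min) / (m - 1) * (k - 1), with s_min = 0.2 and s_max = 0.9. The balancap code below instead ships a Caffe-style table of pixel sizes; I work out where those numbers come from right after the SSDNet class.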

Studying the Source Code (TensorFlow)

Next, I deepened my understanding by working through the balancap implementation (https://github.com/balancap/SSD-Tensorflow).

Reading the Source Code

I started reading from ssd_vgg_300.py in the nets folder:

class SSDNet(object):
    default_params = SSDParams(
        img_shape=(300, 300),            # input image size
        num_classes=21,                  # number of classes (20 classes + 1 background)
        no_annotation_label=21,          # label used for un-annotated objects
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],  # feature layers used for detection
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],          # feature map sizes
        anchor_size_bounds=[0.15, 0.90], # scale range of the anchor boxes
        anchor_sizes=[(21., 45.),        # anchor scales per feature map (the first value is s_k, the second is s_{k+1})
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        anchor_ratios=[[2, .5],          # anchor aspect ratios per feature map
                       [2, .5, 3, 1. / 3],
                       [2, .5, 3, 1. / 3],
                       [2, .5, 3, 1. / 3],
                       [2, .5],
                       [2, .5]],
        anchor_steps=[8, 16, 32, 64, 100, 300],  # ratio mapping each feature map back to the input image, i.e. the (approximate, not exact) pixel size of one feature map cell
        anchor_offset=0.5,               # offset of the anchor center within each cell
        normalizations=[20, -1, -1, -1, -1, -1],  # whether to normalize a layer: values > 0 enable it, otherwise skip. Only block4 is normalized: being an early layer, its activations have a relatively large norm, so an L2 normalization (per pixel, over the channel dimension only) keeps it comparable to the later detection layers
        prior_scaling=[0.1, 0.1, 0.2, 0.2])  # variances used when encoding/decoding boxes, corresponding to cx, cy, w, h

    def __init__(self, params=None):
        if isinstance(params, SSDParams):
            self.params = params
        else:
            self.params = SSDNet.default_params

    def net(self, inputs,                # network input
            is_training=True,
            update_feat_shapes=True,     # whether to update the feature map shapes from the actual network output
            dropout_keep_prob=0.5,       # dropout keep probability 0.5
            prediction_fn=slim.softmax,  # softmax as the prediction function
            reuse=None,
            scope='ssd_300_vgg'):
        r = ssd_net(inputs,              # input, e.g. an 'NHWC' tensor
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!)
        if update_feat_shapes:  # whether to update the feature map shapes
            # r[0] is the list of class predictions returned by ssd_net, one entry per
            # selected feature layer, each shaped [N (batch_size), feat_h, feat_w,
            # anchors_per_cell, num_classes]. ssd_feat_shapes_from_net derives the
            # actual feature map shapes from it, falling back to self.params.feat_shapes.
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
            # Replace the current feature map shapes with the updated ones: a list,
            # one entry per feature layer, of [feat_h, feat_w, anchors_per_cell].
            self.params = self.params._replace(feat_shapes=shapes)
        return r
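
The default anchor_sizes follow the Caffe SSD recipe rather than the paper's formula: with anchor_size_bounds=[0.15, 0.90], the scale step is (90 - 15) // (6 - 2) = 18 percent, giving scales 15, 33, 51, 69, 87 (and 87 + 18 = 105 for the s_{k+1} of the last layer), while the first layer gets a separately chosen small scale (0.07 here). Multiplying by the 300-pixel input size reproduces the table above. The following is my own reconstruction of that computation, not code from the repo:

img_size = 300
min_r, max_r = 15, 90              # anchor_size_bounds expressed in percent
step = (max_r - min_r) // (6 - 2)  # = 18
scales = list(range(min_r, max_r + 1, step))  # [15, 33, 51, 69, 87]

# The first layer uses a hand-picked small scale (0.07); every later layer
# spans [s_k, s_{k+1}] in pixels.
sizes = [(img_size * 0.07, img_size * scales[0] / 100.)]
for i in range(len(scales) - 1):
    sizes.append((img_size * scales[i] / 100., img_size * scales[i + 1] / 100.))
sizes.append((img_size * scales[-1] / 100., img_size * (scales[-1] + step) / 100.))
print(sizes)
# [(21.0, 45.0), (45.0, 99.0), (99.0, 153.0), (153.0, 207.0), (207.0, 261.0), (261.0, 315.0)]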

This method uses ssd_net and ssd_feat_shapes_from_net. Let's look at ssd_feat_shapes_from_net first:

def ssd_feat_shapes_from_net(predictions, default_shapes=None):
    """Try to obtain the feature shapes from the prediction layers. The latter
    can be either a Tensor or Numpy ndarray.
    Return:
      list of feature shapes. Default values if predictions shape not fully
      determined.
    """
    feat_shapes = []
    for l in predictions:              # l: one layer's predictions, [N, h, w, anchors_per_cell, num_classes]
        # Get the shape, from either a np array or a tensor.
        if isinstance(l, np.ndarray):  # if l is an ndarray, take its shape directly; otherwise read the static TF shape as a list
            shape = l.shape
        else:
            shape = l.get_shape().as_list()
        shape = shape[1:4]             # [h, w, anchors_per_cell]
        # Problem: undetermined shape...
        # If any dimension is still unknown, fall back to the default shapes;
        # otherwise append this layer's shape to the list.
        if None in shape:
            return default_shapes
        else:
            feat_shapes.append(shape)
    return feat_shapes                 # return the updated list of feature shapes
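
To make the behavior concrete, here is a minimal check of my own (not from the repo) that feeds ssd_feat_shapes_from_net dummy NumPy predictions shaped like the first two layers:

import numpy as np

preds = [np.zeros((1, 38, 38, 4, 21)),   # block4: [N, h, w, anchors_per_cell, num_classes]
         np.zeros((1, 19, 19, 6, 21))]   # block7
print(ssd_feat_shapes_from_net(preds))
# [(38, 38, 4), (19, 19, 6)]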

Next, let's look at ssd_net:

def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}  # stores the output of every block
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')  # VGG-16 block 1: two 3x3 convolutions, 64 output channels (filters)
        end_points['block1'] = net  # store the block 1 output
        net = slim.max_pool2d(net, [2, 2], scope='pool1')  # 2x2 max pooling
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks (the fully connected layers of VGG-16 are dropped).
        # Block 6: let's dilate the hell out of it!
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')  # dilated (atrous) convolution with dilation rate 6, enlarging the receptive field
        end_points['block6'] = net
        # Note: tf.layers.dropout interprets `rate` as the DROP probability, so
        # passing dropout_keep_prob here only matches the intent when it is 0.5.
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)  # dropout
        # Block 7: 1x1 conv. Because the fuck.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))  # class probabilities (softmax over p)
            logits.append(p)                      # raw class scores
            localisations.append(l)               # box regressions

        return predictions, localisations, logits, end_points
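
As a sanity check on the feat_shapes defaults: the three SAME-padded 2x2/stride-2 pools take 300 down to 150, 75, and 38; pool4 gives 19; pool5 (stride 1) plus the dilated conv6 and the 1x1 conv7 keep block7 at 19x19; blocks 8 and 9 halve the map with padded stride-2 VALID 3x3 convolutions (19 -> 10 -> 5); and blocks 10 and 11 shrink it with unpadded 3x3 convolutions (5 -> 3 -> 1). A small sketch of mine to verify this, assuming a TensorFlow 1.x environment with the repo on the Python path:

import tensorflow as tf
from nets import ssd_vgg_300

tf.reset_default_graph()
inputs = tf.placeholder(tf.float32, shape=(1, 300, 300, 3))
ssd = ssd_vgg_300.SSDNet()
predictions, localisations, logits, end_points = ssd.net(inputs, is_training=False)
for layer in ssd.params.feat_layers:
    print(layer, end_points[layer].get_shape().as_list()[1:3])
# block4 [38, 38], block7 [19, 19], block8 [10, 10],
# block9 [5, 5], block10 [3, 3], block11 [1, 1]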

ssd_net uses custom_layers.pad2d and ssd_multibox_layer. Let's look at the former first:

def pad2d(inputs,
          pad=(0, 0),
          mode='CONSTANT',
          data_format='NHWC',
          trainable=True,
          scope=None):
    """2D Padding layer, adding a symmetric padding to H and W dimensions.

    Aims to mimic padding in Caffe and MXNet, helping the port of models to
    TensorFlow. Tries to follow the naming convention of `tf.contrib.layers`.

    Args:
      inputs: 4D input Tensor;
      pad: 2-Tuple with padding values for H and W dimensions;
      mode: Padding mode. C.f. `tf.pad`
      data_format:  NHWC or NCHW data format.
    """
    with tf.name_scope(scope, 'pad2d', [inputs]):
        # Padding shape.
        if data_format == 'NHWC':
            paddings = [[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]  # manual padding on H and W, to stay consistent with the Caffe version
        elif data_format == 'NCHW':
            paddings = [[0, 0], [0, 0], [pad[0], pad[0]], [pad[1], pad[1]]]
        net = tf.pad(inputs, paddings, mode=mode)
        return net
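
For example, in block9 above, pad2d pads the 10x10 map to 12x12 so that the stride-2 VALID 3x3 convolution yields exactly 5x5. A minimal check (TensorFlow 1.x, my own example):

import tensorflow as tf

x = tf.zeros((1, 10, 10, 128))
y = pad2d(x, pad=(1, 1))
z = tf.layers.conv2d(y, 256, 3, strides=2, padding='valid')
print(y.get_shape().as_list(), z.get_shape().as_list())
# [1, 12, 12, 128] [1, 5, 5, 256]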

Next, ssd_multibox_layer:

def ssd_multibox_layer(inputs,  # end_points[layer]: the stored output of one feature layer
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.
    """
    net = inputs
    if normalization > 0:
        net = custom_layers.l2_normalization(net, scaling=True)
    # Number of anchors.
    num_anchors = len(sizes) + len(ratios)  # anchors per cell

    # Location.
    num_loc_pred = num_anchors * 4  # 4 coordinates per anchor: cx, cy, w, h
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')  # localization branch, output [N, feat_h, feat_w, num_loc_pred]
    loc_pred = custom_layers.channel_to_last(loc_pred)
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])  # [N, h, w, num_anchors, 4]
    # Class prediction.
    num_cls_pred = num_anchors * num_classes  # class scores per cell
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')  # classification branch, output [N, h, w, num_cls_pred]
    cls_pred = custom_layers.channel_to_last(cls_pred)
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes])  # [N, h, w, num_anchors, num_classes]
    return cls_pred, loc_pred
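
With the default parameters, num_anchors per cell is len(sizes) + len(ratios) = 4, 6, 6, 6, 4, 4 across the six feature layers, which gives the well-known 8732 default boxes of SSD300. A quick check:

feat_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
anchors_per_cell = [4, 6, 6, 6, 4, 4]
print(sum(h * w * n for (h, w), n in zip(feat_shapes, anchors_per_cell)))  # 8732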

The code above uses custom_layers.l2_normalization and custom_layers.channel_to_last:

def l2_normalization(       # L2 normalization over the channel dimension (a normalization, not a sparsity regularizer)
        inputs,             # input feature layer, [batch_size, h, w, c]
        scaling=False,      # whether to apply a learnable scale gamma after normalization
        scale_initializer=init_ops.ones_initializer(),  # gamma initialized to 1
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()   # static shape of the input feature layer
        inputs_rank = inputs_shape.ndims    # number of dimensions = 4
        dtype = inputs.dtype.base_dtype     # data type
        norm_dim = tf.range(inputs_rank-1, inputs_rank)  # normalize over dimension 4-1=3, i.e. the channel dimension
        params_shape = inputs_shape[-1:]    # number of channels

        # Normalize along the channel dimension.
        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)  # epsilon avoids division by zero
        # Additional scaling.
        if scaling:  # optionally rescale the normalized output with a learnable per-channel gamma
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=scale_initializer,
                                             collections=scale_collections,
                                             trainable=trainable)
            outputs = tf.multiply(outputs, scale)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
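
channel_to_last is much simpler. From my reading of custom_layers.py it is essentially the following sketch (see the repo for the exact implementation): a no-op for NHWC inputs, and a transpose of NCHW inputs to NHWC, so the channel dimension is always last before the reshapes in ssd_multibox_layer:

def channel_to_last(inputs, data_format='NHWC', scope=None):
    """Move the channel dimension to the last position (sketch)."""
    with tf.name_scope(scope, 'channel_to_last', [inputs]):
        if data_format == 'NHWC':
            net = inputs
        else:  # 'NCHW'
            net = tf.transpose(inputs, perm=(0, 2, 3, 1))
        return net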