SSD讲解以及代码解读（二）_ssd的锚框代码解读、-CSDN博客

本文链接：https://blog.csdn.net/m0_37663944/article/details/104122402

在上一章节中，我们得到了一个end_points，里面包含多个层，现在我们需要针对每一个层得到框。这里需要提出一个概念：锚（anchor），也就是需要对特征图上的每一个像素生成多个框。假如layer_1层的大小为36*36*3，每一个像素生成6个框，那么layer_1就有36*36*6个框，同时每个像素的6个框各不相同。那么如何得到这6个框呢？
首先我们必须对每一个layer得到相应的6个默认框。计算公式如下
$$s_k = s_{min} + \frac{s_{max} - s_{min}}{m - 1}(k - 1), \quad k \in [1, m]$$
上述公式中的常量 $S_{min}$ 在论文中为0.2，代码中为0.15； $S_{max}$ 在论文和代码中都为0.9。 $m$ 是用于预测的特征图个数， $k$ 代表层数。根据上述公式，我们得到每一层的默认框尺寸21, 45, 99, 153, 207, 261, 315（这些是长度=宽度的 $S_k$ ）。再根据默认框调整成不同的长宽比，一般每个像素有6个框。其中长宽比 $\alpha_r=\{1, 2, 3, 1/2, 1/3\}$ 对应5个框，计算公式如下：
$w_k^a =s_k\sqrt{\alpha_r}$ , $h_k^a =s_k/\sqrt{\alpha_r}$
此外还会设置一个尺度为 $s'_k=\sqrt{s_k s_{k+1}}$ 且 $\alpha_r=1$ 的先验框。
详细代码如下：

# Default SSD-300 configuration used by the anchor / encoding code.
default_params = {
    # Input image shape as (height, width, channels).
    "img_shape": (300, 300, 3),
    # 20 object classes + 1 background class.
    "num_classes": 21,
    "no_annotation_label": 21,
    # Feature layers that produce default boxes.
    "feat_layers": [
        'block4', 'block7', 'block8', 'block9', 'block10', 'block11',
    ],
    # Spatial size of each of the feature layers above.
    "feat_shapes": [
        (38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1),
    ],
    # S_min = 0.15, S_max = 0.9 (an alternative lower bound is 0.20).
    "anchor_size_bounds": [0.15, 0.90],
    # Per layer: (size for this layer, size for the next layer), in pixels.
    # These are the s_k values; they do not follow the paper formula exactly.
    "anchor_sizes": [
        (21.0, 45.0),
        (45.0, 99.0),
        (99.0, 153.0),
        (153.0, 207.0),
        (207.0, 261.0),
        (261.0, 315.0),
    ],
    # Aspect ratios per layer; the 1:1 ratio is implicit and not listed.
    "anchor_ratios": [
        [2, 0.5],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5],
        [2, 0.5],
    ],
    # Stride of one feature-map cell in input pixels; step * feature-map
    # side is roughly the 300-pixel input side.
    "anchor_steps": [8, 16, 32, 64, 100, 300],
    # Offset of the anchor centre inside a cell.
    "anchor_offset": 0.5,
    # Per-layer normalisation setting (presumably a positive value enables
    # normalisation with that scale and -1 disables it -- confirm against
    # the network definition).
    "normalizations": [20, -1, -1, -1, -1, -1],
    # Scaling applied to the encoded (cy, cx, h, w) offsets.
    "prior_scaling": [0.1, 0.1, 0.2, 0.2],
}
#生成框
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Builds the grid of anchor centres and the list of anchor heights and
    widths, all expressed relative to the image size.

    Arguments:
      img_shape: Image shape; ``img_shape[0]`` (height) and ``img_shape[1]``
        (width) are used to normalise coordinates and sizes.
      feat_shape: Feature map shape ``(rows, cols)``; one centre per cell.
      sizes: Absolute reference sizes. ``sizes[0]`` is the base size; when a
        second entry is present, an extra square anchor of size
        ``sqrt(sizes[0] * sizes[1])`` is added. Entries beyond the second
        are not used, although they still count towards the anchor total.
      ratios: Aspect ratios (width / height) applied to ``sizes[0]``; the
        1:1 ratio is handled separately and should not be listed.
      step: Stride of one feature-map cell measured in image pixels.
      offset: Position of the centre inside a cell, in cell units.
      dtype: NumPy dtype of the returned arrays.

    Return:
      y, x, h, w: ``y`` and ``x`` are centre grids of shape
      ``(rows, cols, 1)``; ``h`` and ``w`` are 1-D arrays of length
      ``len(sizes) + len(ratios)``.
    """
    # Centre positions use the SSD-Caffe convention: (index + offset) * step,
    # normalised by the image size, rather than dividing by the feature-map
    # size directly.
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]

    # Trailing axis of size 1 so the grids broadcast against h and w.
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # Anchor order follows the original SSD implementation:
    # square base box, optional intermediate square box, then the ratios.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # First anchor: aspect ratio 1 at the base size.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    next_slot = 1
    if len(sizes) > 1:
        # Second anchor: aspect ratio 1 at the geometric mean of this
        # layer's size and the next layer's size.
        mean_size = math.sqrt(sizes[0] * sizes[1])
        h[1] = mean_size / img_shape[0]
        w[1] = mean_size / img_shape[1]
        next_slot = 2
    # Remaining anchors: base size stretched by sqrt(ratio) in width and
    # shrunk by the same factor in height.
    for slot, ratio in enumerate(ratios, start=next_slot):
        root = math.sqrt(ratio)
        h[slot] = sizes[0] / img_shape[0] / root
        w[slot] = sizes[0] / img_shape[1] * root
    return y, x, h, w

#为每一个框打标签
def bboxes_encode(self, labels, bboxes, anchors, scope=None):
    """Encode ground-truth labels and boxes against the default anchors.

    Delegates to ``tf_ssd_bboxes_encode`` using the class count,
    no-annotation label and prior scaling stored in ``self.params``, with a
    fixed ignore threshold of 0.5.

    Arguments:
      labels: Labels of the ground-truth boxes.
      bboxes: Coordinates of the ground-truth boxes.
      anchors: Generated default boxes.
      scope: Optional name scope forwarded to the encoder.
    Return:
      Whatever ``tf_ssd_bboxes_encode`` returns.
    """
    params = self.params
    return tf_ssd_bboxes_encode(
        labels,
        bboxes,
        anchors,
        params.num_classes,
        params.no_annotation_label,
        ignore_threshold=0.5,
        prior_scaling=params.prior_scaling,
        scope=scope,
    )

def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode ground-truth labels and boxes against every layer's anchors.

    Runs ``tf_ssd_bboxes_encode_layer`` once per entry of ``anchors`` and
    collects the three results of each call into parallel lists.

    Arguments:
      labels: Ground-truth labels.
      bboxes: Ground-truth bounding boxes.
      anchors: Sequence with one anchor description per prediction layer.
      num_classes: Forwarded to the per-layer encoder.
      no_annotation_label: Forwarded to the per-layer encoder.
      ignore_threshold: Forwarded to the per-layer encoder.
      prior_scaling: Scaling of the encoded coordinates.
      dtype: Tensor dtype forwarded to the per-layer encoder.
      scope: Name scope wrapping the whole encoding.
    Return:
      (target_labels, target_localizations, target_scores): three lists
      with one element per layer, in the order of ``anchors``.
    """
    labels_per_layer = []   # label of the matched ground-truth box
    locs_per_layer = []     # encoded offset to the matched ground-truth box
    scores_per_layer = []   # overlap score with the matched ground-truth box
    with tf.name_scope(scope):
        for layer_idx, layer_anchors in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % layer_idx):
                encoded = tf_ssd_bboxes_encode_layer(
                    labels, bboxes, layer_anchors,
                    num_classes, no_annotation_label,
                    ignore_threshold,
                    prior_scaling, dtype)
            layer_labels, layer_locs, layer_scores = encoded
            labels_per_layer.append(layer_labels)
            locs_per_layer.append(layer_locs)
            scores_per_layer.append(layer_scores)
    return labels_per_layer, locs_per_layer, scores_per_layer


def tf_ssd_bboxes_encode_layer(labels,  # ground-truth box classes
                               bboxes,  # ground-truth box coordinates
                               anchors_layer,  # default boxes: centre coordinates plus heights and widths
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from
    one layer.

    Every anchor is assigned the groundtruth box with which it has the
    highest jaccard overlap, and that box is then expressed as an offset
    relative to the anchor.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, indexed
        below as (ymin, xmin, ymax, xmax);
      anchors_layer: Tuple (y, x, h, w) describing this layer's anchors;
      num_classes: Groundtruth boxes whose label is not below this value
        are never assigned;
      no_annotation_label: Only referenced by commented-out code here;
      ignore_threshold: Only referenced by commented-out code here;
      prior_scaling: Scaling of encoded coordinates;
      dtype: Dtype of the float tensors created here.
    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
    """
    # Anchors coordinates and volume.
    yref, xref, href, wref = anchors_layer
    ymin = yref - href / 2.  # convert centre/size to top-left and bottom-right corners
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)  # area of every default box

    # Initialize tensors...
    # One entry per (row, column, anchor) of this layer.
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)  # label of the matched ground-truth box (0 until matched)
    feat_scores = tf.zeros(shape, dtype=dtype)  # jaccard overlap with the matched ground-truth box

    # Coordinates of the matched ground-truth box; anchors that never match
    # keep the initial box (0, 0, 1, 1).
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def jaccard_with_anchors(bbox):  # overlap (IoU) helper
        """Compute jaccard score between a box and the anchors.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    # NOTE(review): not called by any active code in this function; its only
    # use is in the commented-out no-annotation handling inside body().
    def intersection_with_anchors(bbox):
        """Compute intersection score between a box and the anchors
        (intersection area divided by the anchor area).
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        inter_vol = h * w
        scores = tf.div(inter_vol, vol_anchors)
        return scores

    def condition(i, feat_labels, feat_scores,  # loop condition
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: check label index.
        """
        # tf.shape(labels) is a shape vector; for the 1-D labels described in
        # the docstring its single entry is the number of ground-truth boxes.
        # tf.less is strict, so the loop continues while i < that count.
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,  # loop body
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.

        For groundtruth box i, overwrite the label, score and coordinates of
        every anchor whose jaccard overlap with this box beats the score it
        currently holds. The "jaccard > matching threshold" test is
        commented out, so no minimum overlap is enforced here.
        """
        # Jaccard score.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)   # overlap of every anchor of this layer with this ground-truth box
        # Mask: check threshold + scores + no annotations + num_classes.
        mask = tf.greater(jaccard, feat_scores)  # does this box beat the previously matched one?
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        # NOTE(review): presumably protects anchors whose score was set to -1
        # by the commented-out no-annotation handling below; with that code
        # disabled, scores start at 0 and only grow, so this is always true.
        mask = tf.logical_and(mask, feat_scores > -0.5)
        # Skip boxes whose label is not a valid class index (e.g. a
        # no-annotation label equal to num_classes).
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)  # integer mask for the label update
        fmask = tf.cast(mask, dtype)  # float mask for the coordinate update
        # Update values using mask.
        feat_labels = imask * label + (1 - imask) * feat_labels  # take the new label where imask is 1
        feat_scores = tf.where(mask, jaccard, feat_scores)  # take the new score where mask is True

        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin  # take the new coordinates where fmask is 1.0
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # Check no annotation label: ignore these anchors...
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    # Main loop definition.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,  # condition gates each iteration, body performs one step
                                           [i, feat_labels, feat_scores,   # third argument: initial loop variables
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Transform to center / size.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode features.
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]  # centre offset relative to the anchor size, then scaled
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]  # log size ratio of matched box to anchor, then scaled
    feat_w = tf.log(feat_w / wref) / prior_scaling[3]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores

接下来看loss如何组成，代码如下：

def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Register the SSD classification and localisation losses.

    Three losses are added through ``tf.losses.add_loss``: cross-entropy on
    positive anchors, cross-entropy on hard-mined negative anchors, and a
    smooth-L1 localisation loss on positive anchors. Nothing is returned.

    Arguments:
      logits: Per-layer predicted class logits.
      localisations: Per-layer predicted box offsets.
      gclasses: Per-layer target classes.
      glocalisations: Per-layer target box offsets.
      gscores: Per-layer overlap scores between anchors and their targets.
      match_threshold: Anchors with a score above this are positives.
      negative_ratio: Number of negatives kept per positive.
      alpha: Weight of the localisation loss.
      label_smoothing: Not used by this function.
      device: Not used by this function.
      scope: Optional name scope; defaults to 'ssd_losses'.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        first_shape = logits[0].shape
        num_classes = first_shape[-1]
        batch_size = first_shape[0]

        # Flatten every layer so all anchors line up along axis 0.
        flat_logits = []
        flat_gclasses = []
        flat_gscores = []
        flat_locs = []
        flat_glocs = []
        for layer, layer_logits in enumerate(logits):
            flat_logits.append(tf.reshape(layer_logits, [-1, num_classes]))
            flat_gclasses.append(tf.reshape(gclasses[layer], [-1]))
            flat_gscores.append(tf.reshape(gscores[layer], [-1]))
            flat_locs.append(tf.reshape(localisations[layer], [-1, 4]))
            flat_glocs.append(tf.reshape(glocalisations[layer], [-1, 4]))
        # Join the layers into single tensors.
        logits = tf.concat(flat_logits, axis=0)          # predicted classes
        gclasses = tf.concat(flat_gclasses, axis=0)      # target classes
        gscores = tf.concat(flat_gscores, axis=0)        # target scores
        localisations = tf.concat(flat_locs, axis=0)     # predicted offsets
        glocalisations = tf.concat(flat_glocs, axis=0)   # target offsets
        dtype = logits.dtype

        # Positive anchors: overlap score above the matching threshold.
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining.
        # Labels for the negative loss: 0 for every non-positive anchor.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = Softmax()(logits)
        # Candidate negatives: not positive, and score above -0.5.
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Background probability (class 0) for candidates, 1.0 elsewhere.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select, capped by what is available.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        # The n_neg lowest background probabilities; the last one returned
        # becomes the cut-off for a "hard" negative.
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask: candidates strictly below the cut-off.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Cross-entropy on positives (fpmask is 1 for positives, else 0).
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=gclasses)
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size,
                          name='value')
            tf.losses.add_loss(loss)

        # Cross-entropy on selected negatives (fnmask is 1 for them, else 0).
        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size,
                          name='value')
            tf.losses.add_loss(loss)

        # Localisation loss, counted on positive anchors only.
        with tf.name_scope('localization'):
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            loss = abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size,
                          name='value')
            tf.losses.add_loss(loss)

def abs_smooth(diss):
    """Smooth L1 loss, applied element-wise.

    Defined as ``0.5 * x**2`` where ``|x| < 1`` and ``|x| - 0.5`` elsewhere.

    The computation is branch-free and uses only arithmetic operators and
    ``abs``, so it accepts Python numbers as well as array or tensor objects
    that overload those operators. A Python ``if`` on the comparison would
    raise for multi-element arrays and for graph tensors.

    Arguments:
      diss: Difference(s) between prediction and target.
    Return:
      The loss value(s), with the same shape as ``diss``.
    """
    magnitude = abs(diss)
    # min(|x|, 1) written with the identity min(a, b) = (a + b - |a - b|) / 2
    # so that no library-specific minimum function is needed.
    clipped = 0.5 * (magnitude + 1 - abs(magnitude - 1))
    # |x| < 1:  clipped == |x| -> 0.5 * x**2
    # |x| >= 1: clipped == 1   -> 0.5 + (|x| - 1) == |x| - 0.5
    return 0.5 * clipped * clipped + (magnitude - clipped)