SSD-Tensorflow-master源码解读

1、计算default box(ssd_vgg_300.py)

(1)计算每个feature map的default box

def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute the SSD default (anchor) boxes of a single feature layer.

    Every cell of the feature map is mapped to a centre point in the image,
    and the same set of box heights/widths is attached to every centre.
    All returned values are normalised by the image size.

    Arguments:
      img_shape: (height, width) of the input image;
      feat_shape: (rows, cols) of the feature map;
      sizes: absolute anchor sizes in pixels; sizes[0] is the base size and,
        when present, sizes[1] is combined with it for one extra square box;
      ratios: aspect ratios (width / height) applied to sizes[0];
      step: stride, in image pixels, between two neighbouring cells;
      offset: position of the centre inside a cell (0.5 = cell middle);
      dtype: NumPy dtype of the returned arrays.

    Return:
      (y, x, h, w) where
        y, x: arrays of shape (rows, cols, 1) with the centre coordinates,
          y constant along a row and x constant along a column;
        h, w: arrays of shape (len(sizes) + len(ratios),) with the box
          heights and widths shared by every cell.

    The numbers quoted in the comments below are for the first feature map
    of the 300x300 network: feat_shape=(38, 38), step=8, sizes=(21, 45),
    ratios=(2, 0.5).
    """
    # Integer cell indices:
    #   y = [[0, 0, ..., 0], [1, 1, ..., 1], ..., [37, 37, ..., 37]]
    #   x = [[0, 1, ..., 37], [0, 1, ..., 37], ..., [0, 1, ..., 37]]
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]

    # Cell centres in pixels (4, 12, 20, ..., 300), divided by the image
    # size so that they become relative coordinates.
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]

    # Expand dims to support easy broadcasting against h and w:
    # (rows, cols) -> (rows, cols, 1).
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # One box per entry of `sizes` (square boxes) plus one per aspect ratio.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # First square box: side sizes[0], i.e. h[0] = w[0] = 21 / 300.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    di = 1
    if len(sizes) > 1:
        # Second square box: geometric mean of the two sizes,
        # i.e. h[1] = w[1] = sqrt(21 * 45) / 300.
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
        di += 1

    # Remaining boxes keep the area of the first box and change the aspect
    # ratio r = w / h. With ratios=(2, 0.5):
    #   h[2] = 21 / sqrt(2) / 300,    w[2] = 21 * sqrt(2) / 300
    #   h[3] = 21 / sqrt(0.5) / 300,  w[3] = 21 * sqrt(0.5) / 300
    for i, r in enumerate(ratios):
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    return y, x, h, w

(2)计算所有feature map的default box

"""***************计算所有feature map层的default anchor box*****************"""
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Build the default anchor boxes of every feature layer.

    The i-th entries of `anchor_sizes`, `anchor_ratios` and `anchor_steps`
    are paired with the i-th feature-map shape in `layers_shape` and handed
    to `ssd_anchor_one_layer`.

    Return:
      A list with one `ssd_anchor_one_layer` result per feature layer, in
      the same order as `layers_shape`.
    """
    return [
        ssd_anchor_one_layer(img_shape, feat_shape,
                             anchor_sizes[idx],
                             anchor_ratios[idx],
                             anchor_steps[idx],
                             offset=offset, dtype=dtype)
        for idx, feat_shape in enumerate(layers_shape)
    ]
#返回 layers_anchors=[layer1_anchors, layer2_anchors, ... , layerk_anchors]
#其中layer1_anchors=[y,x,h,w]

2、根据上面计算的所有的default box,为其对应输出标记label(正例和反例)和location(正例)(ssd_common.py)

(1)根据Ground Truth和Default Box计算IOU重合度

(2)

计算某一个feature map对应的所有的default box的相应label,location和score(标记为正例的default box的最大IOU值)

def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from one layer.

    For every anchor of the layer, keep the groundtruth box with the highest
    jaccard (IOU) score seen so far, then express that box relative to the
    anchor.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, indexed
        below as (ymin, xmin, ymax, xmax);
      anchors_layer: (yref, xref, href, wref) anchors of this layer, i.e.
        centre coordinates and the per-position heights / widths;
      num_classes: a groundtruth box is only assigned when its label is
        strictly smaller than this value;
      no_annotation_label: not used by the active code (only referenced in
        the commented-out "no annotation" section of `body`);
      ignore_threshold: not used by the active code (same commented-out
        section);
      prior_scaling: Scaling of encoded coordinates, applied in the order
        (cy, cx, h, w);
      dtype: dtype of the score and coordinate tensors.

    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
    """
    # Anchors coordinates and volume.

    # Corner coordinates (ymin, xmin) / (ymax, xmax) and area of every
    # default box of this feature map.
    # (xref, yref) are the box centres, href / wref the box heights / widths.
    # assumes yref and xref are n x n x 1 (n x n = feature map size) and
    # href, wref have m entries (m boxes per position) — TODO confirm
    yref, xref, href, wref = anchors_layer

    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Initialize tensors...
    # Per-anchor label, score and matched-box coordinates for this layer.
    shape = (yref.shape[0], yref.shape[1], href.size)    # (n, n, m), e.g. (38, 38, 4)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)

    # Unmatched anchors keep the box (0, 0, 1, 1).
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    # IOU computation.
    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors."""
        # (int_ymin, int_xmin) and (int_ymax, int_xmax) are the corners of
        # the intersection between each default box and the groundtruth box.
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])

        # Height and width of the intersection, clamped at 0 when the boxes
        # do not overlap.
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)

        # Volumes.
        # Intersection area and union area (anchor + box - intersection).
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        # intersection / union
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    def intersection_with_anchors(bbox):
        """Compute intersection score between a box and the anchors.

        The score is the intersection area divided by the anchor area. Only
        referenced by the commented-out section at the end of `body`.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        inter_vol = h * w
        scores = tf.div(inter_vol, vol_anchors)
        return scores

    # Loop while i < number of labels, i.e. until every groundtruth box of
    # the image has been visited.
    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: check label index."""
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.

        An anchor takes the i-th groundtruth box only when that box beats
        the best jaccard score stored for the anchor so far. The jaccard
        threshold check is commented out below, so no minimum score is
        enforced here.
        """
        # Jaccard score.
        # Label and box of the i-th groundtruth object, and its IOU with
        # every default box of this feature map.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Mask: check threshold + scores + no annotations + num_classes.
        # Summary: mask[k] is True when the current groundtruth box has a
        # higher IOU with anchor k than any previous one (scores start at
        # 0); such anchors take the label, location and score of the
        # current box, all others keep their previous values. After all
        # groundtruth boxes are visited, each anchor holds its best match.
        # The string statement below is the original author's note, kept
        # verbatim; it mentions a matching threshold that the active code
        # does not apply.
        """下面程序主要实现:对于feature map中的所有Default Box,若其与某个图像的
        当前Ground Truth的IOU大于前一个(第一个全为0),且大于阈值matching_threshold,则
        对应的mask(mask[k])为True,从而实现本次返回的label、location和score为当前GT的
        label、location和score(IOU),否则返回对应的mask(mask[k])为False,进而返回前一个
        GT的label、location和score(IOU),当遍历完一幅图的所有GT后,便会返回非极大值IOU
        抑制后的得到的label、location和score"""
        
        # True where the IOU with the current box exceeds the stored score.
        mask = tf.greater(jaccard, feat_scores)
        
        # Threshold check, disabled:
        #mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        # Skip anchors whose stored score is <= -0.5.
        # NOTE(review): presumably anchors flagged with score -1 by the
        # disabled "no annotation" code below — confirm.
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        
        # Bool -> int / float (True -> 1, False -> 0) for the blends below.
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        
        # Update values using mask.
        # Masked anchors take the current label, others keep the previous
        # one (initially 0).
        feat_labels = imask * label + (1 - imask) * feat_labels
        
        # Masked anchors take the current IOU as score, others keep theirs.
        feat_scores = tf.where(mask, jaccard, feat_scores)

        # Masked anchors take the current box coordinates, others keep theirs.
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # Check no annotation label: ignore these anchors...
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    # Main loop definition.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])

    # Transform to center / size.
    # Centre, height and width of the groundtruth box matched to each
    # default box (the regression targets before encoding).
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin

    # Encode features.
    # Offsets are expressed relative to the anchor (centre shift divided by
    # anchor size, log of the size ratio) and divided by prior_scaling.
    # NOTE(review): the original author suggests prior_scaling balances the
    # centre and size regression terms in the loss (see the localization
    # loss in the SSD paper) — confirm.
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]
    feat_w = tf.log(feat_w / wref) / prior_scaling[3]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)    # n x n x m x 4
    
    return feat_labels, feat_localizations, feat_scores
    # feat_labels: (n, n, m), n = feature map size, m = default boxes per position
    # feat_localizations: (n, n, m, 4), last axis ordered (cx, cy, w, h)
  

计算所有feature map对应的所有的default box的相应label,location和score:

#对所有feature map的所有Default Box进行label、location和score的标注
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    Encoding boxes for all feature layers.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List with one (y, x, h, w) anchors entry per feature layer;
      num_classes: forwarded unchanged to the per-layer encoder;
      no_annotation_label: forwarded unchanged to the per-layer encoder;
      ignore_threshold: forwarded unchanged to the per-layer encoder;
      prior_scaling: Scaling of encoded coordinates (never modified here,
        only forwarded);
      dtype: dtype forwarded to the per-layer encoder;
      scope: name scope wrapping all the created ops.

    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per feature layer.
    """
    # anchors: e.g. 6 x [n x n x 1, n x n x 1, 4, 4] for a network with six
    # feature layers, n x n being the size of each feature map.
    with tf.name_scope(scope):
        target_labels = []
        target_localizations = []
        target_scores = []
        for i, anchors_layer in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % i):
                t_labels, t_loc, t_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes, no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                target_labels.append(t_labels)
                target_localizations.append(t_loc)
                target_scores.append(t_scores)
        # Returned once every layer has been encoded; must line up with the
        # `for` statement so that it stays inside the outer name scope.
        return target_labels, target_localizations, target_scores
 

3、ssd_anchors_all_layers和tf_ssd_bboxes_encode都被SSDNet类(ssd_vgg_300.py)调用

class SSDNet(object):
    """Implementation of the SSD VGG-based 300 network.

    The default features layers with 300x300 image input are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.

    The class holds one parameter set (`self.params`) and exposes thin
    wrappers that forward those parameters to the module-level helpers
    (network definition, anchors, box encoding/decoding, detection
    post-processing and losses).
    """
    # Parameter set used when the constructor does not receive an SSDParams.
    default_params = SSDParams(
        img_shape=(300, 300),
        num_classes=21,
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
        # One (rows, cols) entry per feature layer, same order as feat_layers.
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],
        # One size tuple per feature layer, in pixels.
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # anchor_sizes=[(30., 60.),
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],
        # One list of aspect ratios per feature layer.
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
        # Stride in image pixels between neighbouring cells of each layer.
        anchor_steps=[8, 16, 32, 64, 100, 300],
        anchor_offset=0.5,
        # NOTE(review): presumably a per-layer normalization scale, with -1
        # meaning "no normalization" — confirm against ssd_net.
        normalizations=[20, -1, -1, -1, -1, -1],
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        )

    def __init__(self, params=None):
        """Init the SSD net with some parameters. Use the default ones
        if none provided.

        Anything that is not an SSDParams instance (including None) is
        ignored in favour of `SSDNet.default_params`.
        """
        if isinstance(params, SSDParams):
            self.params = params
        else:
            self.params = SSDNet.default_params

    # ======================================================================= #
    def net(self, inputs,
            is_training=True,
            update_feat_shapes=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
        """SSD network definition.

        Forwards to `ssd_net` with the stored parameters and returns its
        result unchanged. When `update_feat_shapes` is true, the stored
        `feat_shapes` are replaced by the shapes derived from the first
        element of that result.
        """
        r = ssd_net(inputs,
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!)
        if update_feat_shapes:
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
            # _replace builds a new parameter object; the instance attribute
            # is rebound to it.
            self.params = self.params._replace(feat_shapes=shapes)
        return r

    def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
        """Network arg_scope.
        """
        return ssd_arg_scope(weight_decay, data_format=data_format)

    def arg_scope_caffe(self, caffe_scope):
        """Caffe arg_scope used for weights importing.
        """
        return ssd_arg_scope_caffe(caffe_scope)

    # ======================================================================= #
    def update_feature_shapes(self, predictions):
        """Update feature shapes from predictions collection (Tensor or Numpy
        array).
        """
        shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
        self.params = self.params._replace(feat_shapes=shapes)

    def anchors(self, img_shape, dtype=np.float32):
        """Compute the default anchor boxes, given an image shape.

        Uses the stored feature shapes, anchor sizes, ratios, steps and
        offset; returns the result of `ssd_anchors_all_layers`.
        """
        return ssd_anchors_all_layers(img_shape,
                                      self.params.feat_shapes,
                                      self.params.anchor_sizes,
                                      self.params.anchor_ratios,
                                      self.params.anchor_steps,
                                      self.params.anchor_offset,
                                      dtype)

    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode labels and bounding boxes.

        Forwards to `ssd_common.tf_ssd_bboxes_encode` with the stored
        `num_classes`, `no_annotation_label` and `prior_scaling`; the
        ignore threshold is fixed at 0.5 here.
        """
        return ssd_common.tf_ssd_bboxes_encode(
            labels, bboxes, anchors,
            self.params.num_classes,
            self.params.no_annotation_label,
            ignore_threshold=0.5,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def bboxes_decode(self, feat_localizations, anchors,
                      scope='ssd_bboxes_decode'):
        """Decode bounding boxes from the encoded localizations.

        Forwards to `ssd_common.tf_ssd_bboxes_decode` with the stored
        `prior_scaling`.
        """
        return ssd_common.tf_ssd_bboxes_decode(
            feat_localizations, anchors,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def detected_bboxes(self, predictions, localisations,
                        select_threshold=None, nms_threshold=0.5,
                        clipping_bbox=None, top_k=400, keep_top_k=200):
        """Get the detected bounding boxes from the SSD network output.

        Pipeline: select boxes, sort them keeping `top_k`, apply NMS keeping
        `keep_top_k`, then clip to `clipping_bbox` when one is given.
        Returns (rscores, rbboxes).
        """
        # Select bboxes from predictions.
        rscores, rbboxes = \
            ssd_common.tf_ssd_bboxes_select(predictions, localisations,
                                            select_threshold=select_threshold,
                                            num_classes=self.params.num_classes)
        # Sort and keep the top_k entries.
        rscores, rbboxes = \
            tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
        # Apply NMS algorithm.
        rscores, rbboxes = \
            tfe.bboxes_nms_batch(rscores, rbboxes,
                                 nms_threshold=nms_threshold,
                                 keep_top_k=keep_top_k)
        # Clipping is optional and happens last.
        if clipping_bbox is not None:
            rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
        return rscores, rbboxes

    def losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               scope='ssd_losses'):
        """Define the SSD network losses.

        Forwards every argument to `ssd_losses` and returns whatever that
        function returns.
        """
        return ssd_losses(logits, localisations,
                          gclasses, glocalisations, gscores,
                          match_threshold=match_threshold,
                          negative_ratio=negative_ratio,
                          alpha=alpha,
                          label_smoothing=label_smoothing,
                          scope=scope)

4、调用anchors生成default box(train_ssd_network.py):

ssd_anchors = ssd_net.anchors(ssd_shape)				# default anchor boxes of all feature layers

并由此default box作为参数调用bboxes_encode生成所有feature map对应的所有label、location和score:

 # Build the ops that encode the class, location and score of every default
 # box from the groundtruth boxes, their labels and the anchors.
 # gbboxes is N x 4, N being the number of groundtruth boxes in one image.
 gclasses, glocalisations, gscores = \
     ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)	# class, location and score of all default boxes (the labelled training targets)
 batch_shape = [1] + [len(ssd_anchors)] * 3                     # batch_shape = [1, n, n, n], n = len(ssd_anchors)

5、根据所有feature map对应的所有label、location和score计算损失(定义在ssd_vgg_300.py)

# =========================================================================== #
# SSD loss function.
# =========================================================================== #
# Example shapes for a batch of 5 images:
# logits.shape=[(5,38,38,4,21),(5,19,19,6,21),(5,10,10,6,21),(5,5,5,6,21),(5,3,3,4,21),(5,1,1,4,21)]
# localisations.shape=[(5,38,38,4,4),(5,19,19,6,4),(5,10,10,6,4),(5,5,5,6,4),(5,3,3,4,4),(5,1,1,4,4)], same for glocalisations
# gclasses.shape=[(5,38,38,4),.................], same for gscores
def ssd_losses(logits, localisations,             # predicted classes, predicted locations
               gclasses, glocalisations, gscores, # groundtruth classes, locations and scores
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Add the SSD loss terms to the TF losses collection.

    Three terms are registered with `tf.losses.add_loss`: cross-entropy on
    positive anchors, cross-entropy on the selected hard negatives, and the
    localization loss on positive anchors. Each one is divided by the batch
    size. The function has no return statement.

    Arguments:
      logits, localisations: per-layer lists of predicted class logits and
        box encodings;
      gclasses, glocalisations, gscores: per-layer lists of target classes,
        box encodings and matching scores;
      match_threshold: anchors with a score strictly above it are positives;
      negative_ratio: number of negatives kept per positive;
      alpha: weight of the localization term;
      label_smoothing: not used in the body;
      device: not used in the body;
      scope: optional name scope (defaults to 'ssd_losses').
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]	# 5 in the example shapes above

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))

        # flogits = [shape (5*38*38*4, 21), shape (5*19*19*6, 21), ...], one entry per feature map (6 in the example)
        # fgclasses = [shape (5*38*38*4), shape (5*19*19*6), ...], the others are analogous

        # And concat the crap!
        # logits.shape = (5*38*38*4 + 5*19*19*6 + ... + 5*1*1*4, 21)
        logits = tf.concat(flogits, axis=0) # concatenate all per-layer tensors along the first axis (same below)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gscores > match_threshold   # positives: score above the threshold (0.5 by default)
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask) # number of positives

        # Hard negative mining...
        # Labels for the negative term: 1 on positives, 0 elsewhere.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        nmask = tf.logical_and(tf.logical_not(pmask), # candidates: not positive and score > -0.5
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype) 
        
        # For negative candidates: predicted softmax probability of class 0.
        # For every other anchor: 1 - fnmask, which is 1.0 there.
        # nvalues has one entry per anchor of the batch.
        nvalues = tf.where(nmask,                     
                           predictions[:, 0],
                           1. - fnmask)
                           
        # nvalues_flat = [p1, p2, ..., pN]
        nvalues_flat = tf.reshape(nvalues, [-1])
        
        # Number of negative entries to select.
        # negative_ratio * n_positives + batch_size, capped by the number of
        # negative candidates.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        # top_k of -nvalues_flat selects the n_neg smallest nvalues.
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        # Largest value among those n_neg smallest ones.
        # NOTE(review): relies on top_k returning values in descending
        # order, so that val[-1] is the smallest of -nvalues — confirm.
        max_hard_pred = -val[-1]
        # Final negative mask.
        # Keep the candidates whose class-0 probability is strictly below
        # max_hard_pred.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        with tf.name_scope('cross_entropy_pos'):
            # Per anchor: -log(softmax(logits)[gclasses]).
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            # Keep positives only, sum, divide by batch_size (5 in the example).
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)		# register this term in the losses collection

        with tf.name_scope('cross_entropy_neg'):
            # no_classes is 0 on every selected negative, so this is the
            # cross-entropy against class 0.
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)		# register this term in the losses collection

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: positive mask + random negative.
            weights = tf.expand_dims(alpha * fpmask, axis=-1) # localization weight alpha, positives only
            # NOTE(review): abs_smooth is presumably the smooth-L1 function — confirm in custom_layers.
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            # Register this term; per the original author the total is
            # later obtained with tf.losses.get_total_loss().
            tf.losses.add_loss(loss)

损失在train_ssd_network.py中的实际使用:

# Build the SSD loss terms for the batched targets; thresholds and weights
# come from the command-line flags.
ssd_net.losses(logits, localisations,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)

6、最后便是计算总损失,并开始训练

 

 

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值