SSD代码解读之二——default box生成及标签整理

最新推荐文章于 2021-08-09 18:53:28 发布

Peanut_X

最新推荐文章于 2021-08-09 18:53:28 发布

阅读量1.8k

点赞数 2

分类专栏：机器学习

本文链接：https://blog.csdn.net/xiezongsheng1990/article/details/88983931

版权

机器学习专栏收录该内容

37 篇文章 1 订阅

订阅专栏

文章目录

- Default box生成（anchors）
- 标签整理（bboxes_encode）

Default box生成（anchors）

Default box 是指在特定 feature map 的每个位置上生成的一组数量固定、大小固定的候选框。所有层的 default box 存入一个 list 中，每个 feature map 对应 list 中的一个元素。

def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Build the default boxes (anchors) for every feature layer.

    The i-th entry of `layers_shape` is paired with the i-th entry of
    `anchor_sizes`, `anchor_ratios` and `anchor_steps`, and the group is
    handed to `ssd_anchor_one_layer` together with `img_shape`, `offset`
    and `dtype`.

    Returns:
        A list with one element per entry of `layers_shape`, each being
        whatever `ssd_anchor_one_layer` returns for that layer.
    """
    return [
        ssd_anchor_one_layer(img_shape, feat_shape,
                             anchor_sizes[idx],
                             anchor_ratios[idx],
                             anchor_steps[idx],
                             offset=offset, dtype=dtype)
        for idx, feat_shape in enumerate(layers_shape)
    ]

以第一个feature map 为例，代码解析如下。

返回值为元组（y, x, h, w）：
y 和 x 为所有中心点的相对坐标（已按图像尺寸归一化），shape为[38, 38, 1]，范围[0, 1]
h 和 w 为各个 default box 的相对高和宽（同样按图像尺寸归一化），shape为[4,]，范围[0, 1]

def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute the default boxes (anchors) of a single feature layer.

    Args:
        img_shape: (height, width) of the input image, e.g. (300, 300).
        feat_shape: (rows, cols) of the feature map, e.g. (38, 38).
        sizes: anchor sizes in pixels, e.g. (21, 45).
        ratios: aspect ratios applied to `sizes[0]`, e.g. [2, 0.5].
        step: pixel stride between neighbouring feature-map cells, e.g. 8.
        offset: shift, in cells, added to each grid index; 0.5 puts the
            anchor centre in the middle of its cell.
        dtype: NumPy dtype of the returned arrays.

    Returns:
        Tuple (y, x, h, w). `y` and `x` have shape
        (feat_shape[0], feat_shape[1], 1) and hold the anchor centres divided
        by the image height / width. `h` and `w` have shape
        (len(sizes) + len(ratios),) and hold the anchor heights / widths
        divided by the image height / width.
    """
    img_h, img_w = img_shape[0], img_shape[1]

    # Integer cell indices: `rows` is constant along each row, `cols` along
    # each column.
    rows, cols = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # Cell centre in pixels, then normalised by the image size; the trailing
    # axis lets the centres broadcast against the per-anchor h / w vectors.
    y = np.expand_dims((rows.astype(dtype) + offset) * step / img_h, axis=-1)
    x = np.expand_dims((cols.astype(dtype) + offset) * step / img_w, axis=-1)

    base = sizes[0]
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # Slot 0: square anchor of the base size.
    h[0] = base / img_h
    w[0] = base / img_w

    # Slot 1 (only when a second size exists): square anchor whose side is
    # the geometric mean of the first two sizes.
    first_ratio_slot = 1
    if len(sizes) > 1:
        geo_mean = math.sqrt(base * sizes[1])
        h[1] = geo_mean / img_h
        w[1] = geo_mean / img_w
        first_ratio_slot = 2

    # Remaining slots: the base size stretched by each aspect ratio.
    for slot, ratio in enumerate(ratios, start=first_ratio_slot):
        root = math.sqrt(ratio)
        h[slot] = base / img_h / root
        w[slot] = base / img_w * root
    return y, x, h, w

标签整理（bboxes_encode）

原始的label和bbox不能直接用来计算loss，需要先进行预处理，得到target_labels、target_localizations和target_scores三个list，每个list中每个feature map对应一个元素。

def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode ground-truth labels and boxes against every anchor layer.

    Each element of `anchors` (one per feature map) is encoded by
    `tf_ssd_bboxes_encode_layer` inside its own name scope. All other
    arguments are forwarded unchanged; `prior_scaling` is only passed
    through and never mutated here.

    Returns:
        Tuple (target_labels, target_localizations, target_scores) of three
        lists, each holding one entry per element of `anchors`.
    """
    target_labels, target_localizations, target_scores = [], [], []
    with tf.name_scope(scope):
        # One group of anchors per feature map.
        for layer_idx, layer_anchors in enumerate(anchors):
            block_scope = 'bboxes_encode_block_%i' % layer_idx
            with tf.name_scope(block_scope):
                encoded = tf_ssd_bboxes_encode_layer(
                    labels, bboxes, layer_anchors,
                    num_classes, no_annotation_label,
                    ignore_threshold,
                    prior_scaling, dtype)
                layer_labels, layer_loc, layer_scores = encoded
                target_labels.append(layer_labels)
                target_localizations.append(layer_loc)
                target_scores.append(layer_scores)
    return target_labels, target_localizations, target_scores

具体每个元素的实现如下，解析以第一个feature map为例。

返回值（feat_labels, feat_localizations, feat_scores）
- feat_labels：每个default box对应的label，shape：(38, 38, 4)
- feat_localizations：每个default box对应的偏移量，shape：(38, 38, 4, 4)，前3维对应每个default box，最后一维是[feat_cx, feat_cy, feat_w, feat_h]
- feat_scores：每个default box对应的IOU，shape：(38, 38, 4)

def tf_ssd_bboxes_encode_layer(labels,  # ground-truth class labels
                               bboxes,  # ground-truth box coordinates
                               anchors_layer,  # default boxes (anchors) of one feature map
                               num_classes,  # number of classes
                               no_annotation_label,  # presumably equal to num_classes -- TODO confirm against caller
                               ignore_threshold=0.5,  # threshold; only referenced by the disabled ignore logic below
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],  # divisors applied to the encoded (cy, cx, h, w) offsets; a smaller divisor yields a larger target
                               dtype=tf.float32):
    """Encode ground-truth labels and boxes against the anchors of one layer.

    The ground-truth boxes are visited one at a time inside a
    `tf.while_loop`. For each box, every anchor whose Jaccard overlap (IoU)
    with it is strictly greater than the best overlap recorded so far is
    re-assigned to that box. The matched boxes are then expressed as
    centre / size offsets relative to their anchors.

    Args:
        labels: ground-truth class labels, indexed as `labels[i]`; the loop
            runs over its first dimension.
        bboxes: ground-truth boxes, indexed as `bboxes[i]`; each box is read
            as (ymin, xmin, ymax, xmax).
        anchors_layer: tuple (yref, xref, href, wref) of anchor centres and
            anchor heights / widths for this layer.
        num_classes: a box is only matched when its label is < num_classes.
        no_annotation_label: not used by the active code (only by the
            commented-out ignore logic).
        ignore_threshold: not used by the active code (same reason).
        prior_scaling: four divisors applied, in order, to the encoded
            cy, cx, h and w offsets.
        dtype: TensorFlow dtype of the floating-point tensors created here.

    Returns:
        Tuple (feat_labels, feat_localizations, feat_scores):
        feat_labels is the int64 label per anchor (0 where nothing matched),
        feat_localizations stacks [cx, cy, w, h] offsets on a new last axis,
        and feat_scores is the best IoU recorded per anchor.
    """
    # Anchor coordinates.
    # yref, xref: anchor centres; per the surrounding article, shape
    #   (38, 38, 1) and range [0, 1] for the first feature map.
    # href, wref: anchor heights and widths; per the article, shape (4,)
    #   and range [0, 1] for the first feature map.
    yref, xref, href, wref = anchors_layer
    # Top-left and bottom-right corners of every anchor.
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    # Anchor areas.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # One entry per anchor: (rows, cols, anchors per cell), e.g. (38, 38, 4)
    # for the first feature map described in the article.
    shape = (yref.shape[0], yref.shape[1], href.size)
    # Per-anchor state, all of shape `shape`.
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)
    # Ground-truth box currently matched to each anchor (the one with the
    # highest IoU so far). Anchors that never match keep the placeholder box
    # (0, 0, 1, 1) set here.
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    # IoU computation.
    def jaccard_with_anchors(bbox):
        """Return the Jaccard score (IoU) between `bbox` and every anchor.

        `bbox` is read as (ymin, xmin, ymax, xmax).
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        # Clamp at zero so non-overlapping pairs get an empty intersection.
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    # Fraction of each anchor's area covered by the box. Only referenced by
    # the commented-out ignore logic inside `body`.
    def intersection_with_anchors(bbox):
        """Return intersection area divided by anchor area, per anchor.

        `bbox` is read as (ymin, xmin, ymax, xmax).
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        inter_vol = h * w
        scores = tf.div(inter_vol, vol_anchors)
        return scores

    # Loop condition: iterate over all ground-truth entries.
    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Continue while `i` is below the first dimension of `labels`.
        """
        r = tf.less(i, tf.shape(labels))
        return r[0]

    # Loop body: for ground-truth box i, claim every anchor whose IoU with it
    # beats the anchor's current best, updating labels, scores and the
    # matched box coordinates.
    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Match ground-truth entry `i` against all anchors.

        Returns the loop variables with `i` advanced by one.
        """

        label = labels[i]
        bbox = bboxes[i]
        # IoU of this box with every anchor.
        jaccard = jaccard_with_anchors(bbox)
        # True where this box overlaps the anchor more than any earlier box.
        mask = tf.greater(jaccard, feat_scores)
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        # The commented-out ignore logic at the end of this function would
        # set scores to -1; this check would then keep such anchors from
        # being matched again. With that logic disabled, feat_scores starts
        # at 0 and is only ever raised, so the check is always true here.
        mask = tf.logical_and(mask, feat_scores > -0.5)
        # Never match entries whose label is outside the valid class range.
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        # Arithmetic select: take `label` where the mask is set and keep the
        # previous value elsewhere.
        # NOTE(review): a tf.where form may need `label` broadcast to `shape`
        # first, depending on the TensorFlow version -- confirm.
        feat_labels = imask * label + (1 - imask) * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)
        # Same arithmetic select for the matched ground-truth coordinates.
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # Check no annotation label: ignore these anchors...
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    # Main loop definition.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Convert the matched boxes from corners to centre / height / width.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode as offsets relative to the anchors: centre shifts are measured
    # in anchor sizes, sizes as log-ratios; each is divided by its
    # prior_scaling entry. Per the surrounding article, these offsets are
    # what the loss is computed on.
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]
    feat_w = tf.log(feat_w / wref) / prior_scaling[3]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores

Peanut_X

关注

2
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
SSD代码解读之二——default box生成及标签整理

文章目录Default box生成（anchors）标签整理（bboxes_encode）Default box生成（anchors）Default box是对特定的feature map的每个点，生成特定数量、特定大小的box。生成的所有层的default box存入一个list中，每个feature map一个元素。def ssd_anchors_all_layers(img_shape...
复制链接

扫一扫

专栏目录