Default box生成(anchors)
Default box是指在特定feature map的每个点上生成的特定数量、特定大小的box。所有层生成的default box存入一个list中,每个feature map对应一个元素。
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Build the default boxes (anchors) of every feature layer.

    Args:
        img_shape: Shape of the input image, forwarded unchanged.
        layers_shape: One feature-map shape per layer.
        anchor_sizes: Per-layer anchor sizes, indexed like ``layers_shape``.
        anchor_ratios: Per-layer aspect ratios, indexed like ``layers_shape``.
        anchor_steps: Per-layer step, indexed like ``layers_shape``.
        offset: Forwarded to ``ssd_anchor_one_layer``.
        dtype: Forwarded to ``ssd_anchor_one_layer``.

    Returns:
        A list with one entry per feature layer, each being whatever
        ``ssd_anchor_one_layer`` returns for that layer.
    """
    return [
        ssd_anchor_one_layer(img_shape,
                             feat_shape,
                             anchor_sizes[layer_idx],
                             anchor_ratios[layer_idx],
                             anchor_steps[layer_idx],
                             offset=offset,
                             dtype=dtype)
        for layer_idx, feat_shape in enumerate(layers_shape)
    ]
以第一个feature map 为例,代码解析如下。
- 返回值为元组(y, x, h, w )
- y 和 x 为所有的中心点的相对坐标,shape为[38, 38, 1],范围[0, 1]
- h 和 w 为不同尺寸的高和宽,shape为[4,],范围[0, 1]
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute the default boxes (anchors) of a single feature layer.

    Args:
        img_shape: Image (height, width), e.g. (300, 300).
        feat_shape: Feature-map (rows, cols), e.g. (38, 38).
        sizes: Anchor sizes in pixels, e.g. (21, 45). Only the first two
            entries are read.
        ratios: Aspect ratios applied to ``sizes[0]``, e.g. [2, 0.5].
        step: Distance in image pixels between neighbouring feature-map
            cells, e.g. 8 for a 38x38 map on a 300x300 image.
        offset: Shift, in cells, added to every grid index so that the anchor
            centre sits in the middle of the cell.
        dtype: NumPy dtype of the returned arrays.

    Returns:
        Tuple ``(y, x, h, w)``. ``y`` and ``x`` have shape (rows, cols, 1) and
        hold the anchor centres divided by the image height / width. ``h``
        and ``w`` have shape (len(sizes) + len(ratios),) and hold the anchor
        heights / widths divided by the image height / width.
    """
    img_h, img_w = img_shape[0], img_shape[1]

    # Integer cell indices: row_idx[r, c] == r and col_idx[r, c] == c.
    row_idx, col_idx = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]

    # Cell centres converted to image pixels, then divided by the image size.
    # A trailing axis is added so the centres broadcast against h and w.
    y = ((row_idx.astype(dtype) + offset) * step / img_h)[..., np.newaxis]
    x = ((col_idx.astype(dtype) + offset) * step / img_w)[..., np.newaxis]

    num_anchors = len(sizes) + len(ratios)
    base = sizes[0]

    # First anchor: a square of side sizes[0].
    heights = [base / img_h]
    widths = [base / img_w]

    # Second anchor (when a second size exists): a square whose side is the
    # geometric mean of the two sizes.
    if len(sizes) > 1:
        geo_mean = math.sqrt(base * sizes[1])
        heights.append(geo_mean / img_h)
        widths.append(geo_mean / img_w)

    # Remaining anchors: sizes[0] stretched by each aspect ratio.
    for ratio in ratios:
        heights.append(base / img_h / math.sqrt(ratio))
        widths.append(base / img_w * math.sqrt(ratio))

    # Slots that are not filled (only possible with more than two sizes)
    # stay at zero.
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)
    h[:len(heights)] = heights
    w[:len(widths)] = widths
    return y, x, h, w
标签整理(bboxes_encode)
原始的label和bbox不能直接用来计算loss,需要进行预处理,得到target_labels, target_localizations,和target_scores三个list,每个feature map对应一个元素。
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode ground-truth labels and boxes against the anchors of all layers.

    Each element of ``anchors`` is passed to ``tf_ssd_bboxes_encode_layer``
    inside its own name scope, and the three results of every call are
    collected into three parallel lists.

    Args:
        labels: Ground-truth labels, forwarded unchanged.
        bboxes: Ground-truth boxes, forwarded unchanged.
        anchors: Iterable with one anchor description per feature layer.
        num_classes: Forwarded unchanged.
        no_annotation_label: Forwarded unchanged.
        ignore_threshold: Forwarded unchanged.
        prior_scaling: Forwarded unchanged (never modified here).
        dtype: Forwarded unchanged.
        scope: Name of the enclosing ``tf.name_scope``.

    Returns:
        Tuple ``(target_labels, target_localizations, target_scores)``; each
        is a list with one element per feature layer.
    """
    with tf.name_scope(scope):
        encoded_layers = []
        for layer_idx, anchors_layer in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % layer_idx):
                layer_labels, layer_loc, layer_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes,
                                               no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
            encoded_layers.append((layer_labels, layer_loc, layer_scores))

        target_labels = [enc[0] for enc in encoded_layers]
        target_localizations = [enc[1] for enc in encoded_layers]
        target_scores = [enc[2] for enc in encoded_layers]
        return target_labels, target_localizations, target_scores
具体每个元素的实现如下,解析以第一个feature map为例。
- 返回值(feat_labels, feat_localizations, feat_scores)
- feat_labels:每个default box对应的label,shape:(38, 38, 4)
- feat_localizations:每个default box相对于其匹配的真实框的编码偏移量,shape:(38, 38, 4, 4),前3维对应每个default box,最后一维是[feat_cx, feat_cy, feat_w, feat_h]
- feat_scores:每个default box对应的IOU,shape:(38, 38, 4)
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode ground-truth labels and boxes against one layer of anchors.

    A ``tf.while_loop`` walks over the ground-truth boxes. For every anchor
    it keeps the label, Jaccard score and coordinates of the ground-truth box
    with the largest Jaccard overlap seen so far. The kept boxes are then
    expressed as scaled offsets relative to the anchors.

    Args:
        labels: Ground-truth class labels, one per box (read as
            ``labels[i]``).
        bboxes: Ground-truth boxes, one per label; each box is read as
            ``[ymin, xmin, ymax, xmax]``.
        anchors_layer: Tuple ``(yref, xref, href, wref)`` for one feature
            layer: anchor centres and anchor heights / widths. For the first
            layer of the surrounding example the centres have shape
            (38, 38, 1) and the sizes have shape (4,).
        num_classes: Boxes whose label is not strictly below this value are
            never matched to an anchor.
        no_annotation_label: Not used by the active code.
        ignore_threshold: Not used by the active code.
        prior_scaling: Four divisors applied to the encoded
            ``cy, cx, h, w`` offsets, in that order.
        dtype: TensorFlow dtype of the floating-point outputs.

    Returns:
        Tuple ``(feat_labels, feat_localizations, feat_scores)``.
        ``feat_labels`` and ``feat_scores`` have one entry per anchor
        (e.g. (38, 38, 4)); ``feat_localizations`` has an extra last axis
        ordered ``[cx, cy, w, h]`` (e.g. (38, 38, 4, 4)).
    """
    yref, xref, href, wref = anchors_layer

    # Anchor corners and areas.
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # One slot per anchor: (rows, cols, anchors per cell).
    shape = (yref.shape[0], yref.shape[1], href.size)

    # Per-anchor state. Unmatched anchors keep label 0, score 0 and the
    # whole-image box [0, 0, 1, 1].
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def _overlap_area(bbox):
        """Area of the intersection between `bbox` and every anchor."""
        top = tf.maximum(ymin, bbox[0])
        left = tf.maximum(xmin, bbox[1])
        bottom = tf.minimum(ymax, bbox[2])
        right = tf.minimum(xmax, bbox[3])
        overlap_h = tf.maximum(bottom - top, 0.)
        overlap_w = tf.maximum(right - left, 0.)
        return overlap_h * overlap_w

    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors."""
        inter_vol = _overlap_area(bbox)
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        return tf.div(inter_vol, union_vol)

    def intersection_with_anchors(bbox):
        """Compute the fraction of each anchor covered by a box.

        Only needed by the disabled no-annotation handling; not called by
        the active code.
        """
        return tf.div(_overlap_area(bbox), vol_anchors)

    def condition(idx, cur_labels, cur_scores,
                  cur_ymin, cur_xmin, cur_ymax, cur_xmax):
        """Condition: keep looping while `idx` is a valid label index."""
        in_range = tf.less(idx, tf.shape(labels))
        return in_range[0]

    def body(idx, cur_labels, cur_scores,
             cur_ymin, cur_xmin, cur_ymax, cur_xmax):
        """Fold ground-truth box `idx` into the per-anchor state."""
        label = labels[idx]
        bbox = bboxes[idx]
        jaccard = jaccard_with_anchors(bbox)

        # An anchor switches to this box when the overlap beats its best
        # score so far and the label is a real class.
        # NOTE: the `> -0.5` guard pairs with a disabled step that would set
        # the score of ignored anchors to -1. Without that step, scores start
        # at 0 and are only replaced by larger Jaccard values.
        mask = tf.greater(jaccard, cur_scores)
        mask = tf.logical_and(mask, cur_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)

        def _blend(new_value, old_value):
            """Take `new_value` where the mask is set, else `old_value`."""
            return fmask * new_value + (1 - fmask) * old_value

        cur_labels = imask * label + (1 - imask) * cur_labels
        cur_scores = tf.where(mask, jaccard, cur_scores)
        cur_ymin = _blend(bbox[0], cur_ymin)
        cur_xmin = _blend(bbox[1], cur_xmin)
        cur_ymax = _blend(bbox[2], cur_ymax)
        cur_xmax = _blend(bbox[3], cur_xmax)

        return [idx + 1, cur_labels, cur_scores,
                cur_ymin, cur_xmin, cur_ymax, cur_xmax]

    # Main loop over all ground-truth boxes.
    loop_vars = [0, feat_labels, feat_scores,
                 feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    (_, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax) = tf.while_loop(condition, body, loop_vars)

    # Matched boxes in centre / size form.
    center_y = (feat_ymax + feat_ymin) / 2.
    center_x = (feat_xmax + feat_xmin) / 2.
    box_h = feat_ymax - feat_ymin
    box_w = feat_xmax - feat_xmin

    # Offsets relative to the anchors, divided by the prior scaling.
    enc_cy = (center_y - yref) / href / prior_scaling[0]
    enc_cx = (center_x - xref) / wref / prior_scaling[1]
    enc_h = tf.log(box_h / href) / prior_scaling[2]
    enc_w = tf.log(box_w / wref) / prior_scaling[3]

    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([enc_cx, enc_cy, enc_w, enc_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores