在上一章节中,我们得到了一个end_points,里面包含多个层,现在我们需要针对每一个层得到框。这里需要提出一个概念:锚(anchor),也就是需要对特征图上的每一个像素生成多个框。假如layer_1层的大小为36*36*3,每一个像素生成6个框,那么layer_1就有36*36*6个框。同时每个像素的6个框是不一样的。那么如何得到这6个框呢?
首先我们必须对每一个layer得到相应的6个默认框。计算公式如下:
$$S_k = S_{min} + \frac{S_{max} - S_{min}}{m - 1}(k - 1), \quad k \in [1, m]$$
上述公式中的常量为:$S_{min}$,论文中为0.2,代码中为0.15;$S_{max}$,论文中和代码中都为0.9;$m$是用于预测的特征图个数;$k$代表层数。根据上述公式,我们得到每一层的默认框21, 45, 99, 153, 207, 261, 315(这些框的长度=宽度=$S_k$)。根据默认框调整成不同的长宽比,一般有6个。
$\alpha_r=\{1, 2, 3, 1/2, 1/3\}$,这里有5个,计算公式如下:
$w_k^a = s_k\sqrt{\alpha_r}$,
$h_k^a = s_k/\sqrt{\alpha_r}$
此外还会设置一个尺度为$s'_k=\sqrt{s_k s_{k+1}}$且$\alpha_r=1$的先验框。
详细代码如下:
# Default SSD configuration used by the anchor-generation and encoding code.
default_params = {
    # Input image shape.
    "img_shape": (300, 300, 3),
    # Number of predicted classes: 20 object classes + 1 background.
    "num_classes": 21,
    "no_annotation_label": 21,
    # Feature layers used to generate default boxes.
    "feat_layers": [
        "block4",
        "block7",
        "block8",
        "block9",
        "block10",
        "block11",
    ],
    # Feature-map size of each layer listed in "feat_layers".
    "feat_shapes": [
        (38, 38),
        (19, 19),
        (10, 10),
        (5, 5),
        (3, 3),
        (1, 1),
    ],
    # Smin = 0.15, Smax = 0.9.  An alternative of [0.20, 0.90] was left
    # commented out next to this entry.
    "anchor_size_bounds": [0.15, 0.90],
    # Per layer: (default-box size of this layer, size of the next layer),
    # i.e. the S_k values.  They do not follow the paper's formula exactly.
    "anchor_sizes": [
        (21.0, 45.0),
        (45.0, 99.0),
        (99.0, 153.0),
        (153.0, 207.0),
        (207.0, 261.0),
        (261.0, 315.0),
    ],
    # Aspect ratios of the default boxes per layer; the 1:1 ratio is not
    # listed here.
    "anchor_ratios": [
        [2, 0.5],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5, 3, 1.0 / 3],
        [2, 0.5],
        [2, 0.5],
    ],
    # Stride in the input image of one feature-map step; step times the
    # feature-map size is roughly 300.
    "anchor_steps": [8, 16, 32, 64, 100, 300],
    # Offset added to the grid index when computing anchor centres.
    "anchor_offset": 0.5,
    # Per-layer normalisation setting (original note: whether the feature
    # layer is normalised).
    "normalizations": [20, -1, -1, -1, -1, -1],
    # Scaling applied to the encoded differences between default boxes and
    # ground-truth boxes.
    "prior_scaling": [0.1, 0.1, 0.2, 0.2],
}
#生成框
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Determine the relative position grid of the centers, and the relative
    width and height.

    Arguments:
      img_shape: Image shape, used for computing height, width relatively to
        the former;
      feat_shape: Feature shape, used for computing relative position grids;
      sizes: Absolute reference sizes; sizes[0] is this layer's size and
        sizes[1] (when present) is only used for the extra square anchor;
      ratios: Aspect ratios to use on these features (the 1:1 ratio is added
        separately and should not be listed);
      step: Stride in the image of one feature-map cell;
      offset: Grid offset;
      dtype: NumPy dtype of the grids and of the height/width arrays.
    Return:
      y, x, h, w: Relative x and y grids, and height and width.  y and x have
        shape feat_shape + (1,); h and w have len(sizes) + len(ratios)
        entries.
    """
    # Anchor centres relative to the image.  This follows the SSD-Caffe
    # computation based on the step value rather than dividing the grid
    # index by the feature-map size.
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]

    # Trailing axis so the grids broadcast against the per-anchor h / w.
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # Relative heights and widths, in the order of the original SSD
    # implementation.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # First anchor: aspect ratio 1 at this layer's size.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    di = 1
    if len(sizes) > 1:
        # Extra square anchor of size sqrt(s_k * s_{k+1}).
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
        di += 1
    # Remaining anchors: one per aspect ratio, all at this layer's size.
    for i, r in enumerate(ratios):
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    return y, x, h, w
#为每一个框打标签
def bboxes_encode(self, labels, bboxes, anchors,
                  scope=None):
    """Encode ground-truth labels and bounding boxes against the anchors.

    Delegates to tf_ssd_bboxes_encode, taking the class count, the
    no-annotation label and the prior scaling from self.params and using a
    fixed ignore threshold of 0.5.

    Arguments:
      labels: labels of the ground-truth boxes;
      bboxes: coordinates of the ground-truth boxes;
      anchors: the generated default boxes;
      scope: optional name scope forwarded to the encoder.
    Return:
      Whatever tf_ssd_bboxes_encode returns.
    """
    params = self.params
    encoded = tf_ssd_bboxes_encode(
        labels, bboxes, anchors,
        params.num_classes,
        params.no_annotation_label,
        ignore_threshold=0.5,
        prior_scaling=params.prior_scaling,
        scope=scope)
    return encoded
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.

    Runs tf_ssd_bboxes_encode_layer once per feature layer and collects the
    per-layer results.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List with one entry of layer anchors per prediction layer;
      num_classes: forwarded to the per-layer encoder;
      no_annotation_label: forwarded to the per-layer encoder;
      ignore_threshold: forwarded to the per-layer encoder;
      prior_scaling: Scaling of encoded coordinates;
      dtype: forwarded to the per-layer encoder;
      scope: name scope wrapping all per-layer encodings.
    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per layer.
    """
    with tf.name_scope(scope):
        per_layer = []
        for layer_idx, layer_anchors in enumerate(anchors):
            # Each layer is encoded inside its own name scope.
            with tf.name_scope('bboxes_encode_block_%i' % layer_idx):
                layer_labels, layer_loc, layer_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, layer_anchors,
                                               num_classes,
                                               no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                per_layer.append((layer_labels, layer_loc, layer_scores))
        # Labels of the matched ground-truth boxes.
        target_labels = [entry[0] for entry in per_layer]
        # Encoded offsets between anchors and their matched boxes.
        target_localizations = [entry[1] for entry in per_layer]
        # Overlap scores between anchors and their matched boxes.
        target_scores = [entry[2] for entry in per_layer]
        return target_labels, target_localizations, target_scores
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from
    one layer.

    Every groundtruth box is compared in turn with all anchors of the layer;
    an anchor keeps the box with the highest Jaccard score seen so far.  The
    kept boxes are then expressed as offsets relative to the anchors.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, indexed
        here as (ymin, xmin, ymax, xmax);
      anchors_layer: (y, x, h, w) anchor centres and sizes of the layer;
      num_classes: a box is only assigned when its label is below this value;
      no_annotation_label: not used by the active code path;
      ignore_threshold: not used by the active code path;
      prior_scaling: Scaling of encoded coordinates (cy, cx, h, w);
      dtype: dtype of the score and coordinate tensors.
    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
        Localizations are stacked on the last axis in x / y / w / h order.
    """
    # Anchor centres and sizes -> corner coordinates and areas.
    yref, xref, href, wref = anchors_layer
    half_h = href / 2.
    half_w = wref / 2.
    ymin = yref - half_h
    xmin = xref - half_w
    ymax = yref + half_h
    xmax = xref + half_w
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Running state of the matching loop, one entry per anchor.  The box
    # starts as (0, 0, 1, 1) so that unmatched anchors still have a positive
    # height and width when the logarithm is taken below.
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def _intersection_area(bbox):
        """Area of the overlap between one box and every anchor."""
        top = tf.maximum(ymin, bbox[0])
        left = tf.maximum(xmin, bbox[1])
        bottom = tf.minimum(ymax, bbox[2])
        right = tf.minimum(xmax, bbox[3])
        overlap_h = tf.maximum(bottom - top, 0.)
        overlap_w = tf.maximum(right - left, 0.)
        return overlap_h * overlap_w

    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors.
        """
        inter_vol = _intersection_area(bbox)
        bbox_vol = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        union_vol = vol_anchors - inter_vol + bbox_vol
        return tf.div(inter_vol, union_vol)

    def intersection_with_anchors(bbox):
        """Compute the intersection of a box with the anchors, as a fraction
        of each anchor's area.

        Only referenced by the disabled no-annotation handling in `body`.
        """
        return tf.div(_intersection_area(bbox), vol_anchors)

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: keep looping while i is below the number of labels.
        """
        return tf.less(i, tf.shape(labels))[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes with box i.

        An anchor takes over box i only when the box beats the score the
        anchor already holds; no minimum overlap is enforced here.
        """
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)

        # Mask: better score + score not marked negative + valid label.
        beats_current = tf.greater(jaccard, feat_scores)
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        not_ignored = feat_scores > -0.5
        valid_label = label < num_classes
        mask = tf.logical_and(tf.logical_and(beats_current, not_ignored),
                              valid_label)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        keep_imask = 1 - imask

        # Take the new value where the mask is set, keep the old elsewhere.
        feat_labels = imask * label + keep_imask * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # Disabled: ignore anchors overlapping a "no annotation" box.
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]

    # Main loop: one iteration per groundtruth box.
    loop_vars = [0, feat_labels, feat_scores,
                 feat_ymin, feat_xmin,
                 feat_ymax, feat_xmax]
    [_, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body, loop_vars)

    # Matched boxes: corners -> center / size.
    box_cy = (feat_ymax + feat_ymin) / 2.
    box_cx = (feat_xmax + feat_xmin) / 2.
    box_h = feat_ymax - feat_ymin
    box_w = feat_xmax - feat_xmin

    # Encode relative to the anchors: centre offsets in units of the anchor
    # size, log size ratios, each divided by its prior scaling.
    enc_cy = (box_cy - yref) / href / prior_scaling[0]
    enc_cx = (box_cx - xref) / wref / prior_scaling[1]
    enc_h = tf.log(box_h / href) / prior_scaling[2]
    enc_w = tf.log(box_w / wref) / prior_scaling[3]

    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([enc_cx, enc_cy, enc_w, enc_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores
接下来看loss是如何组成的,代码如下:
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Build the SSD losses and register them with tf.losses.

    Three losses are added through tf.losses.add_loss: cross-entropy on the
    positive anchors, cross-entropy on the selected hard negatives, and a
    smooth-L1 localisation loss on the positive anchors.  Each is divided by
    the batch size read from the first logits tensor.  Nothing is returned.

    Arguments:
      logits: per-layer predicted class logits;
      localisations: per-layer predicted box offsets;
      gclasses: per-layer groundtruth classes;
      glocalisations: per-layer groundtruth box offsets;
      gscores: per-layer overlap scores with the matched groundtruth box;
      match_threshold: an anchor is positive when its score exceeds this;
      negative_ratio: negatives kept per positive (plus batch_size extra);
      alpha: weight of the localisation loss;
      label_smoothing: not used in this function;
      device: not used in this function;
      scope: optional name scope (defaults to 'ssd_losses').
    """
    with tf.name_scope(scope, 'ssd_losses'):
        first_shape = logits[0].shape
        num_classes = first_shape[-1]
        batch_size = first_shape[0]

        # Flatten every layer so that all anchors share one leading axis.
        flogits, fgclasses, fgscores = [], [], []
        flocalisations, fglocalisations = [], []
        for layer, layer_logits in enumerate(logits):
            flogits.append(tf.reshape(layer_logits, [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[layer], [-1]))
            fgscores.append(tf.reshape(gscores[layer], [-1]))
            flocalisations.append(tf.reshape(localisations[layer], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[layer], [-1, 4]))

        # Concatenate the layers.
        logits = tf.concat(flogits, axis=0)                  # predicted classes
        gclasses = tf.concat(fgclasses, axis=0)              # true classes
        gscores = tf.concat(fgscores, axis=0)                # true scores
        localisations = tf.concat(flocalisations, axis=0)    # predicted offsets
        glocalisations = tf.concat(fglocalisations, axis=0)  # true offsets
        dtype = logits.dtype

        # Positive anchors: score above the matching threshold.
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = Softmax()(logits)
        # Candidates: not positive, and score above -0.5.
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Background probability (class 0) for candidates, 1.0 elsewhere.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])

        # Number of negative entries to select, capped by the candidates.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        # The n_neg lowest background probabilities; the last one is the
        # cut-off for the final negative mask.
        val, _ = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Cross-entropy on the positives, against the true classes.
        with tf.name_scope('cross_entropy_pos'):
            pos_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=gclasses)
            pos_loss = tf.div(tf.reduce_sum(pos_loss * fpmask), batch_size,
                              name='value')
            tf.losses.add_loss(pos_loss)

        # Cross-entropy on the selected negatives, against no_classes.
        with tf.name_scope('cross_entropy_neg'):
            neg_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=no_classes)
            neg_loss = tf.div(tf.reduce_sum(neg_loss * fnmask), batch_size,
                              name='value')
            tf.losses.add_loss(neg_loss)

        # Localization loss: smooth L1 weighted by the positive mask.
        with tf.name_scope('localization'):
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            loc_loss = abs_smooth(localisations - glocalisations)
            loc_loss = tf.div(tf.reduce_sum(loc_loss * weights), batch_size,
                              name='value')
            tf.losses.add_loss(loc_loss)
def abs_smooth(diss):
    """Smooth L1 loss: 0.5 * x**2 where |x| < 1, and |x| - 0.5 elsewhere.

    A Python `if` cannot branch per element, so the previous body failed on
    multi-element arrays and on graph tensors.  Anything that is not a plain
    Python number is therefore evaluated with a branch-free expression built
    only from `abs()` and arithmetic operators, which array and tensor types
    overload element-wise.

    Arguments:
      diss: a Python number, or an array/tensor-like value supporting abs()
        and arithmetic with Python floats.
    Return:
      The smooth L1 value; same kind of object as the input, element-wise
      for arrays/tensors.  Infinite elements produce NaN in the
      element-wise path.
    """
    if isinstance(diss, (int, float)):
        # Plain numbers keep the original exact branch.
        magnitude = abs(diss)
        if magnitude < 1:
            return 0.5 * (diss ** 2)
        return magnitude - 0.5

    absx = abs(diss)
    shifted = absx - 1
    # max(|x| - 1, 0): exactly zero wherever |x| < 1.
    excess = 0.5 * (shifted + abs(shifted))
    # min(|x|, 1): equals |x| exactly wherever |x| < 1.
    clipped = absx - excess
    # 0.5 * min(|x|, 1)**2 + max(|x| - 1, 0) reproduces both branches.
    return 0.5 * (clipped * clipped) + excess
tensorflow官方文档
https://blog.csdn.net/qq_37541097/article/details/80917536