# Imports assumed from the SSD-TensorFlow project layout:
# tf_extended provides get_shape; nets.custom_layers provides abs_smooth.
import tensorflow as tf
import tensorflow.contrib.slim as slim

import tf_extended as tfe
from nets import custom_layers


def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
        # And concat everything!
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining...
        # Binary labels: 0 for background, 1 for object.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        # Negative candidates: anchors below the matching threshold whose
        # score is still valid (gscores > -0.5 excludes ignored anchors).
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Predicted "background" probability at background anchors:
        # tf.where keeps predictions[:, 0] where nmask is True and fills the
        # rest with 1., i.e. it yields the background probability of every
        # candidate negative anchor.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        # Number of negatives: 3x the positives, plus batch_size.
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)
        # Select the n_neg background anchors most easily mistaken for
        # objects, i.e. those with the smallest background probability
        # (top_k on the negated values returns the k smallest entries).
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: positive mask + random negative.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            # Smooth L1 loss is more robust to outliers than L2: its gradient
            # is capped for large residuals, so training is less likely to
            # diverge on bad matches.
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)
The SSD loss consists of two main parts: object classification and bounding-box regression. The regression part is the simpler of the two: it is just the smooth L1 loss between the predicted offsets and the ground-truth offsets, weighted by alpha and applied only at positive anchors. A small numeric check of the piecewise form follows.
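A minimal NumPy sketch of smooth L1, matching the usual definition with the quadratic/linear switch at |x| = 1 (which is what abs_smooth computes in the code above):

import numpy as np

def smooth_l1(x):
    # 0.5 * x^2 for |x| < 1, |x| - 0.5 otherwise: quadratic near zero,
    # linear for outliers, so the gradient magnitude never exceeds 1.
    absx = np.abs(x)
    return np.where(absx < 1.0, 0.5 * absx ** 2, absx - 0.5)

print(smooth_l1(np.array([-3.0, -0.5, 0.0, 0.5, 3.0])))
# -> [2.5   0.125 0.    0.125 2.5  ]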
The classification loss is slightly more involved and itself splits in two: a positive-sample term (anchors matched to a labelled object) and a negative-sample term (background anchors). To keep the samples balanced, negatives are limited to roughly a 3:1 ratio against positives. The implementation computes the cross-entropy loss at every anchor box and then multiplies it by the corresponding mask. For positives, the labels are gclasses and the mask is fpmask, where pmask is 1 only for boxes whose matching score exceeds the threshold; for negatives, the labels are no_classes, whose values coincide with fpmask: 0 for background, 1 for foreground. A toy example follows.
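A toy NumPy example (hypothetical scores, five anchors, match_threshold = 0.5) showing how the masks and negative labels relate:

import numpy as np

gscores = np.array([0.9, 0.3, 0.7, -1.0, 0.1])  # -1.0: an ignored anchor
pmask = gscores > 0.5                 # positives: matched above the threshold
fpmask = pmask.astype(np.float32)     # float weights for the positive loss
no_classes = pmask.astype(np.int32)   # labels for the negative term: 0 = background, 1 = object

print(pmask)       # [ True False  True False False]
print(no_classes)  # [1 0 1 0 0]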
For the negatives, the code first computes, for every background box in the image, the predicted probability that it is background; it then keeps the boxes with the smallest such probability (the hardest negatives, capped at roughly 3x the number of positives plus the batch size); finally it builds the negative-sample mask, fnmask, from this selection.
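The top_k trick is the part worth a second look: taking top_k of the negated background probabilities returns the k smallest ones, i.e. the hardest negatives. A NumPy sketch with hypothetical values:

import numpy as np

nvalues = np.array([0.99, 0.10, 0.85, 0.30, 0.95, 0.05, 0.60, 0.90])
n_neg = 3

# k smallest background probabilities = hardest negatives
# (equivalent to tf.nn.top_k(-nvalues, k=n_neg)).
idxes = np.argsort(nvalues)[:n_neg]
print(nvalues[idxes])                 # [0.05 0.1  0.3 ]
max_hard_pred = nvalues[idxes[-1]]
# The strict '<' below matches the TF code and drops the boundary entry
# itself, so the final mask keeps n_neg - 1 anchors when values are distinct.
final_mask = nvalues < max_hard_pred
print(final_mask.sum())               # 2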