YOLOv7 loss
BCE
# This function implements label smoothing, a trick used in classification/detection tasks to reduce overfitting.
# It is usually applied to the classification loss, as in the __init__ of the ComputeLoss class below.
def smooth_BCE(eps=0.1):  # https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
    # return positive, negative label smoothing BCE targets
    """Used in the ComputeLoss class.
    Label smoothing: [1, 0] => [0.95, 0.05]
    https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
    :param eps: smoothing parameter
    :return: positive, negative label smoothing BCE targets, i.e. the label values for positive and negative samples
             originally positive=1 and negative=0; smoothed to positive=1.0 - 0.5 * eps and negative=0.5 * eps
    """
    return 1.0 - 0.5 * eps, 0.5 * eps
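A minimal sketch of how the smoothed targets are consumed (mirroring how cp/cn are used in ComputeLoss below; the class count and class indices here are made up):

import torch
import torch.nn as nn

cp, cn = smooth_BCE(eps=0.1)                         # cp=0.95, cn=0.05
t = torch.full((4, 3), cn)                           # 4 samples, 3 classes; every entry starts at the negative label
t[torch.arange(4), torch.tensor([0, 2, 1, 0])] = cp  # the ground-truth class of each sample gets the positive label
loss = nn.BCEWithLogitsLoss()(torch.randn(4, 3), t)
print(t[0], loss)                                    # t[0] = tensor([0.9500, 0.0500, 0.0500])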
# This class is an alternative to the standard BCE loss.
class BCEBlurWithLogitsLoss(nn.Module):
    # BCEwithLogitLoss() with reduced missing label effects.
    def __init__(self, alpha=0.05):
        super(BCEBlurWithLogitsLoss, self).__init__()
        self.loss_fcn = nn.BCEWithLogitsLoss(reduction='none')  # must be nn.BCEWithLogitsLoss()
        self.alpha = alpha

    def forward(self, pred, true):
        loss = self.loss_fcn(pred, true)
        pred = torch.sigmoid(pred)  # prob from logits
        dx = pred - true  # reduce only missing label effects
        # dx ∈ [-1, 1]. When pred=1 and true=0 (the network predicts an object where the GT says there is none),
        # dx=1 => alpha_factor=0 => loss=0
        # That is the false-positive / missing-label case, which should not be punished too heavily -> loss≈0
        # Using the absolute value instead would also dampen large pred/GT gaps in the other direction:
        # dx = (pred - true).abs()  # reduce missing label and false label effects
        alpha_factor = 1 - torch.exp((dx - 1) / (self.alpha + 1e-4))
        loss *= alpha_factor
        return loss.mean()
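A quick numeric check of the down-weighting (a sketch; the logits are fabricated to show the effect):

import torch

criterion = BCEBlurWithLogitsLoss(alpha=0.05)
logits = torch.tensor([[6.0], [-6.0]])  # a confident "object" prediction and a confident "no object" prediction
labels = torch.zeros(2, 1)              # the GT says "no object" in both cells
# for the confident false positive dx ≈ 1, so alpha_factor ≈ 0.05 and its BCE term (~6.0) is almost zeroed out;
# the correct negative keeps alpha_factor ≈ 1
print(criterion(logits, labels))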
Focal loss
class FocalLoss(nn.Module):
    # Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)
    def __init__(self, loss_fcn, gamma=1.5, alpha=0.25):
        super(FocalLoss, self).__init__()
        self.loss_fcn = loss_fcn  # must be nn.BCEWithLogitsLoss() = Sigmoid + BCELoss, the base binary cross-entropy criterion
        self.gamma = gamma  # gamma: down-weights the loss contribution of easy samples
        self.alpha = alpha  # alpha: balances the unequal numbers of positive and negative samples
        # self.reduction: controls how the focal loss output is reduced: sum/mean/none, default 'mean'
        self.reduction = loss_fcn.reduction
        # the wrapped BCE uses reduction='none': FL must be applied per element, the reduction happens afterwards
        self.loss_fcn.reduction = 'none'  # required to apply FL to each element

    def forward(self, pred, true):
        loss = self.loss_fcn(pred, true)  # plain BCE loss: loss = -log(p_t)
        # p_t = torch.exp(-loss)
        # loss *= self.alpha * (1.000001 - p_t) ** self.gamma  # non-zero power for gradient stability
        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
        pred_prob = torch.sigmoid(pred)  # prob from logits
        p_t = true * pred_prob + (1 - true) * (1 - pred_prob)
        alpha_factor = true * self.alpha + (1 - true) * (1 - self.alpha)
        modulating_factor = (1.0 - p_t) ** self.gamma  # the (1 - p_t)^gamma term of the focal loss
        # final loss = BCE * alpha_factor * modulating_factor, exactly the paper's formula FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
        loss *= alpha_factor * modulating_factor
        # apply the chosen reduction; default is 'mean'
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:  # 'none'
            return loss
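A minimal sketch of wrapping the base criterion, matching how ComputeLoss.__init__ below swaps in FocalLoss when fl_gamma > 0:

import torch
import torch.nn as nn

criterion = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5, alpha=0.25)
logits = torch.tensor([[4.0], [0.1]])  # an easy positive (p_t ≈ 0.98) and a hard positive (p_t ≈ 0.52)
labels = torch.ones(2, 1)
# the (1 - p_t)^gamma term shrinks the easy sample's loss far more than the hard one's,
# so training effort concentrates on hard examples
print(criterion(logits, labels))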
ComputeLoss
class ComputeLoss:
    # Compute losses
    def __init__(self, model, autobalance=False):
        super(ComputeLoss, self).__init__()
        device = next(model.parameters()).device  # get model device
        h = model.hyp  # hyperparameters

        # Define criteria: the classification loss and the objectness (confidence) loss
        # BCEcls = BCEBlurWithLogitsLoss()
        # BCEobj = BCEBlurWithLogitsLoss()
        # h['cls_pw']=1; the default positive-sample weight of BCEWithLogitsLoss is also 1
        BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device))
        BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device))

        # Label smoothing: eps=0 disables it (cp=1, cn=0); eps!=0 enables it
        # cp is the positive label value, cn the negative label value
        # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3
        self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0))  # positive, negative BCE targets

        # Focal loss: g=0 means focal loss is not used
        g = h['fl_gamma']  # focal loss gamma
        if g > 0:
            # g>0: wrap both the classification and objectness BCE losses in FocalLoss
            BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
            # alternative: BCEcls, BCEobj = QFocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

        # det: the model's detection head (Detect module) with 3 detectors, one per output feature map
        det = model.module.model[-1] if is_parallel(model) else model.model[-1]  # Detect() module
        # balance sets the objectness-loss weight of each feature map (balancing the objectness loss across the maps)
        # from left to right: large feature map (small objects) to small feature map (large objects)
        # Idea: it seems that larger output layers may overfit earlier, so those numbers may need a bit of adjustment
        # small objects are generally harder to detect, so the large feature map gets a bigger weight, biasing the model toward them
        # returns [4.0, 1.0, 0.4] if det.nl == 3, otherwise [4.0, 1.0, 0.25, 0.06, .02]
        # self.balance = {3: [4.0, 1.0, 0.4], 4: [4.0, 1.0, 0.25, 0.06], 5: [4.0, 1.0, 0.25, 0.06, .02]}[det.nl]
        self.balance = {
            3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
        # det.stride holds the downsampling rates of the three heads, [8, 16, 32]; .index(16) is the index of stride 16
        # autobalance uses this index as the reference when updating the per-feature-map objectness weights in self.balance
        # self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.1, .05])  # P3-P7
        # self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.5, 0.4, .1])  # P3-P7
        self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
        # self.BCEcls: classification loss; self.BCEobj: objectness loss; self.hyp: hyperparameters
        # self.gr: IoU ratio used to build the objectness targets; self.autobalance: auto-update the per-map objectness weights (default False)
        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, model.gr, h, autobalance
        # na: number of anchors per grid cell = 3
        # nc: number of classes in the dataset = 80
        # nl: number of detection layers (Detect outputs) = 3
        # anchors: [3, 3, 2], i.e. 3 feature maps, each with 3 anchors (w, h); anchor sizes are relative to the feature map
        for k in 'na', 'nc', 'nl', 'anchors':
            # setattr: assign getattr(det, k) to attribute k of self
            # getattr: read attribute k of det
            # i.e. copy det's k attribute onto self, for k in 'na', 'nc', 'nl', 'anchors'
            setattr(self, k, getattr(det, k))
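A standalone sketch of the balance/ssi bookkeeping above (the stride and nl values are the usual three-head setup; no model is needed):

nl, stride, autobalance = 3, [8, 16, 32], True
balance = {3: [4.0, 1.0, 0.4]}.get(nl, [4.0, 1.0, 0.25, 0.06, .02])
ssi = list(stride).index(16) if autobalance else 0
print(balance, ssi)  # [4.0, 1.0, 0.4] 1 -> the stride-16 map is the reference layer for autobalance updates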
    def __call__(self, p, targets):  # predictions, targets, model
        """
        :param p: predictions returned by the three Detect heads (the three YOLO-layer outputs)
                  a list of three tensors, e.g. [4, 3, 112, 112, 85], [4, 3, 56, 56, 85], [4, 3, 28, 28, 85]
                  i.e. [bs, anchor_num, grid_h, grid_w, xywh+obj+classes]
                  p holds a prediction for every grid cell (3 anchors per cell) of every YOLO layer,
                  so positive samples still have to be selected from it
        :param targets: ground-truth boxes after data augmentation, [63, 6] = [num_object, batch_index+class+xywh]
        :return loss * bs: total loss of the whole batch, used for backpropagation
        :return torch.cat((lbox, lobj, lcls, loss)).detach(): box, objectness, classification and total losses,
                used only for logging/visualization
        """
        device = targets.device  # device to run on
        # initialize the three losses lcls, lbox, lobj to tensor([0.])
        lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)
        # each output below is a list with one entry per feature map, holding all targets matched by that map's
        # 3 anchors (each target can be predicted by 3 grid cells)
        # tcls: class index of each target
        # tbox: xywh, where xy is the target's offset from the top-left corner of its grid cell
        # indices: b: image index of each target
        #          a: anchor index used by each target
        #          gj: y coordinate (top-left corner) of the grid cell selected to predict the target
        #          gi: x coordinate of that grid cell
        # anch: the anchor (w, h) of each target, relative to this feature map; one target may be matched to several anchors
        tcls, tbox, indices, anchors = self.build_targets(p, targets)  # targets
        # Losses
        # iterate over the predictions pi of the three feature maps
        for i, pi in enumerate(p):  # layer index, layer predictions
            b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
            # initialize the objectness targets (all negative at first; positives are filled in below)
            tobj = torch.zeros_like(pi[..., 0], device=device)  # target obj

            n = b.shape[0]  # number of targets
            if n:
                # pick exactly the predictions of image b, anchor a, grid cell (gi, gj)
                # these predictions are paired with the targets assigned to those grid cells to compute the loss
                ps = pi[b, a, gj, gi]  # prediction subset corresponding to targets

                # Regression
                # Regression loss: computed over positive samples only
                # new formulation: pxy ∈ [-0.5 + cx, 1.5 + cx], pwh ∈ [0, 4*pw]; everything in this region is a positive sample
                # Get more positive samples, accelerate convergence and be more stable
                pxy = ps[:, :2].sigmoid() * 2. - 0.5  # a rescaling of the raw output; differs from the original paper
                pwh = (ps[:, 2:4].sigmoid() * 2)
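                # Range check (sketch): sigmoid ∈ (0, 1), so pxy = sigmoid*2 - 0.5 ∈ (-0.5, 1.5) relative to the
                # grid cell, and after the squaring and anchor scaling applied in the full YOLOv7 code the predicted
                # w/h stays within (0, 4) times the matched anchor, matching the pwh = [0, 4*pw] comment above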