Towards-Realtime-MOT源代码学习之YOLOLayer中的forward()函数

本文链接：https://blog.csdn.net/Ji_HON/article/details/122023856

本文详细介绍了在PyTorch的Darknet模型中，如何计算置信度、目标框和ID的损失，包括batchsize调整后的变量输出、网格创建、锚点计算，以及losses的计算步骤。重点讲解了SoftmaxLoss、SmoothL1Loss和IDLoss的运用及其权重调整策略。

摘要由CSDN通过智能技术生成

该函数用于计算置信度、目标框和id的损失。训练时，该函数在models.py中的Darknet()类中的forward()函数中调用

将batchsize改为1，依次输出各变量的值

def forward(self, p_cat,  img_size, targets=None, classifier=None, test_emb=False):
    p, p_emb = p_cat[:, :24, ...], p_cat[:, 24:, ...]
    nB, nGh, nGw = p.shape[0], p.shape[-2], p.shape[-1]

p_cat.shape=torch.Size([1, 536, 10, 18])，为模型的输出
p.shape=torch.Size([1, 24, 10, 18])
p_emb.shape=torch.Size([1, 512, 10, 18])，为embedding部分
nB=1，表示batchsize
nGh=10（即p.shape[-2]，特征图的高）
nGw=18（即p.shape[-1]，特征图的宽）

if self.img_size != img_size:
    create_grids(self, img_size, nGh, nGw)

进入create_grids()函数后，该函数的定义如下：

def create_grids(self, img_size, nGh, nGw):
    """Precompute the cell-offset grid and stride-normalised anchors.

    Writes four attributes on ``self``:

    * ``stride``     -- ``img_size[0] / nGw``.
    * ``grid_xy``    -- float tensor of shape ``(1, 1, nGh, nGw, 2)`` whose
      last axis holds ``(column index, row index)`` of each grid cell.
    * ``anchor_vec`` -- ``self.anchors`` divided by ``stride``.
    * ``anchor_wh``  -- ``anchor_vec`` viewed as ``(1, self.nA, 1, 1, 2)``.

    Args:
        img_size: indexable pair; element 0 is divided by ``nGw`` and
            element 1 by ``nGh``.
        nGh: number of grid rows.
        nGw: number of grid columns.

    Raises:
        AssertionError: if ``img_size[1] / nGh`` differs from the stride
            derived from ``img_size[0] / nGw``.
    """
    stride = img_size[0] / nGw
    # The stride is stored before the consistency check, so it stays set
    # on the object even when the check below fails.
    self.stride = stride
    assert stride == img_size[1] / nGh, \
            "{} v.s. {}/{}".format(stride, img_size[1], nGh)

    # Per-cell xy offsets: broadcast the column / row indices over the full
    # (nGh, nGw) grid, then pair them up along a new trailing axis.
    col_idx = torch.arange(nGw).float()
    row_idx = torch.arange(nGh).float()
    full_shape = (1, 1, nGh, nGw)
    x_offsets = col_idx.view(1, 1, 1, nGw).expand(*full_shape)
    y_offsets = row_idx.view(1, 1, nGh, 1).expand(*full_shape)
    self.grid_xy = torch.stack((x_offsets, y_offsets), 4)

    # Anchor sizes expressed in grid-cell units, plus a view shaped for
    # broadcasting against per-anchor, per-cell predictions.
    anchors_in_cells = self.anchors / stride
    self.anchor_vec = anchors_in_cells
    self.anchor_wh = anchors_in_cells.view(1, self.nA, 1, 1, 2)

img_size={list: 2} [576, 320]
self.stride=576/18=32.0
grid_x.shape=torch.Size([1, 1, 10, 18])
grid_y.shape=torch.Size([1, 1, 10, 18])
self.grid_xy.shape = torch.Size([1, 1, 10, 18, 2])
self.anchors=tensor([ [ 85., 255.], [120., 360.], [170., 420.], [340., 320.]])
self.anchor_vec=tensor([ [ 2.6562, 7.9688],[ 3.7500, 11.2500],[ 5.3125, 13.1250],[10.6250, 10.0000] ])
self.nA=4
self.anchor_wh = self.anchor_vec.view(1, self.nA, 1, 1, 2)=tensor([[[[[ 2.6562, 7.9688]]],[[[ 3.7500, 11.2500]]],[[[ 5.3125, 13.1250]]],[[[10.6250, 10.0000]]]]])

if p.is_cuda:
    self.grid_xy = self.grid_xy.cuda()
    self.anchor_wh = self.anchor_wh.cuda()

把数据放到GPU上

# prediction
p = p.view(nB, self.nA, self.nC + 5, nGh, nGw).permute(0, 1, 3, 4, 2).contiguous()
        
p_emb = p_emb.permute(0,2,3,1).contiguous()# p_emb包含embedding信息
p_box = p[..., :4] # p_box包含检测框位置信息
p_conf = p[..., 4:6].permute(0, 4, 1, 2, 3)  # 包含前景背景分类置信度的p_conf

self.nC=1
p.shape=torch.Size([1, 4, 10, 18, 6])
p_emb.shape = torch.Size([1, 10, 18, 512])
p_box.shape = torch.Size([1, 4, 10, 18, 4])
p_conf.shape =torch.Size([1, 2, 4, 10, 18])

if targets is not None:
    if test_emb:
        tconf, tbox, tids = build_targets_max(targets, self.anchor_vec.cuda(), 
                            self.nA, self.nC, nGh, nGw)
    else:
        tconf, tbox, tids = build_targets_thres(targets, self.anchor_vec.cuda(), 
                            self.nA, self.nC, nGh, nGw)
    tconf, tbox, tids = tconf.cuda(), tbox.cuda(), tids.cuda()
    mask = tconf > 0

这里调用了build_targets_thres()函数，build_targets_thres()函数的运行过程参考：Towards-Realtime-MOT源代码学习之build_targets_thres()函数_Ji_HON的博客-CSDN博客
tconf = tensor([[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

...

[[0., 0., 0., 0.],
...,
[0., 0., 0., 0.]]]]], device='cuda:0')
tids = tensor([[[[[0., 0., 0., 0.],
...,
[0., 0., 0., 0.]],

[[0., 0., 0., 0.],
...,
[0., 0., 0., 0.]]]]], device='cuda:0')
然后将tconf, tbox, tids放到GPU上。
mask.shape=torch.Size([1, 4, 10, 18])，里面的值全为False

# Compute losses计算检测框回归损失和前景背景分类损失
nT = sum([len(x) for x in targets])  # number of targets
nM = mask.sum().float()  # number of anchors (assigned to targets)
nP = torch.ones_like(mask).sum().float()
if nM > 0:
    lbox = self.SmoothL1Loss(p_box[mask], tbox[mask]) # 计算目标框的损失
else:
    FT = torch.cuda.FloatTensor if p_conf.is_cuda else torch.FloatTensor
    lbox, lconf =  FT([0]), FT([0])
lconf =  self.SoftmaxLoss(p_conf, tconf) # 计算置信度的损失
lid = torch.Tensor(1).fill_(0).squeeze().cuda()
emb_mask,_ = mask.max(1)

nT = 36
nM = tensor(0., device='cuda:0')
nP = tensor(720., device='cuda:0')
进入else条件语句：
FT=<class 'torch.cuda.FloatTensor'>
lbox = lconf = tensor([0.], device='cuda:0')
lconf = self.SoftmaxLoss(p_conf, tconf)=tensor(0.6107, device='cuda:0', grad_fn=<NllLoss2DBackward>)，这里的self.SoftmaxLoss为交叉熵损失函数nn.CrossEntropyLoss
lid = tensor(0., device='cuda:0')
emb_mask.shape= torch.Size([1, 10, 18])，其中的值全为False
_.shape= torch.Size([1, 10, 18])，其中的值全为0

#embedding损失的计算 we use max(1) to decide the id,  TODO: more reseanable strategy
tids,_ = tids.max(1) 
tids = tids[emb_mask]
embedding = p_emb[emb_mask].contiguous()
embedding = self.emb_scale * F.normalize(embedding)
nI = emb_mask.sum().float()
            
if  test_emb:
    if np.prod(embedding.shape)==0  or np.prod(tids.shape) == 0:
        return torch.zeros(0, self.emb_dim+1).cuda()
    emb_and_gt = torch.cat([embedding, tids.float()], dim=1)
    return emb_and_gt
            
if len(embedding) > 1:
    logits = classifier(embedding).contiguous()
    lid =  self.IDLoss(logits, tids.squeeze())#计算id的损失

执行tids,_ = tids.max(1)（沿anchor维度取最大值）之后，tids.shape=torch.Size([1, 10, 18, 1])，其中的值全为-1
tids = tids[emb_mask]=tensor([], device='cuda:0', size=(0, 1), dtype=torch.int64)
embedding = tensor([], device='cuda:0', size=(0, 512), grad_fn=<IndexBackward>)
self.emb_scale = 10.467784747196607
embedding = tensor([], device='cuda:0', size=(0, 512), grad_fn=<MulBackward0>)
nI = tensor(0., device='cuda:0')
这里不进入第一个if条件语句，也不进入第二个条件语句

# 求总的损失
loss = torch.exp(-self.s_r)*lbox + torch.exp(-self.s_c)*lconf + \
       torch.exp(-self.s_id)*lid + (self.s_r + self.s_c + self.s_id)
loss *= 0.5

self.s_r = tensor([-4.8500], device='cuda:0')
self.s_c = tensor([-4.1500], device='cuda:0')
self.s_id = tensor([-2.3000], device='cuda:0')
torch.exp(-self.s_r) = torch.exp(torch.tensor([4.8500]))= tensor([127.7404])，即e^(4.8500)（注意self.s_r本身是负数，取负号后指数为正）
torch.exp(-self.s_c) = torch.exp(torch.tensor([4.1500]))= tensor([63.4340])
torch.exp(-self.s_id) = torch.exp(torch.tensor([2.3000]))= tensor([9.9742])
验证：lbox=0、lid=0，故loss = 63.4340*0.6107 + (-4.85-4.15-2.30) ≈ 27.44，与下面的输出一致
loss = tensor([27.4388], device='cuda:0', grad_fn=<AddBackward0>)
loss * 0.5 = tensor([13.7194], device='cuda:0', grad_fn=<MulBackward0>)

return loss, loss.item(), lbox.item(), lconf.item(), lid.item(), nT

最后返回

else:
    p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1)
    p_emb = F.normalize(p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous(),
             dim=-1)
    #p_emb_up = F.normalize(shift_tensor_vertically(p_emb, -self.shift[self.layer]),
                dim=-1)
    #p_emb_down = F.normalize(shift_tensor_vertically(p_emb, self.shift[self.layer]),
                    dim=-1)
    p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda()               # Temp
    p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1)
    #p = torch.cat([p_box, p_conf, p_cls, p_emb, p_emb_up, p_emb_down], dim=-1)
    p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p))
    p[..., :4] *= self.stride

    return p.view(nB, -1, p.shape[-1])

如果targets为None（即调用时没有传入真实目标，例如推理阶段），则进入else条件语句，返回解码并乘以stride后的预测结果p