基于PyTorch的SSD训练SAR数据集(一)

本文篇幅过于冗长,建议有耐心者食用,代码各模块基本全覆盖,若有错误之处烦请大佬们指正!
在这里插入图片描述

一、环境:

博主用的环境是windows10+anaconda3+python3.7+pytorch+pycharm

二、SSD-Pytorch代码

源码链接:SSD-Pytorch

三、SSD训练算法代码解读(根据自己的数据集有部分修改)

这是我的文件目录
在这里插入图片描述
data/VOCdevkit存放训练数据
data/config.py默认的一些配置
data/voc0712.py重写dataset类,提取voc的数据并规则化
layers/functions/detection.py对识别的结果的数据进行部分筛选,传送给Test.py文件,供其调用使用
layers/modules/multibox_loss.py计算损失函数
utils/augmentation.py data augmentation的py文件,主要功能是扩大训练数据
weights 存放权重文件
test.py 是测试单张照片的识别
train.py 训练的Py文件
ssd.py 是模型的实现
eval.py 评估训练后的值

1.从train.py入手,分析各个部分

代码里均有详细注释,不额外解释
A.导入包

#train.py
from data import *
from utils.augmentations import SSDAugmentation     
from layers.modules import MultiBoxLoss
from ssd import build_ssd              #模型构建
import os
import sys
import time
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import torch.utils.data as data
import numpy as np
import argparse    #命令行参数解析包

B.argparse------命令行选项、参数和子命令解析器。

#train.py
parser = argparse.ArgumentParser(
    description='Single Shot MultiBox Detector Training With Pytorch')   # command-line options for training
train_set = parser.add_mutually_exclusive_group()
parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
                    type=str, help='VOC or COCO')     # dataset to train on; VOC by default
parser.add_argument('--dataset_root', default=VOC_ROOT,
                    help='Dataset root directory path')         # path to the VOC-style dataset
parser.add_argument('--basenet', default='vgg16_reducedfc.pth',
                    help='Pretrained base model')                   # pretrained backbone weights file

parser.add_argument('--batch_size', default=32, type=int,
                    help='Batch size for training')          # set according to GPU/CPU memory
parser.add_argument('--resume', default=None, type=str,       # checkpoint to resume from; None = fresh run
                    help='Checkpoint state_dict file to resume training from')
parser.add_argument('--start_iter', default=0, type=int,       # iteration to resume at; 0 = from scratch
                    help='Resume training at this iter')
parser.add_argument('--num_workers', default=0, type=int,         # data-loading workers, set per CPU count
                    help='Number of workers used in dataloading')
parser.add_argument('--cuda', default=False, type=str2bool,         # enable CUDA; False here (author has no GPU)
                    help='Use CUDA to train model')
parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                    help='initial learning rate')                       # 1e-5 per the author's setup

parser.add_argument('--momentum', default=0.99, type=float,
                    help='Momentum value for optim')                  # SGD momentum (momentum accelerates gradient descent and shortens convergence time; upstream default 0.9, 0.99 here)
parser.add_argument('--weight_decay', default=5e-4, type=float,
                    help='Weight decay for SGD')                   # L2-regularization coefficient to curb overfitting; SGD here means mini-batch gradient descent
parser.add_argument('--gamma', default=0.1, type=float,
                    help='Gamma update for SGD')                    # LR decay factor used by adjust_learning_rate
parser.add_argument('--visdom', default=False, type=str2bool,
                    help='Use visdom for loss visualization')                  # plot training losses with visdom
parser.add_argument('--save_folder', default='weights/',
                    help='Directory for saving checkpoint models')         # where checkpoints are written
args = parser.parse_args()

C.定义训练函数
其中部分名称含义:
batchsize:批大小,比如SGD中的BP一次就是用的一批的数据
iteration:迭代次数,1个iteration相当于用一个batch训练一次
epoch:训练集中全部的数据被用过一次,叫一个epoch
eg:在不考虑每个batch之间发生样本交叉的情况下,假设有100个样本,batchsize是10,那么用过全部的样本训练模型,需要1个epoch,10次iteration

#train.py
def train():
    """Build the dataset and SSD model, then run the main training loop
    for cfg['max_iter'] iterations, checkpointing every 500 iterations.

    Reads the module-level ``args`` namespace for all hyper-parameters.
    """
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because " +
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(cfg['min_dim'],
                                                          MEANS))
    elif args.dataset == 'VOC':
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(cfg['min_dim'],
                                                         MEANS))

    if args.visdom:      # live loss visualization
        import visdom
        viz = visdom.Visdom()

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])    # build the SSD model
    net = ssd_net

    if args.cuda:          # wrap for (multi-)GPU execution
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    if args.resume:            # resume an interrupted run from a checkpoint
        print('Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('Initializing weights...')
        # initialize newly added layers' weights with xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    net.train()      # switch to training mode
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0      # one epoch = one full pass over the dataset
    print('Loading the dataset...')

    epoch_size = len(dataset) // args.batch_size    # iterations per epoch
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0

    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    # shuffle=True so batches are drawn in random order during training
    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)
    # create batch iterator; one next() call yields one training batch
    batch_iterator = iter(data_loader)
    for iteration in range(args.start_iter, cfg['max_iter']):
        if args.visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

        # load train data
        # BUG FIX: the original line was missing its closing parenthesis.
        # Also restart the iterator when it is exhausted, otherwise training
        # crashes with StopIteration after one pass over the dataset.
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(data_loader)
            images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(ann.cuda(), volatile=True) for ann in targets]
        else:
            images = Variable(images)
            targets = [Variable(ann, volatile=True) for ann in targets]
        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()   # clear gradients accumulated in the previous step
        loss_l, loss_c = criterion(out, targets)  # localization + confidence losses
        loss = loss_l + loss_c
        loss.backward()   # compute gradients
        optimizer.step()   # update all parameters
        t1 = time.time()    # per-iteration timing
        loc_loss += loss_l.data[0]
        conf_loss += loss_c.data[0]

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')

        if args.visdom:
            update_vis_plot(iteration, loss_l.data[0], loss_c.data[0],
                            iter_plot, epoch_plot, 'append')

        # periodic checkpoint every 500 iterations
        if iteration != 0 and iteration % 500 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(), 'weights/ssd300_VOC_' +
                       repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(),
               args.save_folder + '' + args.dataset + '.pth')

D.学习率衰减函数:
学习率按步衰减:每当迭代数到达cfg['lr_steps']中指定的值,step_index加1,并将学习率调整为lr=args.lr*gamma**step_index(gamma默认0.1,即每次衰减为原来的十分之一)。

#train.py
def adjust_learning_rate(optimizer, gamma, step):
    """Set the learning rate of every param group to the initial LR
    (``args.lr``) decayed by ``gamma ** step``.

    Adapted from the PyTorch ImageNet example:
    https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    decayed = args.lr * gamma ** step
    for group in optimizer.param_groups:
        group['lr'] = decayed

2.训练之前先搭建网络模型,见SSD.py:

A.主干网络

#ssd.py
def vgg(cfg, i, batch_norm=False):
    """Build the modified VGG-16 backbone as a flat list of layers.

    Args:
        cfg: layer spec; ints are conv output widths, 'M' is a plain
            2x2 max-pool and 'C' a ceil_mode max-pool.
        i: number of input channels (3 for RGB images).
        batch_norm: insert BatchNorm2d after every conv when True.

    Returns:
        list of nn.Module layers (not yet wrapped in a container).
    """
    layers = []
    channels = i
    for spec in cfg:
        if spec == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        elif spec == 'C':
            # ceil_mode keeps the last row/column of odd-sized maps
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True))
        else:
            layers.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
            if batch_norm:
                layers.append(nn.BatchNorm2d(spec))
            layers.append(nn.ReLU(inplace=True))
            channels = spec
    # SSD replaces VGG's fc6/fc7 with a dilated conv6 and a 1x1 conv7;
    # dilation widens the receptive field without growing the kernel
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers.extend([pool5, conv6, nn.ReLU(inplace=True),
                   conv7, nn.ReLU(inplace=True)])
    return layers

为了后续的多尺度提取,在VGG Backbone后面添加卷积网络,网络层次如下:
在这里插入图片描述
红框的网络需要进行多尺度分析,输入到multi-box网络

#ssd.py
def add_extras(cfg, i, batch_norm=False):
    """Build the extra feature-scaling layers appended after the VGG base.

    Args:
        cfg: layer spec; ints are conv widths, 'S' marks a stride-2 conv
            whose output width is the *next* entry in cfg.
        i: input channel count (1024, the width of the converted fc7).
        batch_norm: unused, kept for interface compatibility.

    Returns:
        list of nn.Conv2d layers (activations are applied in SSD.forward).
    """
    layers = []
    in_channels = i
    use_3x3 = False  # alternates 1x1 / 3x3 kernels across layers
    for k, width in enumerate(cfg):
        if in_channels != 'S':
            kernel = 3 if use_3x3 else 1
            if width == 'S':
                # stride-2 layer halves the feature map
                layers.append(nn.Conv2d(in_channels, cfg[k + 1],
                                        kernel_size=kernel, stride=2,
                                        padding=1))
            else:
                layers.append(nn.Conv2d(in_channels, width,
                                        kernel_size=kernel))
            use_3x3 = not use_3x3
        in_channels = width
    return layers

#loc_layers的输出维度是default box的种类(4or6)*4
#conf_layers的输出维度是default box的种类(4or6)*num_class

SSD一共有6层多尺度提取的网络,每层分别对 loc 和 conf 进行卷积,得到相应的输出

#ssd.py
def multibox(vgg, extra_layers, cfg, num_classes):
    """Attach the multibox loc/conf prediction heads to the feature sources.

    BUG FIX: the original docstring was not indented under the ``def`` line,
    which raises an IndentationError when the module is imported.

    Args:
        vgg: layer list of the fc-converted VGG base.
        extra_layers: the extra layers appended after the base.
        cfg: default boxes per feature-map location, e.g. [4, 6, 6, 6, 4, 4].
        num_classes: number of classes (here: 1 ship class + background = 2).

    Returns:
        (vgg, extra_layers, (loc_layers, conf_layers)) where loc_layers are
        the per-scale regression heads (boxes * 4 outputs) and conf_layers
        the per-scale classification heads (boxes * num_classes outputs).
    """
    loc_layers = []
    conf_layers = []
    # feature sources inside the base: conv4_3 (index 21) and conv7 (index -2)
    vgg_source = [21, -2]
    for k, v in enumerate(vgg_source):
        # regression head: boxes * 4 coordinates
        loc_layers += [nn.Conv2d(vgg[v].out_channels,
                                 cfg[k] * 4, kernel_size=3, padding=1)]
        # classification head: boxes * num_classes scores
        conf_layers += [nn.Conv2d(vgg[v].out_channels,
                        cfg[k] * num_classes, kernel_size=3, padding=1)]
    # remaining sources: every second extra layer (indices 1, 3, 5, 7);
    # cfg entries from index 2 onward give their box counts
    for k, v in enumerate(extra_layers[1::2], 2):
        loc_layers += [nn.Conv2d(v.out_channels, cfg[k]
                                 * 4, kernel_size=3, padding=1)]
        conf_layers += [nn.Conv2d(v.out_channels, cfg[k]
                                  * num_classes, kernel_size=3, padding=1)]
    return vgg, extra_layers, (loc_layers, conf_layers)

# backbone configuration (interchangeable); ints are conv widths,
# 'M' / 'C' mark plain and ceil_mode max-pool layers
base = {
    '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
            512, 512, 512],
    '512': [],
}
# downsampling layers appended after the backbone; 'S' marks stride-2 convs
extras = {
    '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256],
    '512': [],
}
mbox = {
    '300': [4, 6, 6, 6, 4, 4],  # number of boxes per feature map location
    '512': [],
}


B.特征提取
由前面三个主干网络结合+后面的prior_box和detection方法得出完整的SSD类,用于提取图片特征
其中append()函数用于在列表末尾添加新的对象

#ssd.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from layers import *
from data import voc, coco
import os


class SSD(nn.Module):
    """Single Shot Multibox Architecture.

    Args:
        phase: string, either "train" or "test"
        size: input image size fed to the network
        base: layers of the (fc-converted) VGG-16 backbone
        extras: extra layers used for multi-scale feature extraction
        head: tuple of (loc branch layers, conf branch layers) -- loc
            predicts box offsets, conf predicts class scores
        num_classes: number of classes

    forward() returns the loc predictions, conf predictions and prior boxes
    (or the Detect output when phase == 'test').
    """

    def __init__(self, phase, size, base, extras, head, num_classes):
        super(SSD, self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        # pick the config matching the class count; here the dataset has
        # only one 'ship' class plus background, hence num_classes == 2
        self.cfg = (coco, voc)[num_classes == 2]
        self.priorbox = PriorBox(self.cfg)            # prior (default) box generator
        self.priors = Variable(self.priorbox.forward(), volatile=True)
        self.size = size
        # order inside an nn.ModuleList does not matter: execution order is
        # fixed by forward() (unlike nn.Sequential, which runs in order)
        self.vgg = nn.ModuleList(base)   # backbone
        # L2 normalization applied to the conv4_3 output
        self.L2Norm = L2Norm(512, 20)
        self.extras = nn.ModuleList(extras)
        # regression (loc) and classification (conf) heads
        self.loc = nn.ModuleList(head[0])
        self.conf = nn.ModuleList(head[1])

        if phase == 'test':
            # only used at prediction time
            self.softmax = nn.Softmax(dim=-1)
            self.detect = Detect(num_classes, 0, 200, 0.01, 0.45)

    def forward(self, x):
        """Run the network on input batch x and return predictions."""
        sources = list()
        loc = list()
        conf = list()

        # run the vgg base up to and including conv4_3 (layer 23)
        for k in range(23):
            x = self.vgg[k](x)

        # L2-normalize the shallow conv4_3 map before using it as a source
        s = self.L2Norm(x)
        sources.append(s)

        # conv4_3 through fc7 (remaining base layers)
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # extra layers: relu after each conv; every second output is a source
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply the loc/conf heads to every source feature map;
        # contiguous() materializes the permuted tensor so view() works later
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        # flatten per-map outputs and concatenate along dim 1
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),                   # [batch, num_priors, 4] box offsets
                self.softmax(conf.view(conf.size(0), -1,
                             self.num_classes)),                # softmax over classes per prior
                self.priors.type(type(x.data))                  # default boxes
            )
        else:
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors
            )
        return output

    def load_weights(self, base_file):
        """Load a serialized state dict (.pth or .pkl file) into this model."""
        other, ext = os.path.splitext(base_file)
        # BUG FIX: the original test `ext == '.pkl' or '.pth'` was always
        # truthy (the literal '.pth' is truthy on its own), so unsupported
        # extensions were never rejected; use a membership test instead.
        if ext in ('.pkl', '.pth'):
            print('Loading weights into state dict...')
            self.load_state_dict(torch.load(base_file,
                                 map_location=lambda storage, loc: storage))
            print('Finished!')
        else:
            print('Sorry only .pth and .pkl files supported.')

C.使用下面的函数封装,增加可读性

#ssd.py
#def build_ssd(phase, size=300, num_classes=21):
def build_ssd(phase, size=300, num_classes=2):
    """Construct an SSD model.

    Args:
        phase: 'train' or 'test'.
        size: input image size; only 300 (SSD300) is supported.
        num_classes: number of classes (here 1 ship class + background = 2).

    Returns:
        an SSD instance, or None if the arguments are invalid.
    """
    if phase not in ("test", "train"):
        print("ERROR: Phase: " + phase + " not recognized")
        return
    if size != 300:
        print("ERROR: You specified size " + repr(size) + ". However, " +
              "currently only SSD300 (size=300) is supported!")
        return
    key = str(size)
    base_, extras_, head_ = multibox(vgg(base[key], 3),
                                     add_extras(extras[key], 1024),
                                     mbox[key], num_classes)
    return SSD(phase, size, base_, extras_, head_, num_classes)

在这里插入图片描述

3.生成先验框

详细解释参考先验框描述

from __future__ import division
from math import sqrt as sqrt
from itertools import product as product
import torch


class PriorBox(object):
    """Compute prior (default) box coordinates for every feature-map cell.

    For SSD300 the total number of boxes is
    38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732.

    Args:
        cfg: SSD parameter configuration, a dict.
    """
    def __init__(self, cfg):
        super(PriorBox, self).__init__()
        self.image_size = cfg['min_dim']
        # number of priors per feature map location (either 4 or 6)
        self.num_priors = len(cfg['aspect_ratios'])
        self.variance = cfg['variance'] or [0.1]
        self.feature_maps = cfg['feature_maps']
        self.min_sizes = cfg['min_sizes']
        self.max_sizes = cfg['max_sizes']
        self.steps = cfg['steps']
        self.aspect_ratios = cfg['aspect_ratios']
        self.clip = cfg['clip']
        self.version = cfg['name']
        for v in self.variance:
            if v <= 0:
                raise ValueError('Variances must be greater than 0')

    def forward(self):
        """Return a (num_priors, 4) tensor of [cx, cy, w, h] boxes in [0, 1]."""
        mean = []  # flat list of box parameters
        # iterate over the multi-scale maps, e.g. [38, 19, 10, 5, 3, 1]
        for k, f in enumerate(self.feature_maps):
            # PERF: these three values depend only on k, so hoist them out
            # of the per-cell loop (the original recomputed them per pixel)
            f_k = self.image_size / self.steps[k]       # effective map size
            s_k = self.min_sizes[k] / self.image_size   # rel. min box size
            # side of the extra square box: sqrt(s_k * s_(k+1))
            s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size))
            # every cell of the map
            for i, j in product(range(f), repeat=2):
                # box center, relative coordinates
                cx = (j + 0.5) / f_k
                cy = (i + 0.5) / f_k

                # aspect_ratio 1 produces two square boxes
                mean += [cx, cy, s_k, s_k]
                mean += [cx, cy, s_k_prime, s_k_prime]

                # aspect ratios != 1 produce rectangular boxes (ar and 1/ar)
                for ar in self.aspect_ratios[k]:
                    mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)]
                    mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)]
        # convert to a torch tensor, one row per box
        output = torch.Tensor(mean).view(-1, 4)
        # clamp coordinates into [0, 1] (note: clipping, not normalization)
        if self.clip:
            output.clamp_(max=1, min=0)
        return output

4.损失函数

此处只贴出了代码注释,具体解释参考损失函数详解

#multilbox_loss.py
    def forward(self, predictions, targets):
        """Compute the SSD multibox loss for one batch.

        Args:
            predictions: tuple of (loc predictions, conf predictions,
                prior boxes).
            targets: list of per-image ground-truth tensors; each row is
                assumed to be [x1, y1, x2, y2, label] (coords then label).

        Returns:
            (loss_l, loss_c): smooth-L1 localization loss and cross-entropy
            confidence loss, each normalized by the number of positives.
        """
        # localization predictions, class confidences, prior boxes
        loc_data, conf_data, priors = predictions
        # batch size
        num = loc_data.size(0)
        # keep only the priors that have predictions
        priors = priors[:loc_data.size(1), :]
        # number of prior boxes
        num_priors = (priors.size(0))
        num_classes = self.num_classes

        # matching results are written into these tensors in place
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        for idx in range(num):
            # ground-truth boxes
            truths = targets[idx][:, :-1].data
            # ground-truth labels
            labels = targets[idx][:, -1].data
            # prior (default) boxes
            defaults = priors.data
            # match() assigns the best prior to every ground truth and the
            # best ground truth to every prior, filling loc_t / conf_t
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)

        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()
        # wrap the targets; no gradient flows through them
        loc_t = Variable(loc_t, requires_grad=False)
        conf_t = Variable(conf_t, requires_grad=False)
        # conf_t > 0 marks priors matched to an object (positives)
        pos = conf_t > 0
        # number of positives per image
        num_pos = pos.sum(dim=1, keepdim=True)

        # localization loss (smooth L1), computed on positives only
        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)  # predicted offsets
        loc_t = loc_t[pos_idx].view(-1, 4)  # target offsets
        loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)  # regression is w.r.t. default-box offsets

        # flatten confidences for the per-prior loss below
        batch_conf = conf_data.view(-1, self.num_classes)
        # softmax cross-entropy per prior, written as
        # log-sum-exp of all scores minus the score of the true class
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # exclude positives from the negative ranking
        # double argsort yields each element's rank in descending-loss order
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        # positives per image
        num_pos = pos.long().sum(1, keepdim=True)
        # cap negatives at negpos_ratio * positives
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # gather confidences for positives plus the selected hard negatives
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weighted = conf_t[(pos+neg).gt(0)]
        loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        N = num_pos.data.sum()
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c

  • 5
    点赞
  • 27
    收藏
    觉得还不错? 一键收藏
  • 10
    评论
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值