YOLOv3代码复现

最新推荐文章于 2023-11-01 21:28:39 发布

WALL-SQ

最新推荐文章于 2023-11-01 21:28:39 发布

阅读量3.2k

点赞数 1

分类专栏：深度学习网络复现文章标签： YOLO

本文链接：https://blog.csdn.net/qq_39304630/article/details/121373526

版权

深度学习网络复现专栏收录该内容

4 篇文章 7 订阅

订阅专栏

1. 数据集：包括数据集选取与数据增强方案确定

笔者使用COCO2014数据集进行Darknet-53的预训练，使用VOC2007+2012的混合数据集进行目标检测的训练[有条件的同学当然也可以使用ImageNet数据集预训练，再用COCO2014进行目标检测的训练]。

笔者这么安排的目的在于，对骨干网络预训练的时候，我们需要训练的是提取特征的能力，因此数据越丰富越好，模型可以从丰富的数据中挖掘共性，提高泛化能力。而在进行目标检测的任务训练时，我们需要借助骨干网络抽取的特征来完成目标检测任务，此时是特定任务，数据集可以小一些，并且此时的数据不再只是单纯的分类数据，而是着重检测任务本身带有的bounding box与box对应的类别。

A.预训练部分

Darknet-53定义：

import torch
import torch.nn as nn

class CBL(nn.Module):
    """Conv2d -> BatchNorm2d -> LeakyReLU(0.1) building block.

    The convolution is created without a bias term. ``kernal_size`` (sic),
    ``stride`` and ``padding`` are forwarded positionally to ``nn.Conv2d``;
    ``inplace`` is forwarded to ``nn.LeakyReLU``.
    """

    def __init__(self, in_channels, out_channels, kernal_size, stride, padding, inplace=True):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels, out_channels, kernal_size, stride, padding, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1, inplace=inplace),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the conv / batch-norm / activation stack."""
        return self.conv(x)

    def weight_init(self):
        """Kaiming-normal init for conv weights; batch-norm scale=1, shift=0."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                torch.nn.init.kaiming_normal_(module.weight.data)

class ResUnit(nn.Module):
    """Two stacked ``CBL`` blocks whose output is summed with a 1x1-conv shortcut.

    The shortcut is a plain ``nn.Conv2d`` (kernel 1, stride 1, padding 0,
    default bias) mapping ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, kernal_size = 3, stride = 1, padding = 1):
        super().__init__()
        first_cbl = CBL(in_channels, out_channels, kernal_size, stride, padding)
        second_cbl = CBL(out_channels, out_channels, kernal_size, stride, padding)
        self.conv_feature = nn.Sequential(first_cbl, second_cbl)
        # Attribute name (sic) is kept: it is part of the state_dict keys.
        self.conv_redisual = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Return ``conv_feature(x) + conv_redisual(x)``."""
        shortcut = self.conv_redisual(x)
        features = self.conv_feature(x)
        return torch.add(features, shortcut)

    def weight_init(self):
        """Delegate to ``CBL.weight_init`` and Kaiming-init every ``nn.Conv2d`` weight."""
        for module in self.modules():
            if isinstance(module, CBL):
                module.weight_init()
                continue
            if isinstance(module, nn.Conv2d):
                torch.nn.init.kaiming_normal_(module.weight.data)


class ResX(nn.Module):
    """A ``CBL`` followed by a ``ResUnit``.

    Parameters suffixed ``_1`` configure the leading ``CBL``; parameters
    suffixed ``_2`` configure the ``ResUnit`` that consumes its output.
    """

    def __init__(self, in_channels, out_channels_1, kernal_size_1, stride_1, padding_1, out_channels_2, kernal_size_2, stride_2, padding_2):
        super().__init__()
        leading_cbl = CBL(in_channels, out_channels_1, kernal_size_1, stride_1, padding_1)
        residual = ResUnit(out_channels_1, out_channels_2, kernal_size_2, stride_2, padding_2)
        self.conv = nn.Sequential(leading_cbl, residual)

    def forward(self, x):
        """Apply the CBL and then the residual unit."""
        return self.conv(x)

    def weight_init(self):
        """Call ``weight_init`` on every nested ``CBL`` and ``ResUnit``."""
        for module in self.modules():
            if isinstance(module, (CBL, ResUnit)):
                module.weight_init()

class DarkNet53(nn.Module):
    """Darknet-53 style classification backbone.

    Layout: a two-``CBL`` stem (the second one has stride 2), one ``ResX``
    block, then four stages that each start with a stride-2 ``CBL`` followed
    by 2, 8, 8 and 4 ``ResX`` blocks. Global average pooling and a linear
    layer produce ``class_num`` logits.
    """

    def __init__(self, class_num):
        super().__init__()

        self.conv_pre = nn.Sequential(
            CBL(3, 32, 3, 1, 1),
            CBL(32, 64, 3, 2, 1),
        )

        self.Res_1_64 = ResX(64, 32, 1, 1, 0, 64, 3, 1, 1)
        self.Res_2_128 = self._make_stage(64, 128, 2)
        self.Res_8_256 = self._make_stage(128, 256, 8)
        self.Res_8_512 = self._make_stage(256, 512, 8)
        self.Res_4_1024 = self._make_stage(512, 1024, 4)

        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
        self.predict = nn.Linear(1024, class_num)

    @staticmethod
    def _make_stage(in_channels, out_channels, repeats):
        """Build one stage: a stride-2 3x3 ``CBL`` then ``repeats`` ``ResX`` blocks."""
        blocks = [CBL(in_channels, out_channels, 3, 2, 1)]
        blocks.extend(
            ResX(out_channels, out_channels // 2, 1, 1, 0, out_channels, 3, 1, 1)
            for _ in range(repeats)
        )
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Return class logits of shape (batch, class_num) for an image batch."""
        stages = (
            self.conv_pre,
            self.Res_1_64,
            self.Res_2_128,
            self.Res_8_256,
            self.Res_8_512,
            self.Res_4_1024,
            self.global_pooling,
        )
        for stage in stages:
            x = stage(x)
        # (batch, 1024, 1, 1) -> (batch, 1024)
        x = torch.flatten(x, start_dim=1, end_dim=3)
        return self.predict(x)

    def weight_init(self):
        """Initialise nested ``ResX`` blocks and Kaiming-init conv / linear weights."""
        for module in self.modules():
            if isinstance(module, ResX):
                module.weight_init()
            elif isinstance(module, (nn.Conv2d, nn.Linear)):
                torch.nn.init.kaiming_normal_(module.weight.data)

B.COCO数据集用于分类，定义数据集类

coco_classify.py

import cv2
import os
import time
import random
import imagesize
import numpy as np
from utils import image
from torch.utils.data import Dataset
import torchvision.transforms as transforms

class coco_classify(Dataset):
    """Classification dataset made of object crops taken from detection labels.

    Every label file in ``txts_path`` is read line by line; each line is
    expected to hold ``xmin ymin xmax ymax class_id`` separated by whitespace.
    Boxes whose class id is ``>= class_num`` or whose width or height is below
    ``edge_threshold`` pixels are dropped, and images left without any box are
    not indexed.

    ``__getitem__`` returns ``(image_tensor, class_id)`` for one randomly
    chosen box of the requested image.
    """

    # Gain applied to the V (brightness) or S (saturation) channel when augmenting.
    _HSV_GAIN = 1.5

    def __init__(self,imgs_path = "../DataSet/COCO2017/Train/Images", txts_path = "../DataSet/COCO2017/Train/Labels", is_train = True, edge_threshold=200, class_num=80, input_size=256):  # input_size: side length of the network input
        self.is_train = is_train
        self.input_size = input_size

        self.transform_common = transforms.Compose([
            transforms.ToTensor(),  # height * width * channel -> channel * height * width
            transforms.Normalize(mean=(0.408, 0.448, 0.471), std=(0.242, 0.239, 0.234))
        ])

        self.train_data = []  # entries are [img_path, [[xmin, ymin, xmax, ymax, class_id], ...]]

        for label_name in os.listdir(txts_path):
            img_path = os.path.join(imgs_path, label_name.replace(".txt", ".jpg"))
            txt_path = os.path.join(txts_path, label_name)

            coords = []
            with open(txt_path, 'r') as label_txt:
                for line in label_txt:
                    fields = line.split()
                    if len(fields) < 5:
                        # Blank or truncated line: nothing to parse.
                        continue

                    class_id = int(fields[4])
                    if class_id >= class_num:
                        continue

                    xmin = round(float(fields[0]))
                    ymin = round(float(fields[1]))
                    xmax = round(float(fields[2]))
                    ymax = round(float(fields[3]))

                    if (xmax - xmin) < edge_threshold or (ymax - ymin) < edge_threshold:
                        continue

                    coords.append([xmin, ymin, xmax, ymax, class_id])

            if coords:
                self.train_data.append([img_path, coords])

    @classmethod
    def _scale_hsv_channel(cls, img, channel):
        """Return a copy of BGR ``img`` with one HSV channel multiplied by the gain.

        The scaled channel is clipped to [0, 255] before the cast back to
        uint8, so bright or saturated pixels saturate instead of wrapping.
        Assumes ``img`` is an 8-bit BGR image (what ``cv2.imread`` yields) --
        TODO confirm the project resize helper preserves that.
        """
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        scaled = hsv[..., channel].astype(np.float32) * cls._HSV_GAIN
        hsv[..., channel] = np.clip(scaled, 0, 255).astype(np.uint8)
        return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

    def __getitem__(self, item):
        img_path, coords = self.train_data[item]
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("cannot read image: {}".format(img_path))

        # The global RNG is deliberately NOT re-seeded here: seeding with the
        # current second gave every sample fetched within that second the same
        # box choice and the same augmentation.
        xmin, ymin, xmax, ymax, class_index = random.choice(coords)

        # Negative indices would slice from the far edge of the image.
        img = img[max(ymin, 0): ymax, max(xmin, 0): xmax]
        if img.size == 0:
            raise ValueError("empty crop {} for image {}".format([xmin, ymin, xmax, ymax], img_path))

        img = image.resize_image_without_annotation(img, self.input_size, self.input_size)

        if self.is_train:
            transform_seed = random.randint(0, 2)
            if transform_seed == 1:  # brightness (the YOLO paper calls it exposure)
                img = self._scale_hsv_channel(img, 2)
            elif transform_seed == 2:  # saturation
                img = self._scale_hsv_channel(img, 1)
            # transform_seed == 0 keeps the resized crop unchanged.

        img = self.transform_common(img)
        return img, class_index

    def __len__(self):
        return len(self.train_data)

C.训练

Darknet_Pre_Train.py

#---------------step0:Common Definition-----------------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Train on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    # Let cuDNN benchmark convolution algorithms for the observed input shapes.
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device('cpu')

# ---- Hyper-parameters of the Darknet-53 classification pre-training ----
lr = 3e-4  # SGD learning rate
img_size = 256  # not referenced in the visible part of this script
momentum = 0.9  # SGD momentum
batch_size = 16  # samples per DataLoader batch
epoch_num = 1000  # the training loop stops once `epoch` reaches this value
weight_decay = 5e-4  # SGD weight decay
min_val_loss = 9999999999  # sentinel: any real validation loss is smaller
epoch_interval = 1  # a checkpoint is written every `epoch_interval` epochs
class_num = 80  # size of the classifier output
num_workers = 4  # DataLoader worker processes

def accuracy(output, target, topk=(1, 5)):
    """Compute top-k accuracy for every ``k`` in ``topk``.

    :param output: score tensor indexed as (sample, class)
    :param target: tensor with one class index per sample
    :param topk: iterable of ``k`` values to evaluate
    :return: list with one shape-(1,) tensor per ``k`` holding the fraction of
             samples whose target is among the ``k`` highest scores
    """
    largest_k = max(topk)
    sample_count = target.size(0)

    # Indices of the `largest_k` best classes, transposed to (k, sample).
    top_indices = output.topk(largest_k, 1, True, True)[1].t()
    hits = top_indices.eq(target.view(1, -1).expand_as(top_indices)).contiguous()

    return [
        hits[:k].view(-1).float().sum(0, keepdim=True) / sample_count
        for k in topk
    ]

#---------------step1:Dataset-------------------
# Alternative data sources, kept for reference:
#from ImageNet_DataSet import ImageNetMini
#dataSet = ImageNetMini(dataSetDir="../DataSet/imagenet-mini/train",classesFilePath="../DataSet/imagenet-mini/classDict.pth", img_size=256)
#from VOC_Classify import voc_classify
#dataSet = voc_classify(imgs_path="../DataSet/VOC2007+2012/Train/JPEGImages", annotations_path="../DataSet/VOC2007+2012/Train/Annotations", classes_file="../DataSet/VOC2007+2012/Train/class.data")
# NOTE(review): the article names the dataset file coco_classify.py while this
# import uses COCO_Classify -- confirm the module name on case-sensitive systems.
from COCO_Classify import  coco_classify
# is_train=True enables the random augmentation branch of the dataset.
train_dataSet = coco_classify(imgs_path="../DataSet/COCO2017/Train/Images", txts_path= "../DataSet/COCO2017/Train/Labels", is_train=True)
val_dataSet = coco_classify(imgs_path="../DataSet/COCO2017/Val/Images", txts_path= "../DataSet/COCO2017/Val/Labels", is_train=False)
#---------------step2:Model-------------------
from DarkNet53 import DarkNet53
darkNet53 = DarkNet53(class_num=class_num).to(device=device)
darkNet53.weight_init()

#---------------step3:LossFunction-------------------
loss_function = nn.CrossEntropyLoss().to(device=device)

#---------------step4:Optimizer-------------------
import torch.optim as optim
#optimizer_Adam = optim.Adam(darkNet53.parameters(),lr=lr,weight_decay=weight_decay)
optimizer = optim.SGD(darkNet53.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
# Optional cosine-annealing learning-rate schedules (currently disabled):
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer_Adam , T_max=20, eta_min=1e-4, last_epoch=-1)
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer_Adam, T_0=2, T_mult=2)

#--------------step5:Tensorboard Feature Map------------
import torchvision.utils as vutils
def feature_map_visualize(img_data, writer):
    """Log intermediate activations of ``darkNet53`` as image grids.

    ``img_data`` gets a leading dimension added and is then passed, in
    ``darkNet53.modules()`` order, through every module of one of the listed
    layer types. After each such module the output is written to ``writer``
    under the tag ``feature_map_<module index>``.

    :param img_data: input tensor without the leading dimension
    :param writer: object exposing ``add_image(tag, img_tensor)``
    """
    # NOTE(review): the network's activations are nn.LeakyReLU, which is not in
    # this tuple, so activations are skipped -- confirm whether that is intended.
    visualized_types = (nn.Conv2d, nn.BatchNorm2d, nn.ReLU, nn.MaxPool2d, nn.AdaptiveAvgPool2d)

    img_data = img_data.unsqueeze(0)
    for i, m in enumerate(darkNet53.modules()):
        if not isinstance(m, visualized_types):
            continue
        img_data = m(img_data)
        # Swap the first two dimensions so make_grid tiles one image per channel.
        img_grid = vutils.make_grid(img_data.transpose(0, 1), normalize=True, scale_each=True)
        writer.add_image('feature_map_' + str(i), img_grid)

#---------------step6:Train-------------------
from tqdm import tqdm
from tensorboardX import SummaryWriter
if __name__ == "__main__":
    # Entry point of the Darknet-53 classification pre-training: one training
    # pass and one validation pass per epoch, periodic checkpoints, and
    # TensorBoard logging.
    import copy
    import os

    epoch = 0

    param_dict = {}

    # torch.save does not create missing directories.
    os.makedirs('./weights', exist_ok=True)

    writer = SummaryWriter(logdir='./log', filename_suffix=' [' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')

    # The datasets do not change between epochs, so the loaders are built once.
    train_loader = DataLoader(dataset=train_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                            pin_memory=True)
    train_len = len(train_loader)
    val_len = len(val_loader)

    while epoch < epoch_num:

        epoch_train_loss = 0
        epoch_val_loss = 0
        epoch_train_top1_acc = 0
        epoch_train_top5_acc = 0
        epoch_val_top1_acc = 0
        epoch_val_top5_acc = 0

        # ------------------------------ train ------------------------------
        darkNet53.train()
        with tqdm(total=train_len) as tbar:

            for batch_index, batch_train in enumerate(train_loader):
                train_data = batch_train[0].float().to(device=device, non_blocking=True)
                label_data = batch_train[1].long().to(device=device, non_blocking=True)
                net_out = darkNet53(train_data)
                loss = loss_function(net_out, label_data)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                # The loss is a batch mean; weight it by the real batch size so
                # the last, possibly smaller, batch is not over-counted.
                batch_loss = loss.item() * label_data.size(0)
                epoch_train_loss = epoch_train_loss + batch_loss

                # accuracy
                net_out = net_out.detach()
                [top1_acc, top5_acc] = accuracy(net_out, label_data)
                top1_acc = top1_acc.item()
                top5_acc = top5_acc.item()

                epoch_train_top1_acc = epoch_train_top1_acc + top1_acc
                epoch_train_top5_acc = epoch_train_top5_acc + top5_acc

                tbar.set_description(
                    "train: class_loss:{} top1-acc:{} top5-acc:{}".format(loss.item(), round(top1_acc, 4),
                                                                          round(top5_acc, 4)),
                    refresh=True)
                tbar.update(1)

                # feature_map_visualize(train_data[0][0], writer)
            print(
                "train-mean: batch_loss:{} batch_top1_acc:{} batch_top5_acc:{}".format(
                    round(epoch_train_loss / train_len, 4),
                    round(epoch_train_top1_acc / train_len, 4),
                    round(epoch_train_top5_acc / train_len, 4)))

        # lr_reduce_scheduler.step()

        # ---------------------------- validation ---------------------------
        darkNet53.eval()
        with tqdm(total=val_len) as tbar:
            with torch.no_grad():
                for batch_index, batch_train in enumerate(val_loader):
                    train_data = batch_train[0].float().to(device=device, non_blocking=True)
                    label_data = batch_train[1].long().to(device=device, non_blocking=True)
                    net_out = darkNet53(train_data)
                    loss = loss_function(net_out, label_data)
                    batch_loss = loss.item() * label_data.size(0)
                    epoch_val_loss = epoch_val_loss + batch_loss

                    # accuracy
                    net_out = net_out.detach()
                    [top1_acc, top5_acc] = accuracy(net_out, label_data)
                    top1_acc = top1_acc.item()
                    top5_acc = top5_acc.item()

                    epoch_val_top1_acc = epoch_val_top1_acc + top1_acc
                    epoch_val_top5_acc = epoch_val_top5_acc + top5_acc

                    tbar.set_description(
                        "val: class_loss:{} top1-acc:{} top5-acc:{}".format(loss.item(), round(top1_acc, 4),
                                                                            round(top5_acc, 4)),
                        refresh=True)
                    tbar.update(1)

            print(
                "val-mean: batch_loss:{} batch_top1_acc:{} batch_top5_acc:{}".format(
                    round(epoch_val_loss / val_len, 4),
                    round(epoch_val_top1_acc / val_len, 4),
                    round(epoch_val_top5_acc / val_len, 4)))
        epoch = epoch + 1

        if min_val_loss > epoch_val_loss:
            min_val_loss = epoch_val_loss
            param_dict['min_val_loss'] = min_val_loss
            # state_dict() returns references to the live parameters; without a
            # deep copy this entry would silently follow later training steps.
            param_dict['min_loss_model'] = copy.deepcopy(darkNet53.state_dict())

        if epoch % epoch_interval == 0:
            param_dict['model'] = darkNet53.state_dict()
            # The optimizer object itself is stored (not its state_dict) to keep
            # the checkpoint layout that existing loading code may expect.
            param_dict['optim'] = optimizer
            param_dict['epoch'] = epoch
            torch.save(param_dict, './weights/Darknet-53_' + str(epoch) + '.pth')
            writer.close()
            writer = SummaryWriter(logdir='log', filename_suffix='[' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
        print("epoch : {} ; train-loss : {}".format(epoch, epoch_train_loss))

        # NOTE(review): the tag says "_grad" but the parameter values are what
        # gets logged; the tag is kept so existing TensorBoard runs stay comparable.
        for i, (name, layer) in enumerate(darkNet53.named_parameters()):
            if 'bn' not in name:
                writer.add_histogram(name + '_grad', layer, epoch)

        writer.add_scalar('Train/Loss_sum', epoch_train_loss, epoch)
        writer.add_scalar('Val/Loss_sum', epoch_val_loss, epoch)
    writer.close()

2. 聚类得到Anchor尺度

A.k-means

anchor_k_means.py

import numpy as np

def iou(cluster, boxes):
    """IoU between one (width, height) pair and every row of ``boxes``.

    Only sizes are compared: the intersection is computed as
    ``min(widths) * min(heights)``, i.e. as if all boxes shared one corner.

    :param cluster: indexable with width at [0] and height at [1]
    :param boxes: array of shape (r, 2) with one (width, height) per row
    :return: array with one IoU value per row of ``boxes``
    """
    box_w, box_h = boxes[:, 0], boxes[:, 1]
    overlap = np.minimum(cluster[0], box_w) * np.minimum(cluster[1], box_h)
    union = cluster[0] * cluster[1] + box_w * box_h - overlap
    return overlap / union

def kmeans(boxes, k, dist=np.median, seed=1):
    """Cluster box sizes with k-means, using ``1 - IoU`` as the distance.

    :param boxes: numpy array of shape (r, 2), one (width, height) per row
    :param k: number of clusters
    :param dist: reducer that turns the members of a cluster into its new
                 centre; it is called as ``dist(members, axis=0)``
    :param seed: value passed to ``np.random.seed`` before the initial centres
                 are sampled (this re-seeds NumPy's global generator)
    :return: tuple ``(clusters, nearest_clusters, distances)`` -- the (k, 2)
             centres, the index of the nearest centre for every box, and the
             (r, k) matrix of ``1 - IoU`` distances
    """
    rows = boxes.shape[0]  # number of samples

    # distances[row][c]: distance from sample `row` to cluster centre `c`
    distances = np.empty((rows, k))
    last_clusters = np.zeros((rows,))

    np.random.seed(seed)

    # Initial centres: k distinct rows drawn uniformly from `boxes`.
    clusters = boxes[np.random.choice(rows, k, replace=False)]

    while True:
        # Assign every box to the centre it overlaps most.
        for icluster in range(k):
            distances[:, icluster] = 1 - iou(clusters[icluster], boxes)

        nearest_clusters = np.argmin(distances, axis=1)
        # Converged once no box changes its cluster.
        if (last_clusters == nearest_clusters).all():
            break

        # Move every centre to the reduction (median by default) of its members.
        for cluster in range(k):
            members = boxes[nearest_clusters == cluster]
            if len(members) == 0:
                # Reducing an empty cluster yields NaN, which would poison the
                # centre (or fail to cast for integer boxes); keep the old one.
                continue
            clusters[cluster] = dist(members, axis=0)

        last_clusters = nearest_clusters

    return clusters, nearest_clusters, distances


import os
import cv2
import time
import image
target_size = 608  # side length every image is resized to before measuring boxes
k = 9  # number of anchor shapes to produce
txts_path = "../DataSet/COCO2017/Train/Labels"
imgs_path = "../DataSet/COCO2017/Train/Images"
txts_name = os.listdir(txts_path)
bounding_boxes = []  # one [width, height] entry per labelled box, in resized pixels

for txt_name in txts_name:
    img_path = os.path.join(imgs_path, txt_name.replace(".txt", ".jpg"))
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None for a missing or unreadable file.
        print("skip {}: image cannot be read".format(img_path))
        continue

    coords = []
    with open(os.path.join(txts_path, txt_name), 'r') as file:
        for line_context in file:
            line_context = line_context.split()
            if len(line_context) < 5:
                # Blank or truncated label line.
                continue

            class_id = int(line_context[4])
            xmin = round(float(line_context[0]))
            ymin = round(float(line_context[1]))
            xmax = round(float(line_context[2]))
            ymax = round(float(line_context[3]))
            coords.append([xmin, ymin, xmax, ymax, class_id])
    img, coords = image.resize_image_with_coords(img, target_size, target_size, coords)

    # The returned coordinates are multiplied by target_size below, i.e. they
    # are presumably normalised to [0, 1] -- confirm against the helper.
    for coord in coords:
        width = round(coord[2] * target_size) - round(coord[0] * target_size)
        height = round(coord[3] * target_size) - round(coord[1] * target_size)
        if width > 0 and height > 0:
            # Zero-area boxes carry no shape information and can make the IoU 0/0.
            bounding_boxes.append([width, height])

clusters, nearest_clusters, distances = kmeans(np.array(bounding_boxes), k, seed=int(time.time()))
import matplotlib.pyplot as plt
colors = ['peru', 'dodgerblue', 'turquoise', 'brown', 'red', 'lightsalmon', 'orange', 'springgreen' , 'orchid']
point_x = [list() for i in range(k)]
point_y = [list() for i in range(k)]

for index, cluster_index in enumerate(nearest_clusters):
    point_x[cluster_index].append(bounding_boxes[index][0])
    point_y[cluster_index].append(bounding_boxes[index][1])

for cluster_index in range(k):
    plt.scatter(point_x[cluster_index], point_y[cluster_index], color=colors[cluster_index])

# Order the anchors by area. ndarray.sort takes no key function, so the rows
# are reordered through argsort instead.
clusters = clusters[np.argsort(clusters[:, 0] * clusters[:, 1])]
plt.show()
print(clusters)

笔者聚类得到的结果为：[10, 11], [15, 28], [36, 22], [30, 60], [61, 125], [67, 46], [129, 88], [162, 211], [391, 336]

3. 多尺度训练：

以32为步长间隔，设置了 [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] 这十种尺度的输入图像分辨率，每隔10个epoch随机选取一种尺度进行训练。

YOLO_V3_Train.py

#---------------step0:Common Definition-------------
import torch
import random
from datetime import datetime
# Seed from the operating system / current time. Passing a datetime object to
# random.seed() is rejected with TypeError on Python 3.11 and later.
random.seed()

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    #torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

#torch.autograd.set_detect_anomaly(True)
# training hyper-parameters
batch_size = 16
lr = 1e-3
weight_decay = 5e-4
momentum = 0.9
pre_weight_file = "../PreTrain/darknet53_901.pth"  # pre-trained weights handed to YOLO.initialize_weights
class_num = 20  # shared by the datasets, the model and the loss
epoch_interval = 50  # a checkpoint is written every `epoch_interval` epochs
epoch_num = 200
num_workers = 4
min_val_loss = 9999999999  # sentinel: any real validation loss is smaller
# training image parameters
img_sizes = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]  # candidate input sizes for multi-scale training
base_img_size = 608  # anchors are rescaled relative to this size during multi-scale switching
anchor_boxes = [[7, 10], [14, 30], [23, 14], [30, 61], [46, 30], [61, 118], [98, 61], [148, 188], [350, 330]]
now_img_size = 416  # input size used until the first multi-scale switch
#---------------step1:Dataset-------------------
import torch
from COCO_DataSet import COCODataSet
from VOC_DataSet import VOCDataSet
train_dataSet = VOCDataSet(imgs_path="../DataSet/VOC2007+2012/Train/JPEGImages",annotations_path="../DataSet/VOC2007+2012/Train/Annotations",classes_file="../DataSet/VOC2007+2012/class.data", is_train=True, class_num=class_num)
val_dataSet = VOCDataSet(imgs_path="../DataSet/VOC2007+2012/Val/JPEGImages",annotations_path="../DataSet/VOC2007+2012/Val/Annotations",classes_file="../DataSet/VOC2007+2012/class.data", is_train=False, class_num=class_num)
# NOTE(review): the training loop rescales the anchors by input_size / base_img_size
# when it switches scale, but here the unscaled anchors are paired with 416 --
# confirm which convention setInputSize / setImgSize expect.
train_dataSet.setInputSize(now_img_size, anchor_boxes)
val_dataSet.setInputSize(now_img_size, anchor_boxes)
#dataSet = COCODataSet(imgs_path="../DataSet/COCO2017/Train/JPEGImages",txts_path="../DataSet/COCO2017/Train/Labels", class_num=80)

#---------------step2:Model-------------------
from YOLO_V3_Model import YOLO_V3
from model import set_freeze_by_idxs
# The model must predict the same number of classes as the dataset and the loss
# (both use `class_num`); it was previously hard-coded to 80.
YOLO = YOLO_V3(class_num=class_num).to(device=device)
YOLO.initialize_weights(pre_weight_file)
set_freeze_by_idxs(YOLO,[0, 1, 2, 3, 4])

#---------------step3:LossFunction-------------------
from YOLO_V3_LossFunction import YOLO_V3_Loss
loss_function = YOLO_V3_Loss(anchor_boxes=anchor_boxes, class_num=class_num).to(device=device)
loss_function.setImgSize(now_img_size, anchor_boxes)

#---------------step4:Optimizer-------------------
import torch.optim as optim
#optimizer_Adam = optim.Adam(YOLO.parameters(),lr=1e-4,weight_decay=0.005)
optimizer_SGD = optim.SGD(YOLO.parameters(),lr=lr,weight_decay=weight_decay, momentum=momentum)
# Optional cosine-annealing learning-rate schedules (currently disabled):
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer_Adam , T_max=20, eta_min=1e-4, last_epoch=-1)
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer_Adam, T_0=2, T_mult=2)
#--------------step5:Tensorboard Feature Map------------
import torch.nn as nn
import torchvision.utils as vutils
def feature_map_visualize(img_data, writer):
    """Log intermediate activations of ``YOLO`` as image grids.

    ``img_data`` gets a leading dimension added and is then passed, in
    ``YOLO.modules()`` order, through every module of one of the listed layer
    types. After each such module the output is written to ``writer`` under
    the tag ``feature_map_<module index>``.

    :param img_data: input tensor without the leading dimension
    :param writer: object exposing ``add_image(tag, img_tensor)``
    """
    visualized_types = (nn.Conv2d, nn.BatchNorm2d, nn.ReLU, nn.MaxPool2d, nn.AdaptiveAvgPool2d)

    img_data = img_data.unsqueeze(0)
    for i, m in enumerate(YOLO.modules()):
        if not isinstance(m, visualized_types):
            continue
        img_data = m(img_data)
        # Swap the first two dimensions so make_grid tiles one image per channel.
        img_grid = vutils.make_grid(img_data.transpose(0, 1), normalize=True, scale_each=True)
        writer.add_image('feature_map_' + str(i), img_grid)

#---------------step6:Train-------------------
from tqdm import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader

if __name__ == '__main__':
    # Entry point of the YOLOv3 detection training: one training pass and one
    # validation pass per epoch, periodic checkpoints, a random input-scale
    # switch every 10 epochs, and TensorBoard logging.
    import copy
    import os

    epoch = 0
    param_dict = {}
    # torch.save does not create missing directories.
    os.makedirs('./weights', exist_ok=True)
    writer = SummaryWriter(logdir='./log', filename_suffix=' [' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')

    # `<` (not `<=`): with `<=` one extra epoch ran after the final checkpoint.
    while epoch < epoch_num:

        epoch_train_loss = 0
        epoch_val_loss = 0
        epoch_train_iou = 0
        epoch_val_iou = 0
        epoch_train_object_num = 0
        epoch_val_object_num = 0
        epoch_train_loss_coord = 0
        epoch_val_loss_coord = 0
        epoch_train_loss_confidence = 0
        epoch_val_loss_confidence = 0
        epoch_train_loss_classes = 0
        epoch_val_loss_classes = 0

        # ------------------------------ train ------------------------------
        train_loader = DataLoader(train_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
        train_len = len(train_loader)
        YOLO.train()
        with tqdm(total=train_len) as tbar:
            for batch_index, batch_datas in enumerate(train_loader):
                optimizer_SGD.zero_grad()
                batch_datas = [data.to(device=device, non_blocking=True) for data in batch_datas]

                small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes = YOLO(batch_datas[0])
                # Element 0 is the image batch; elements 1..15 are the fifteen
                # target tensors the loss takes after the three predictions.
                loss = loss_function(small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes,
                                     *batch_datas[1:16])

                # loss[0] is back-propagated; loss[1..5] are accumulated as
                # coord / confidence / class loss, IoU sum and object count.
                batch_loss = loss[0]

                epoch_train_loss_coord = epoch_train_loss_coord + loss[1]
                epoch_train_loss_confidence = epoch_train_loss_confidence + loss[2]
                epoch_train_loss_classes = epoch_train_loss_classes + loss[3]
                epoch_train_iou = epoch_train_iou + loss[4]
                epoch_train_object_num = epoch_train_object_num + loss[5]

                batch_loss.backward()
                optimizer_SGD.step()

                batch_loss = batch_loss.item()
                epoch_train_loss = epoch_train_loss + batch_loss
                tbar.set_description(
                    "train: coord_loss:{} confidence_loss:{} class_loss:{} avg_iou:{}".format(round(loss[1], 4),
                                                                                              round(loss[2], 4),
                                                                                              round(loss[3], 4),
                                                                                              round(loss[4] / loss[5], 4)),
                    refresh=True)
                tbar.update(1)

                #feature_map_visualize(train_data[0][0], writer)
            print("train-batch-mean loss:{} coord_loss:{} confidence_loss:{} class_loss:{} iou:{}".format(round(epoch_train_loss / train_len, 4), round(epoch_train_loss_coord / train_len, 4), round(epoch_train_loss_confidence / train_len, 4), round(epoch_train_loss_classes / train_len, 4), round(epoch_train_iou / epoch_train_object_num, 4)))

        #lr_reduce_scheduler.step()

        # ---------------------------- validation ---------------------------
        val_loader = DataLoader(val_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
        val_len = len(val_loader)
        YOLO.eval()
        with tqdm(total=val_len) as tbar:
            with torch.no_grad():
                for batch_index, batch_datas in enumerate(val_loader):
                    # Same tensor handling as in training (no dtype cast).
                    # NOTE(review): the previous code cast every tensor to float
                    # here but not in training -- confirm that no cast is needed.
                    batch_datas = [data.to(device=device, non_blocking=True) for data in batch_datas]
                    small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes = YOLO(batch_datas[0])
                    # The loss takes fifteen target tensors; passing only six
                    # raised TypeError.
                    loss = loss_function(small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes,
                                         *batch_datas[1:16])
                    # Same scale as the training loss so the curves are comparable.
                    batch_loss = loss[0]
                    epoch_val_loss_coord = epoch_val_loss_coord + loss[1]
                    epoch_val_loss_confidence = epoch_val_loss_confidence + loss[2]
                    epoch_val_loss_classes = epoch_val_loss_classes + loss[3]
                    epoch_val_iou = epoch_val_iou + loss[4]
                    epoch_val_object_num = epoch_val_object_num + loss[5]
                    batch_loss = batch_loss.item()
                    epoch_val_loss = epoch_val_loss + batch_loss

                    tbar.set_description("val: coord_loss:{} confidence_loss:{} class_loss:{} iou:{}".format(round(loss[1], 4), round(loss[2], 4), round(loss[3], 4), round(loss[4] / loss[5], 4)), refresh=True)
                    tbar.update(1)

            print("val-batch-mean loss:{} coord_loss:{} confidence_loss:{} class_loss:{} iou:{}".format(round(epoch_val_loss / val_len, 4), round(epoch_val_loss_coord / val_len, 4), round(epoch_val_loss_confidence / val_len, 4), round(epoch_val_loss_classes / val_len, 4), round(epoch_val_iou / epoch_val_object_num, 4)))

        epoch = epoch + 1

        if min_val_loss > epoch_val_loss:
            min_val_loss = epoch_val_loss
            param_dict['min_val_loss'] = min_val_loss
            # state_dict() returns references to the live parameters; without a
            # deep copy this entry would silently follow later training steps.
            param_dict['min_loss_model'] = copy.deepcopy(YOLO.state_dict())

        if epoch % epoch_interval == 0:
            param_dict['model'] = YOLO.state_dict()
            param_dict['optim'] = optimizer_SGD
            param_dict['epoch'] = epoch
            torch.save(param_dict, './weights/YOLO_V1_PreTrain_' + str(epoch) + '.pth')

            # Second, smaller checkpoint; both file names are kept because
            # downstream code may load either of them.
            latest_checkpoint = {
                'model': YOLO.state_dict(),
                'optim': optimizer_SGD,
                'epoch': epoch,
            }
            torch.save(latest_checkpoint, './YOLO_V3_' + str(epoch) + '.pth')

            writer.close()
            writer = SummaryWriter(logdir='log', filename_suffix='[' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
        print("epoch : {} ; loss : {}".format(epoch, epoch_train_loss))

        # Multi-scale training: every 10 epochs pick a new input size and
        # rescale BOTH anchor dimensions relative to the base size.
        if epoch % 10 == 0:
            temp_input_size = random.choice(img_sizes)
            scale_factor = temp_input_size / base_img_size
            temp_anchors = [
                [round(anchor_box[0] * scale_factor), round(anchor_box[1] * scale_factor)]
                for anchor_box in anchor_boxes
            ]

            train_dataSet.setInputSize(temp_input_size, temp_anchors)
            val_dataSet.setInputSize(temp_input_size, temp_anchors)
            loss_function.setImgSize(temp_input_size, temp_anchors)

        for name, layer in YOLO.named_parameters():
            # Parameters that received no gradient (e.g. frozen layers) have grad None.
            if layer.grad is not None:
                writer.add_histogram(name + '_grad', layer.grad.cpu().data.numpy(), epoch)
            writer.add_histogram(name + '_data', layer.cpu().data.numpy(), epoch)

        writer.add_scalar('Train/Loss_sum', epoch_train_loss, epoch)
        writer.add_scalar('Train/Loss_coord', epoch_train_loss_coord, epoch)
        writer.add_scalar('Train/Loss_confidenct', epoch_train_loss_confidence, epoch)
        writer.add_scalar('Train/Loss_classes', epoch_train_loss_classes, epoch)
        writer.add_scalar('Train/Epoch_iou', epoch_train_iou / epoch_train_object_num, epoch)

        writer.add_scalar('Val/Loss_sum', epoch_val_loss, epoch)
        writer.add_scalar('Val/Loss_coord', epoch_val_loss_coord, epoch)
        writer.add_scalar('Val/Loss_confidenct', epoch_val_loss_confidence, epoch)
        writer.add_scalar('Val/Loss_classes', epoch_val_loss_classes, epoch)
        writer.add_scalar('Val/Epoch_iou', epoch_val_iou / epoch_val_object_num, epoch)

    writer.close()

4. Loss设计

A.正负样本的选取

YOLOv3与YOLOv1对于正负样本的选取是非常不同的，YOLOv1是在训练的过程中，如果某一个grid cell含有物体的中心，那么让这个grid cell所预测的两个bounding box中，那个与真实框拥有更大IoU的box来预测真实框。

然而YOLOv3的方案是在制作ground truth的时候就已经分配好正负样本了，具体步骤如下：对于每一个物体，在浅、中、深三个层次下会对应3种尺度的grid cell，每一个grid cell拥有3个anchor尺度，此时让这总共9个anchor与ground truth计算IoU(注意这里的IoU计算只考虑形状不考虑中心点，因此应该将anchor与gt左上角对齐或者是中心点对齐后再计算IoU，其实这两种对齐方式得到的IoU值是一样的)，将其分配给拥有最大IoU值的anchor，其他的如果IoU值大于阈值，则直接忽略。除了正样本和忽略样本以外的其他都是负样本。

B.损失函数

YOLOv3并没有官方的损失函数，这边贴一下大佬们整理的损失函数：

$\lambda_{coord}$ ：是定位损失的权重，类别和置信度损失的收敛是以定位准确为前提的，因此需要让网络优先学会定位。

$\lambda_{noobj}$ ：负样本置信度损失的权重，在YOLOv3中，由于引入了9种Anchor，以416为例，总的输出预测结果共有(13 × 13 + 26 × 26 + 52 × 52) × 3 = 10647个预测框，但是实际上一幅图片中真实物体是远没有这么多的，也就是说，正样本实际上远没有这么多，我们训练目标检测的最终目标是检出物体(对正样本的训练)而不是能够判断是不是背景(负样本)，因此不能让过多的负样本淹没了正样本。

$2-w_i*h_i$ ：此处的 $w_i$ 表示的是真实框的宽相对于整幅图像的相对值， $h_i$ 类似。这个权重是为了提高小物体占所有损失的权重，对于小物体来说， $2-w_i*h_i$ 比大物体大。

注意：笔者注意到网上很多地方都有人说，对于没有分配到正样本的预测框，让其向Anchor靠齐，笔者暂时理解不能，并提出一些问题，如有大佬解答将万分感谢

1.Anchor只是一个形状锚框，本身是不具有任何位置信息的，那么Anchor的中心点来自哪里？

2.我们知道最终的预测框是通过对Anchor进行畸变得到的，畸变公式为

$b_w=p_w*e^{t_w}$

那么向Anchor靠齐指的是 $t_w=0,t_h=0$ 吗？

3.笔者明白12800表明这种方案只用于网络迭代的初期，但是12800这个设定本身的争议性有多大？需要这么多次迭代让网络记忆Anchor吗？

YOLO_V3_Loss.py

import time
import torch.nn as nn
import math
import torch

class YOLO_V3_Loss(nn.Module):
    """YOLOv3 training loss summed over the small, middle and big scales.

    For every scale the loss is the sum of:

    * a coordinate loss on the positive samples (squared error on columns
      0..3, scaled by ``l_coord``; the width/height terms are additionally
      weighted per box by column 0 of the "positive modulus" tensor),
    * a confidence loss (squared error on column 4 of the positives, plus
      ``l_noobj`` times the squared error on the entries selected by the
      negative mask),
    * a classification loss (``BCEWithLogitsLoss`` on columns 5.. of the
      positives).

    ``setImgSize`` must be called before ``forward`` is used on data that
    contains positive samples, because the IoU statistic reads
    ``self.anchors_size``.
    """

    def __init__(self, anchor_boxes, small_downsample=8, middle_downsample=16, big_downsample=32, class_num=80, B=3, l_coord=50, l_noobj=0.5):
        """Store the loss hyper-parameters.

        Args:
            anchor_boxes: stored as-is on the instance; not read by this class.
            small_downsample: stride of the small-scale head (pixels per cell).
            middle_downsample: stride of the middle-scale head.
            big_downsample: stride of the big-scale head.
            class_num: number of classes; each box row has ``5 + class_num`` values.
            B: stored as-is on the instance; not read by this class.
            l_coord: weight of the coordinate loss of positive samples.
            l_noobj: weight of the confidence loss of negative samples.
        """
        super(YOLO_V3_Loss, self).__init__()
        self.B = B
        self.class_num = class_num
        self.l_coord = l_coord
        self.l_noobj = l_noobj
        self.anchor_boxes = anchor_boxes
        # The historical "downsmaple" spelling is kept so that any external
        # code reading these attributes keeps working.
        self.small_downsmaple = small_downsample
        self.middle_downsmaple = middle_downsample
        self.big_downsmaple = big_downsample

    def iou(self, predict_coord, ground_coord):
        """Return the IoU of two boxes stored as ``(xmin, ymin, xmax, ymax)``.

        Returns ``0`` when the boxes do not overlap or when their union has
        no area (degenerate boxes), instead of dividing by zero.
        """
        predict_area = (predict_coord[2] - predict_coord[0]) * (predict_coord[3] - predict_coord[1])
        ground_area = (ground_coord[2] - ground_coord[0]) * (ground_coord[3] - ground_coord[1])

        cross_left = max(predict_coord[0], ground_coord[0])
        cross_right = min(predict_coord[2], ground_coord[2])
        cross_top = max(predict_coord[1], ground_coord[1])
        cross_bottom = min(predict_coord[3], ground_coord[3])

        if cross_right < cross_left or cross_bottom < cross_top:  # no overlap
            return 0

        intersection = (cross_right - cross_left) * (cross_bottom - cross_top)
        union = predict_area + ground_area - intersection
        if union <= 0:  # degenerate boxes: avoid a division by zero
            return 0

        return intersection / union

    def _decoded_iou_sum(self, predict_positive, box_param, downsample):
        """Sum the IoU between every decoded positive prediction and its ground box.

        This is a monitoring statistic only; it is computed from Python
        scalars and does not take part in back-propagation.

        Args:
            predict_positive: ``(N, 5 + class_num)`` rows of positive predictions.
            box_param: ``(N, 6)`` rows; columns 1..4 are the ground box
                ``(xmin, ymin, xmax, ymax)`` and column 5 is the index into
                ``self.anchors_size``.
            downsample: stride of the scale the rows belong to.

        Returns:
            The IoU sum as a Python float.
        """
        total = 0.0
        for predicted, param in zip(predict_positive, box_param):
            ground_box = param[1:5]
            # Grid cell that contains the centre of the ground box.
            grid_x = int((ground_box[0] + ground_box[2]) / 2 / downsample)
            grid_y = int((ground_box[1] + ground_box[3]) / 2 / downsample)
            anchor_width, anchor_height = self.anchors_size[param[5].int().item()]

            center_x = (grid_x + predicted[0].item()) * downsample
            center_y = (grid_y + predicted[1].item()) * downsample
            # b_w = p_w * e^{t_w}, b_h = p_h * e^{t_h}
            width = anchor_width * math.exp(predicted[2].item())
            height = anchor_height * math.exp(predicted[3].item())

            predict_box = [round(center_x - width / 2),
                           round(center_y - height / 2),
                           round(center_x + width - width / 2),
                           round(center_y + height - height / 2)]
            # iou() may return a Python number or a 0-dim tensor; normalise
            # to float so the accumulated value is always a plain float.
            total += float(self.iou(predict_box, ground_box))
        return total

    def _scale_loss(self, predictions, ground_truth, positive_modulus, positive_mask, negative_mask, modulus_mask, downsample, batch_size, bce_loss):
        """Compute the loss terms of a single detection scale.

        Returns:
            ``(loss, coord, confidence, classes, iou_sum, object_count)`` where
            ``loss`` is a tensor (or ``0`` if no mask selected anything), the
            next three are Python numbers for logging, ``iou_sum`` is a float
            and ``object_count`` is the number of positive boxes.
        """
        row_len = 5 + self.class_num
        loss = 0
        coord_value = 0
        confidence_value = 0
        classes_value = 0
        iou_sum = 0.0

        # Reshape before counting so that the count is the number of boxes,
        # not the number of selected scalar elements.
        ground_positive = torch.masked_select(ground_truth, positive_mask).view([-1, row_len])
        object_count = len(ground_positive)

        if object_count > 0:
            predict_positive = torch.masked_select(predictions, positive_mask).view([-1, row_len])
            box_param = torch.masked_select(positive_modulus, modulus_mask).view([-1, 6])

            iou_sum = self._decoded_iou_sum(predict_positive, box_param, downsample)

            # Column 0 of box_param weights the width/height error per box
            # (presumably the 2 - w*h small-object weight built by the dataset).
            size_weight = box_param[:, 0]
            coord = self.l_coord * (
                torch.pow(ground_positive[:, 0:2] - predict_positive[:, 0:2], 2).sum() / batch_size
                + (torch.pow(ground_positive[:, 2] - predict_positive[:, 2], 2) * size_weight).sum() / batch_size
                + (torch.pow(ground_positive[:, 3] - predict_positive[:, 3], 2) * size_weight).sum() / batch_size
            )
            loss = loss + coord
            coord_value = coord_value + coord.item()

            confidence = torch.pow(ground_positive[:, 4] - predict_positive[:, 4], 2).sum() / batch_size
            loss = loss + confidence
            confidence_value = confidence_value + confidence.item()

            # Class scores are raw logits; the sigmoid is inside BCEWithLogitsLoss.
            classify = bce_loss(predict_positive[:, 5:], ground_positive[:, 5:])
            loss = loss + classify
            classes_value = classes_value + classify.item()

        # Negative samples only contribute to the confidence loss.
        ground_negative = torch.masked_select(ground_truth, negative_mask)
        if len(ground_negative) > 0:
            predict_negative = torch.masked_select(predictions, negative_mask)
            confidence = self.l_noobj * torch.pow(ground_negative - predict_negative, 2).sum() / batch_size
            loss = loss + confidence
            confidence_value = confidence_value + confidence.item()

        return loss, coord_value, confidence_value, classes_value, iou_sum, object_count

    def forward(self, samll_bounding_boxes, middle_bounding_boxes, big_bounding_boxes, small_ground_truth, small_positive_modulus, small_anchor_mark_positive, small_anchor_mark_negative, small_positive_modulus_mark, middle_ground_truth, middle_positive_modulus, middle_anchor_mark_positive, middle_anchor_mark_negative, middle_positive_modulus_mark, big_ground_truth, big_positive_modulus, big_anchor_mark_positive, big_anchor_mark_negative, big_positive_modulus_mark):
        """Compute the total loss over the three scales.

        For each scale (small / middle / big) the arguments are:

        * ``*_bounding_boxes``: network predictions.
        * ``*_ground_truth``: targets, same shape as the predictions.
        * ``*_positive_modulus``: per-box auxiliary data; after masking it is
          reshaped to rows of 6 values (size weight, ground box
          xmin/ymin/xmax/ymax, anchor index).
        * ``*_anchor_mark_positive`` / ``*_anchor_mark_negative``: boolean
          masks applied with ``torch.masked_select`` to both predictions and
          targets. The positive mask must select whole rows of
          ``5 + class_num`` values.
        * ``*_positive_modulus_mark``: boolean mask applied to
          ``*_positive_modulus``.

        Returns:
            ``(loss, loss_coord, loss_confidence, loss_classes, iou_sum,
            object_num)``. ``loss`` is the tensor to back-propagate; the three
            ``loss_*`` values are Python numbers for logging; ``iou_sum`` is
            the float sum of IoUs over all positive boxes and ``object_num``
            is the number of positive boxes.
        """
        # NOTE(review): this reads the size of dimension 1 of the predictions,
        # not dimension 0. If the tensors are laid out batch-first this is not
        # the batch size -- confirm against the caller. Kept unchanged because
        # it sets the scale of every loss term.
        batch_size = len(samll_bounding_boxes[0])
        bce_loss = nn.BCEWithLogitsLoss()

        scales = (
            (samll_bounding_boxes, small_ground_truth, small_positive_modulus,
             small_anchor_mark_positive, small_anchor_mark_negative,
             small_positive_modulus_mark, self.small_downsmaple),
            (middle_bounding_boxes, middle_ground_truth, middle_positive_modulus,
             middle_anchor_mark_positive, middle_anchor_mark_negative,
             middle_positive_modulus_mark, self.middle_downsmaple),
            (big_bounding_boxes, big_ground_truth, big_positive_modulus,
             big_anchor_mark_positive, big_anchor_mark_negative,
             big_positive_modulus_mark, self.big_downsmaple),
        )

        loss = 0
        loss_coord = 0
        loss_confidence = 0
        loss_classes = 0
        iou_sum = 0.0
        object_num = 0

        for predictions, ground_truth, modulus, positive_mask, negative_mask, modulus_mask, downsample in scales:
            scale_loss, coord, confidence, classes, scale_iou, scale_objects = self._scale_loss(
                predictions, ground_truth, modulus, positive_mask, negative_mask,
                modulus_mask, downsample, batch_size, bce_loss)
            loss = loss + scale_loss
            loss_coord = loss_coord + coord
            loss_confidence = loss_confidence + confidence
            loss_classes = loss_classes + classes
            iou_sum = iou_sum + scale_iou
            object_num = object_num + scale_objects

        return loss, loss_coord, loss_confidence, loss_classes, iou_sum, object_num

    def setImgSize(self, img_size, anchors_size):
        """Record the current input image size and the anchor sizes.

        ``anchors_size`` is indexed by the anchor index stored in column 5 of
        the positive-modulus rows and must yield ``(width, height)`` pairs.
        """
        self.img_size = img_size
        self.anchors_size = anchors_size

5.踩坑实况：

A.尝试使用ImageNet-mini进行预训练

离谱的是，网络迭代到后期直接发散了=-=

原因分析：ImageNet-mini数据集中的数据如下所示

原因分析如下：

1.全局池化层带来的信息损失：实际上这个类别对应的物体是那条鱼，然后我们这边进行预训练的模型是Darknet-53，很关键的一点在于，这个backbone含有全局池化层GAP，而这条鱼占图片的信息太少了，经过GAP之后信息损失太严重了，试想另一个类别也是被人拿在手里，经过全局池化后，这两幅图片保留的更多信息是人的信息，结果我们要将他们分配到两个不是人这个类别的细粒度分类里(分为不同的类别)，对网络来说是很困难的，因此也不容易训练。

2.数据量不足：ImageNet-mini是细粒度分类数据集，但是每一个类别只有30～50张左右的图片

验证猜想：使用VOC2007+2012数据集，结合标注的数据，拿出宽高均不小于200的bbox，resize到256尺度下后进行训练