1. 数据集:包括数据集选取与数据增强方案确定
笔者使用COCO2014数据集进行Darknet-53的预训练,使用VOC2007+2012的混合数据集进行目标检测的训练[有条件的同学当然也可以使用ImageNet数据集预训练,再用COCO2014进行目标检测的训练]。
笔者这么安排的目的在于,对骨干网络预训练的时候,我们需要训练的是提取特征的能力,因此数据越丰富越好,模型可以从丰富的数据中挖掘共性,提高泛化能力。而在进行目标检测的任务训练时,我们需要借助骨干网络抽取的特征来完成目标检测任务,此时是特定任务,数据集可以小一些,并且此时的数据不再只是单纯的分类数据,而是着重检测任务本身带有的bounding box与box对应的类别。
A.预训练部分
Darknet-53定义:
import torch
import torch.nn as nn
class CBL(nn.Module):
    """Convolution -> BatchNorm -> LeakyReLU(0.1) building block.

    The convolution is created without a bias term; the three layers are
    stored, in that order, in ``self.conv``.
    """

    def __init__(self, in_channels, out_channels, kernal_size, stride, padding, inplace=True):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels, out_channels, kernal_size, stride, padding, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1, inplace=inplace),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the three layers to ``x`` and return the result."""
        return self.conv(x)

    def weight_init(self):
        """Kaiming-normal init for conv weights; BatchNorm gets weight 1 and bias 0."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()
            elif isinstance(layer, nn.Conv2d):
                torch.nn.init.kaiming_normal_(layer.weight.data)
class ResUnit(nn.Module):
    """Residual unit: two stacked CBL blocks added to a 1x1-conv shortcut.

    The shortcut is always a 1x1 convolution (stride 1, no padding) mapping
    ``in_channels`` to ``out_channels``; its output is summed element-wise
    with the output of the two CBL blocks.
    """

    def __init__(self, in_channels, out_channels, kernal_size = 3, stride = 1, padding = 1):
        super().__init__()
        first = CBL(in_channels, out_channels, kernal_size, stride, padding)
        second = CBL(out_channels, out_channels, kernal_size, stride, padding)
        self.conv_feature = nn.Sequential(first, second)
        self.conv_redisual = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Return ``conv_feature(x) + conv_redisual(x)``."""
        shortcut = self.conv_redisual(x)
        features = self.conv_feature(x)
        return torch.add(features, shortcut)

    def weight_init(self):
        """Initialise nested CBL blocks via their own method, plain convs with Kaiming-normal."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                torch.nn.init.kaiming_normal_(module.weight.data)
            elif isinstance(module, CBL):
                module.weight_init()
class ResX(nn.Module):
    """A CBL block followed by a ResUnit, applied in sequence.

    The ``*_1`` arguments configure the leading CBL, the ``*_2`` arguments the
    ResUnit that consumes its ``out_channels_1`` output.
    """

    def __init__(self, in_channels, out_channels_1, kernal_size_1, stride_1, padding_1, out_channels_2, kernal_size_2, stride_2, padding_2):
        super().__init__()
        # Marked "down sample" by the original author.
        leading = CBL(in_channels, out_channels_1, kernal_size_1, stride_1, padding_1)
        residual = ResUnit(out_channels_1, out_channels_2, kernal_size_2, stride_2, padding_2)
        self.conv = nn.Sequential(leading, residual)

    def forward(self, x):
        """Run ``x`` through the CBL and then the ResUnit."""
        return self.conv(x)

    def weight_init(self):
        """Delegate initialisation to every nested CBL and ResUnit."""
        for module in self.modules():
            if isinstance(module, (CBL, ResUnit)):
                module.weight_init()
class DarkNet53(nn.Module):
    """Classification backbone assembled from CBL and ResX blocks.

    Layout: a two-CBL stem (3 -> 32 -> 64 channels, the second with stride 2),
    one ResX at 64 channels, then four stages of a stride-2 CBL followed by
    2 / 8 / 8 / 4 ResX blocks at 128 / 256 / 512 / 1024 channels, global
    average pooling and a linear layer producing ``class_num`` outputs.
    """

    def __init__(self, class_num):
        super().__init__()
        self.conv_pre = nn.Sequential(
            CBL(3, 32, 3, 1, 1),
            CBL(32, 64, 3, 2, 1),
        )
        self.Res_1_64 = ResX(64, 32, 1, 1, 0, 64, 3, 1, 1)
        self.Res_2_128 = self._stage(64, 128, 2)
        self.Res_8_256 = self._stage(128, 256, 8)
        self.Res_8_512 = self._stage(256, 512, 8)
        self.Res_4_1024 = self._stage(512, 1024, 4)
        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
        self.predict = nn.Linear(1024, class_num)

    @staticmethod
    def _stage(in_channels, out_channels, repeats):
        """Build a stride-2 CBL followed by ``repeats`` identical ResX blocks."""
        blocks = [CBL(in_channels, out_channels, 3, 2, 1)]
        for _ in range(repeats):
            blocks.append(ResX(out_channels, out_channels // 2, 1, 1, 0, out_channels, 3, 1, 1))
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Return the ``class_num`` outputs of the final linear layer for a batch of images."""
        stages = (
            self.conv_pre,
            self.Res_1_64,
            self.Res_2_128,
            self.Res_8_256,
            self.Res_8_512,
            self.Res_4_1024,
            self.global_pooling,
        )
        for stage in stages:
            x = stage(x)
        # (N, 1024, 1, 1) -> (N, 1024) before the classifier.
        x = torch.flatten(x, start_dim=1, end_dim=3)
        return self.predict(x)

    def weight_init(self):
        """ResX blocks initialise themselves; conv and linear weights get Kaiming-normal."""
        for module in self.modules():
            if isinstance(module, ResX):
                module.weight_init()
            elif isinstance(module, (nn.Conv2d, nn.Linear)):
                torch.nn.init.kaiming_normal_(module.weight.data)
B.COCO数据集用于分类,定义数据集类
coco_classify.py
import cv2
import os
import time
import random
import imagesize
import numpy as np
from utils import image
from torch.utils.data import Dataset
import torchvision.transforms as transforms
class coco_classify(Dataset):
    """Classification dataset made of object crops from detection labels.

    Every label file in ``txts_path`` holds one object per line as
    ``xmin ymin xmax ymax class_id`` (whitespace separated). Objects whose
    class id is ``>= class_num`` or whose width/height is below
    ``edge_threshold`` pixels are dropped; images left with no object are
    skipped. Each item is one randomly chosen object crop of the image,
    resized to ``input_size`` and normalised, together with its class id.
    """

    def __init__(self, imgs_path="../DataSet/COCO2017/Train/Images", txts_path="../DataSet/COCO2017/Train/Labels", is_train=True, edge_threshold=200, class_num=80, input_size=256):  # input_size: side length of the network input
        super().__init__()
        self.is_train = is_train
        self.input_size = input_size
        self.transform_common = transforms.Compose([
            transforms.ToTensor(),  # height * width * channel -> channel * height * width
            # Normalised inputs are less prone to exploding gradients.
            transforms.Normalize(mean=(0.408, 0.448, 0.471), std=(0.242, 0.239, 0.234)),
        ])
        self.train_data = []  # entries: [img_path, [[xmin, ymin, xmax, ymax, class_id], ...]]
        for label_name in os.listdir(txts_path):
            img_path = os.path.join(imgs_path, label_name.replace(".txt", ".jpg"))
            coords = []
            with open(os.path.join(txts_path, label_name), 'r') as label_txt:
                for line in label_txt:
                    fields = line.split()
                    if len(fields) < 5:
                        # Blank or truncated line: nothing to parse.
                        continue
                    class_id = int(fields[4])
                    if class_id >= class_num:
                        continue
                    # Clamp to the image origin: a negative start index would
                    # make the crop slice wrap around instead of clipping.
                    xmin = max(0, round(float(fields[0])))
                    ymin = max(0, round(float(fields[1])))
                    xmax = round(float(fields[2]))
                    ymax = round(float(fields[3]))
                    if (xmax - xmin) < edge_threshold or (ymax - ymin) < edge_threshold:
                        continue
                    coords.append([xmin, ymin, xmax, ymax, class_id])
            if coords:
                self.train_data.append([img_path, coords])

    @staticmethod
    def _scale_hsv_channel(img, channel, factor):
        """Multiply one HSV channel of a BGR image by ``factor``, saturating at 255.

        Assumes ``img`` is an 8-bit BGR image (what ``cv2.imread`` yields) --
        TODO confirm the resize helper keeps that dtype.
        """
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        scaled = hsv[..., channel].astype(np.float32) * factor
        # Clip before casting: a plain uint8 cast would wrap bright pixels to dark.
        hsv[..., channel] = np.clip(scaled, 0, 255).astype(np.uint8)
        return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

    def __getitem__(self, item):
        """Return ``(image_tensor, class_index)`` for a random object of image ``item``."""
        img_path, coords = self.train_data[item]
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError("cannot read image: {}".format(img_path))
        # The RNG is deliberately NOT reseeded here: reseeding from the
        # wall-clock second made every sample fetched within the same second
        # draw identical "random" choices.
        xmin, ymin, xmax, ymax, class_index = random.choice(coords)
        img = img[ymin: ymax, xmin: xmax]
        img = image.resize_image_without_annotation(img, self.input_size, self.input_size)
        if self.is_train:
            transform_seed = random.randint(0, 2)
            if transform_seed == 1:
                # Brightness ("exposure" in the YOLO paper): scale V.
                img = self._scale_hsv_channel(img, 2, 1.5)
            elif transform_seed == 2:
                # Saturation: scale S.
                img = self._scale_hsv_channel(img, 1, 1.5)
            # transform_seed == 0 keeps the resized crop unchanged.
        img = self.transform_common(img)
        return img, class_index

    def __len__(self):
        """Number of images that kept at least one usable object."""
        return len(self.train_data)
C.训练
Darknet_Pre_Train.py
#---------------step0:Common Definition-----------------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Train on the first CUDA device when available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Let cuDNN benchmark convolution algorithms.
    torch.backends.cudnn.benchmark = True

# Optimiser hyper-parameters.
lr = 3e-4
momentum = 0.9
weight_decay = 5e-4

# Data settings.
img_size = 256
batch_size = 16
class_num = 80
num_workers = 4

# Training schedule and bookkeeping.
epoch_num = 1000
epoch_interval = 1  # checkpoint every `epoch_interval` epochs
min_val_loss = 9999999999  # best validation loss seen so far
def accuracy(output, target, topk=(1, 5)):
    """Compute top-k accuracy for each k in ``topk``.

    :param output: score tensor indexed as (sample, class)
    :param target: tensor of ground-truth class indices, one per sample
    :param topk: iterable of k values to evaluate
    :return: list with one single-element tensor per k, holding the fraction
             (0..1) of samples whose target is among the k highest scores
    """
    sample_count = target.size(0)
    largest_k = max(topk)
    _, top_indices = output.topk(largest_k, 1, True, True)
    # (sample, k) -> (k, sample) so that row r holds every sample's rank-r guess.
    top_indices = top_indices.t()
    hits = top_indices.eq(target.view(1, -1).expand_as(top_indices)).contiguous()
    return [
        hits[:k].view(-1).float().sum(0, keepdim=True) / sample_count
        for k in topk
    ]
#---------------step1:Dataset-------------------
#from ImageNet_DataSet import ImageNetMini
#dataSet = ImageNetMini(dataSetDir="../DataSet/imagenet-mini/train",classesFilePath="../DataSet/imagenet-mini/classDict.pth", img_size=256)
#from VOC_Classify import voc_classify
#dataSet = voc_classify(imgs_path="../DataSet/VOC2007+2012/Train/JPEGImages", annotations_path="../DataSet/VOC2007+2012/Train/Annotations", classes_file="../DataSet/VOC2007+2012/Train/class.data")
from COCO_Classify import coco_classify
# Training and validation datasets; only the training split enables augmentation (is_train=True).
train_dataSet = coco_classify(imgs_path="../DataSet/COCO2017/Train/Images", txts_path= "../DataSet/COCO2017/Train/Labels", is_train=True)
val_dataSet = coco_classify(imgs_path="../DataSet/COCO2017/Val/Images", txts_path= "../DataSet/COCO2017/Val/Labels", is_train=False)
#---------------step2:Model-------------------
from DarkNet53 import DarkNet53
# Build the backbone on the selected device, then apply its custom weight initialisation.
darkNet53 = DarkNet53(class_num=class_num).to(device=device)
darkNet53.weight_init()
#---------------step3:LossFunction-------------------
loss_function = nn.CrossEntropyLoss().to(device=device)
#---------------step4:Optimizer-------------------
import torch.optim as optim
#optimizer_Adam = optim.Adam(darkNet53.parameters(),lr=lr,weight_decay=weight_decay)
# SGD with momentum and weight decay is the optimiser actually used; the Adam line above is disabled.
optimizer = optim.SGD(darkNet53.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
# Cosine-annealing learning-rate schedules (currently disabled).
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer_Adam , T_max=20, eta_min=1e-4, last_epoch=-1)
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer_Adam, T_0=2, T_mult=2)
#--------------step5:Tensorboard Feature Map------------
import torchvision.utils as vutils
def feature_map_visualize(img_data, writer):
    """Log the feature maps of darkNet53's basic layers for one image.

    The previous version pushed the activation through the flattened
    ``darkNet53.modules()`` list one layer after another. That ignores the
    residual topology (the 1x1 shortcut convolutions receive a tensor with the
    wrong channel count) and never matched the LeakyReLU layers the network
    actually uses. This version runs one real forward pass and captures each
    layer's output with forward hooks, so every logged map is what the layer
    really produces.

    :param img_data: a single image without batch dimension; a batch axis is
                     added here. Assumes a (3, H, W) tensor on the same device
                     as the model -- TODO confirm against the caller.
    :param writer: object exposing ``add_image(tag, image)`` (e.g. a
                   tensorboardX ``SummaryWriter``)
    """
    traced_types = (nn.Conv2d, nn.BatchNorm2d, nn.ReLU, nn.LeakyReLU,
                    nn.MaxPool2d, nn.AdaptiveAvgPool2d)

    def make_hook(tag):
        def hook(module, inputs, output):
            # (1, C, H, W) -> (C, 1, H, W): one grayscale tile per channel.
            channels = output.detach().transpose(0, 1)
            grid = vutils.make_grid(channels, normalize=True, scale_each=True)
            writer.add_image(tag, grid)
        return hook

    # Tags keep the original numbering: the index of the layer in modules().
    handles = [
        module.register_forward_hook(make_hook('feature_map_' + str(index)))
        for index, module in enumerate(darkNet53.modules())
        if isinstance(module, traced_types)
    ]
    was_training = darkNet53.training
    # Eval mode so the visualisation pass does not touch BatchNorm statistics.
    darkNet53.eval()
    try:
        with torch.no_grad():
            darkNet53(img_data.unsqueeze(0))
    finally:
        for handle in handles:
            handle.remove()
        darkNet53.train(was_training)
#---------------step6:Train-------------------
from tqdm import tqdm
from tensorboardX import SummaryWriter
if __name__ == "__main__":
    import os  # local import: only needed to create the checkpoint directory

    def run_epoch(loader, training):
        """Run one pass over ``loader`` and return the epoch's summed loss.

        With ``training=True`` the model is put in train mode and the
        optimiser is stepped after every batch; otherwise the model is put in
        eval mode and gradients are disabled. Each batch contributes
        ``loss * (number of samples in the batch)`` to the returned sum.
        """
        phase = "train" if training else "val"
        darkNet53.train(training)
        loss_sum = 0
        top1_sum = 0
        top5_sum = 0
        with tqdm(total=len(loader)) as tbar, torch.set_grad_enabled(training):
            for batch in loader:
                inputs = batch[0].float().to(device=device, non_blocking=True)
                labels = batch[1].long().to(device=device, non_blocking=True)
                net_out = darkNet53(inputs)
                loss = loss_function(net_out, labels)
                if training:
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                # Weight by the real batch size: the last batch may be short.
                loss_sum = loss_sum + loss.item() * labels.size(0)
                # Accuracy of this batch.
                top1_acc, top5_acc = (acc.item() for acc in accuracy(net_out.detach(), labels))
                top1_sum = top1_sum + top1_acc
                top5_sum = top5_sum + top5_acc
                # refresh belongs to set_description(), not to str.format().
                tbar.set_description(
                    "{}: class_loss:{} top1-acc:{} top5-acc:{}".format(
                        phase, loss.item(), round(top1_acc, 4), round(top5_acc, 4)),
                    refresh=True)
                tbar.update(1)
        batches = max(len(loader), 1)  # avoid dividing by zero on an empty loader
        print("{}-mean: batch_loss:{} batch_top1_acc:{} batch_top5_acc:{}".format(
            phase, round(loss_sum / batches, 4), round(top1_sum / batches, 4),
            round(top5_sum / batches, 4)))
        return loss_sum

    epoch = 0
    param_dict = {}
    writer = SummaryWriter(logdir='./log', filename_suffix=' [' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
    # The loaders do not change between epochs, so build them once.
    train_loader = DataLoader(dataset=train_dataSet, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=True)
    val_loader = DataLoader(dataset=val_dataSet, batch_size=batch_size, shuffle=False,
                            num_workers=num_workers, pin_memory=True)
    while epoch < epoch_num:
        epoch_train_loss = run_epoch(train_loader, training=True)
        # lr_reduce_scheduler.step()
        epoch_val_loss = run_epoch(val_loader, training=False)
        epoch = epoch + 1
        if min_val_loss > epoch_val_loss:
            min_val_loss = epoch_val_loss
            param_dict['min_val_loss'] = min_val_loss
            # state_dict() hands out references to the live parameters; clone
            # them, otherwise the "best" snapshot keeps changing as training
            # continues and always equals the current model.
            param_dict['min_loss_model'] = {
                key: value.detach().clone() for key, value in darkNet53.state_dict().items()
            }
        if epoch % epoch_interval == 0:
            param_dict['model'] = darkNet53.state_dict()
            # NOTE(review): the optimiser object itself is pickled; kept as-is
            # because the code that loads these checkpoints is not visible here.
            param_dict['optim'] = optimizer
            param_dict['epoch'] = epoch
            os.makedirs('./weights', exist_ok=True)
            torch.save(param_dict, './weights/Darknet-53_' + str(epoch) + '.pth')
            # Start a new event file for the next interval.
            writer.close()
            writer = SummaryWriter(logdir='log', filename_suffix='[' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
        print("epoch : {} ; train-loss : {}".format(epoch, epoch_train_loss))
        for name, layer in darkNet53.named_parameters():
            if 'bn' not in name:
                # These are parameter values (gradients were already cleared
                # by zero_grad()), so they are tagged '_data', not '_grad'.
                writer.add_histogram(name + '_data', layer, epoch)
        writer.add_scalar('Train/Loss_sum', epoch_train_loss, epoch)
        writer.add_scalar('Val/Loss_sum', epoch_val_loss, epoch)
    writer.close()
2. 聚类得到Anchor尺度
A.k-means
anchor_k_means.py
import numpy as np
def iou(cluster, boxes):
    """IoU between one (width, height) box and every row of ``boxes``.

    Only shapes are compared: the boxes are treated as sharing a corner, so
    the intersection is ``min(w) * min(h)``.

    :param cluster: sequence ``(width, height)``
    :param boxes: numpy array of shape (r, 2) holding widths and heights
    :return: numpy array of shape (r,) with the IoU of each row
    """
    area_cluster = cluster[0] * cluster[1]
    area_boxes = boxes[:, 0] * boxes[:, 1]
    area_inter = np.minimum(cluster[0], boxes[:, 0]) * np.minimum(cluster[1], boxes[:, 1])
    return area_inter / (area_cluster + area_boxes - area_inter)


def kmeans(boxes, k, dist=np.median, seed=1):
    """Cluster box shapes with k-means using ``1 - IoU`` as the distance.

    :param boxes: numpy array of shape (r, 2), one (width, height) per row
    :param k: number of clusters
    :param dist: function ``f(members, axis=0)`` that reduces the members of a
                 cluster to its new centre (median by default). Despite the
                 name, this is not the distance metric.
    :param seed: seed passed to ``np.random.seed`` before the initial centres
                 are drawn
    :return: ``(clusters, nearest_clusters, distances)``: the (k, 2) centres,
             the (r,) index of the nearest centre per box, and the (r, k)
             matrix of ``1 - IoU`` distances
    :raises ValueError: if ``k`` exceeds the number of boxes (raised by
                        ``np.random.choice``)

    NOTE: the centres keep the dtype of ``boxes``, so integer input truncates
    fractional centres.
    """
    rows = boxes.shape[0]  # number of samples
    # distances[row][c]: distance from sample `row` to cluster centre `c`.
    distances = np.empty((rows, k))
    # None means "no previous assignment". A zero-filled array here would be
    # mistaken for a converged state whenever every sample starts in cluster 0.
    last_clusters = None
    np.random.seed(seed)
    # Start from k distinct samples (fancy indexing copies, so `boxes` is never modified).
    clusters = boxes[np.random.choice(rows, k, replace=False)]
    while True:
        # Assign every sample to its nearest centre.
        for icluster in range(k):
            distances[:, icluster] = 1 - iou(clusters[icluster], boxes)
        nearest_clusters = np.argmin(distances, axis=1)
        # Stop once the assignment no longer changes.
        if last_clusters is not None and (last_clusters == nearest_clusters).all():
            break
        # Move every centre to the reduction (median by default) of its members.
        for cluster in range(k):
            members = boxes[nearest_clusters == cluster]
            if len(members) > 0:
                clusters[cluster] = dist(members, axis=0)
            # An empty cluster keeps its previous centre; reducing an empty
            # selection would yield NaN and poison every later distance.
        last_clusters = nearest_clusters
    return clusters, nearest_clusters, distances
import os
import cv2
import time
import image
# Cluster the (width, height) of every labelled box, after resizing each image
# to target_size x target_size, into k anchor shapes.
target_size = 608
k = 9
txts_path = "../DataSet/COCO2017/Train/Labels"
imgs_path = "../DataSet/COCO2017/Train/Images"
txts_name = os.listdir(txts_path)
bounding_boxes = []
for txt_name in txts_name:
    img_path = os.path.join(imgs_path, txt_name.replace(".txt", ".jpg"))
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None for a missing or unreadable file.
        print("skip unreadable image: {}".format(img_path))
        continue
    coords = []
    with open(os.path.join(txts_path, txt_name), 'r') as file:
        for line_context in file:
            line_context = line_context.split()
            if len(line_context) < 5:
                # Blank or truncated label line.
                continue
            class_id = int(line_context[4])
            xmin = round(float(line_context[0]))
            ymin = round(float(line_context[1]))
            xmax = round(float(line_context[2]))
            ymax = round(float(line_context[3]))
            coords.append([xmin, ymin, xmax, ymax, class_id])
    img, coords = image.resize_image_with_coords(img, target_size, target_size, coords)
    for coord in coords:
        # The coordinates are multiplied by target_size here, so the helper
        # presumably returns them relative to the resized image -- TODO confirm.
        coord[0] = round(coord[0] * target_size)
        coord[1] = round(coord[1] * target_size)
        coord[2] = round(coord[2] * target_size)
        coord[3] = round(coord[3] * target_size)
        box = [coord[2] - coord[0], coord[3] - coord[1]]
        if box[0] <= 0 or box[1] <= 0:
            # A zero-area box can produce a 0/0 IoU during clustering.
            continue
        bounding_boxes.append(box)
if len(bounding_boxes) < k:
    raise ValueError("need at least {} boxes to form {} clusters, found {}".format(k, k, len(bounding_boxes)))
clusters, nearest_clusters, distances = kmeans(np.array(bounding_boxes), k, seed=int(time.time()))
import matplotlib.pyplot as plt
colors = ['peru', 'dodgerblue', 'turquoise', 'brown', 'red', 'lightsalmon', 'orange', 'springgreen' , 'orchid']
# Group the box sizes by assigned cluster and draw one colour per cluster.
point_x = [list() for i in range(k)]
point_y = [list() for i in range(k)]
for index in range(len(nearest_clusters)):
    point_x[nearest_clusters[index]].append(bounding_boxes[index][0])
    point_y[nearest_clusters[index]].append(bounding_boxes[index][1])
for cluster_index in range(k):
    plt.scatter(point_x[cluster_index], point_y[cluster_index], color=colors[cluster_index])
# Order the anchors by area, smallest first. ndarray.sort() takes no key
# function, so reorder the rows with argsort instead.
clusters = clusters[np.argsort(clusters[:, 0] * clusters[:, 1])]
plt.show()
print(clusters)
笔者聚类得到的结果为:[10, 11], [15, 28], [36, 22], [30, 60], [61, 125], [67, 46], [129, 88], [162, 211], [391, 336]
3. 多尺度训练:
以32为步长间隔,设置了 [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] 这十种尺度的输入图像分辨率,每隔10个epoch随机选取一种尺度进行训练。
YOLO_V3_Train.py
#---------------step0:Common Definition-------------
import torch
import random
from datetime import datetime
# Seed the RNG from the wall clock. random.seed() accepts only None, int,
# float, str, bytes or bytearray; passing a datetime object raises TypeError
# on Python 3.11+, so the float timestamp is used instead.
random.seed(datetime.now().timestamp())
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    #torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")
#torch.autograd.set_detect_anomaly(True)
# training hyper-parameters
batch_size = 16
lr = 1e-3
weight_decay = 5e-4
momentum = 0.9
pre_weight_file = "../PreTrain/darknet53_901.pth"
class_num = 20
epoch_interval = 50
epoch_num = 200
num_workers = 4
min_val_loss = 9999999999
# training image parameters
img_sizes = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
base_img_size = 608  # the anchors below are expressed at this reference size
anchor_boxes = [[7, 10], [14, 30], [23, 14], [30, 61], [46, 30], [61, 118], [98, 61], [148, 188], [350, 330]]
now_img_size = 416
#---------------step1:Dataset-------------------
import torch
from COCO_DataSet import COCODataSet
from VOC_DataSet import VOCDataSet
train_dataSet = VOCDataSet(imgs_path="../DataSet/VOC2007+2012/Train/JPEGImages",annotations_path="../DataSet/VOC2007+2012/Train/Annotations",classes_file="../DataSet/VOC2007+2012/class.data", is_train=True, class_num=class_num)
val_dataSet = VOCDataSet(imgs_path="../DataSet/VOC2007+2012/Val/JPEGImages",annotations_path="../DataSet/VOC2007+2012/Val/Annotations",classes_file="../DataSet/VOC2007+2012/class.data", is_train=False, class_num=class_num)
# NOTE(review): anchor_boxes are declared relative to base_img_size (608) but
# are passed unscaled for now_img_size (416) here, whereas the multi-scale
# switch in the training loop rescales them -- confirm which is intended.
train_dataSet.setInputSize(now_img_size, anchor_boxes)
val_dataSet.setInputSize(now_img_size, anchor_boxes)
#dataSet = COCODataSet(imgs_path="../DataSet/COCO2017/Train/JPEGImages",txts_path="../DataSet/COCO2017/Train/Labels", class_num=80)
#---------------step2:Model-------------------
from YOLO_V3_Model import YOLO_V3
from model import set_freeze_by_idxs
# The model must predict the same number of classes as the dataset and the
# loss (class_num); the previous hard-coded 80 did not match the
# (5 + class_num)-wide vectors the loss reshapes predictions into.
YOLO = YOLO_V3(class_num=class_num).to(device=device)
YOLO.initialize_weights(pre_weight_file)
set_freeze_by_idxs(YOLO,[0, 1, 2, 3, 4])
#---------------step3:LossFunction-------------------
from YOLO_V3_LossFunction import YOLO_V3_Loss
loss_function = YOLO_V3_Loss(anchor_boxes=anchor_boxes, class_num=class_num).to(device=device)
loss_function.setImgSize(now_img_size, anchor_boxes)
#---------------step4:Optimizer-------------------
import torch.optim as optim
#optimizer_Adam = optim.Adam(YOLO.parameters(),lr=1e-4,weight_decay=0.005)
optimizer_SGD = optim.SGD(YOLO.parameters(),lr=lr,weight_decay=weight_decay, momentum=momentum)
# Cosine-annealing learning-rate schedules (currently disabled).
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer_Adam , T_max=20, eta_min=1e-4, last_epoch=-1)
#lr_reduce_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer_Adam, T_0=2, T_mult=2)
#--------------step5:Tensorboard Feature Map------------
import torch.nn as nn
import torchvision.utils as vutils
def feature_map_visualize(img_data, writer):
    """Log the feature maps of YOLO's basic layers for one image.

    The previous version pushed the activation through the flattened
    ``YOLO.modules()`` list one layer after another, which ignores how the
    layers are actually wired together. This version runs one real forward
    pass and captures each layer's output with forward hooks, so every logged
    map is what the layer really produces.

    :param img_data: a single image without batch dimension; a batch axis is
                     added here. Assumes a (3, H, W) tensor on the same device
                     as the model -- TODO confirm against the caller.
    :param writer: object exposing ``add_image(tag, image)`` (e.g. a
                   tensorboardX ``SummaryWriter``)
    """
    traced_types = (nn.Conv2d, nn.BatchNorm2d, nn.ReLU, nn.LeakyReLU,
                    nn.MaxPool2d, nn.AdaptiveAvgPool2d)

    def make_hook(tag):
        def hook(module, inputs, output):
            # (1, C, H, W) -> (C, 1, H, W): one grayscale tile per channel.
            channels = output.detach().transpose(0, 1)
            grid = vutils.make_grid(channels, normalize=True, scale_each=True)
            writer.add_image(tag, grid)
        return hook

    # Tags keep the original numbering: the index of the layer in modules().
    handles = [
        module.register_forward_hook(make_hook('feature_map_' + str(index)))
        for index, module in enumerate(YOLO.modules())
        if isinstance(module, traced_types)
    ]
    was_training = YOLO.training
    # Eval mode so the visualisation pass does not touch BatchNorm statistics.
    YOLO.eval()
    try:
        with torch.no_grad():
            YOLO(img_data.unsqueeze(0))
    finally:
        for handle in handles:
            handle.remove()
        YOLO.train(was_training)
#---------------step6:Train-------------------
from tqdm import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader
if __name__ == '__main__':
    import os  # local import: only needed to create the checkpoint directory

    # Progress-bar templates, one per phase (the wording differs slightly).
    bar_templates = {
        "train": "train: coord_loss:{} confidence_loss:{} class_loss:{} avg_iou:{}",
        "val": "val: coord_loss:{} confidence_loss:{} class_loss:{} iou:{}",
    }

    def mean_iou(iou_sum, object_num):
        """Average IoU, or 0.0 when there was no positive sample at all."""
        return iou_sum / object_num if object_num else 0.0

    def run_epoch(loader, training):
        """Run one pass over ``loader`` and return the accumulated statistics.

        The loss function returns a sequence that is indexed here as
        [0] total loss (tensor), [1] coordinate loss, [2] confidence loss,
        [3] class loss, [4] IoU sum, [5] positive-sample count. Training and
        validation use the same call and the same scaling so that their
        numbers are comparable.
        """
        phase = "train" if training else "val"
        YOLO.train(training)
        totals = {"loss": 0, "coord": 0, "confidence": 0, "classes": 0, "iou": 0, "objects": 0}
        with tqdm(total=len(loader)) as tbar, torch.set_grad_enabled(training):
            for batch_datas in loader:
                if training:
                    optimizer_SGD.zero_grad()
                # No dtype cast here: a blanket .float() would also convert the
                # mask tensors the loss passes to torch.masked_select.
                batch_datas = [data.to(device=device, non_blocking=True) for data in batch_datas]
                small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes = YOLO(batch_datas[0])
                # Elements 1..15 are the five target tensors of each of the
                # three scales, in the order the loss's forward() declares them.
                loss = loss_function(small_bounding_boxes, middle_bounding_boxes, big_bounding_boxes,
                                     *batch_datas[1:16])
                if training:
                    loss[0].backward()
                    optimizer_SGD.step()
                totals["loss"] = totals["loss"] + loss[0].item()
                totals["coord"] = totals["coord"] + loss[1]
                totals["confidence"] = totals["confidence"] + loss[2]
                totals["classes"] = totals["classes"] + loss[3]
                totals["iou"] = totals["iou"] + loss[4]
                totals["objects"] = totals["objects"] + loss[5]
                tbar.set_description(
                    bar_templates[phase].format(round(loss[1], 4), round(loss[2], 4), round(loss[3], 4),
                                                round(mean_iou(loss[4], loss[5]), 4)),
                    refresh=True)
                tbar.update(1)
                #feature_map_visualize(train_data[0][0], writer)
        batches = max(len(loader), 1)  # avoid dividing by zero on an empty loader
        print("{}-batch-mean loss:{} coord_loss:{} confidence_loss:{} class_loss:{} iou:{}".format(
            phase,
            round(totals["loss"] / batches, 4),
            round(totals["coord"] / batches, 4),
            round(totals["confidence"] / batches, 4),
            round(totals["classes"] / batches, 4),
            round(mean_iou(totals["iou"], totals["objects"]), 4)))
        return totals

    epoch = 0
    param_dict = {}
    writer = SummaryWriter(logdir='./log', filename_suffix=' [' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
    # `<` rather than `<=`: run exactly epoch_num epochs.
    while epoch < epoch_num:
        # The loaders are rebuilt every epoch because the datasets'
        # input size may have been changed by the multi-scale switch below.
        train_loader = DataLoader(train_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
        train_totals = run_epoch(train_loader, training=True)
        #lr_reduce_scheduler.step()
        val_loader = DataLoader(val_dataSet, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
        val_totals = run_epoch(val_loader, training=False)

        epoch_train_loss = train_totals["loss"]
        epoch_val_loss = val_totals["loss"]
        epoch = epoch + 1

        if min_val_loss > epoch_val_loss:
            min_val_loss = epoch_val_loss
            param_dict['min_val_loss'] = min_val_loss
            # state_dict() hands out references to the live parameters; clone
            # them, otherwise the "best" snapshot always equals the current model.
            param_dict['min_loss_model'] = {
                key: value.detach().clone() for key, value in YOLO.state_dict().items()
            }
        if epoch % epoch_interval == 0:
            param_dict['model'] = YOLO.state_dict()
            # NOTE(review): the optimiser object itself is pickled; kept as-is
            # because the code that loads these checkpoints is not visible here.
            param_dict['optim'] = optimizer_SGD
            param_dict['epoch'] = epoch
            os.makedirs('./weights', exist_ok=True)
            # Both historical checkpoint files are still written so that
            # existing loaders keep working; the first also carries the
            # best-validation snapshot.
            torch.save(param_dict, './weights/YOLO_V1_PreTrain_' + str(epoch) + '.pth')
            torch.save({'model': param_dict['model'], 'optim': optimizer_SGD, 'epoch': epoch},
                       './YOLO_V3_' + str(epoch) + '.pth')
            # Start a new event file for the next interval.
            writer.close()
            writer = SummaryWriter(logdir='log', filename_suffix='[' + str(epoch) + '~' + str(epoch + epoch_interval) + ']')
        print("epoch : {} ; loss : {}".format(epoch, epoch_train_loss))

        # Multi-scale training: every 10 epochs pick a new input resolution
        # and rescale the anchors (declared at base_img_size) to match it.
        if epoch % 10 == 0:
            temp_input_size = random.choice(img_sizes)
            scale_factor = temp_input_size / base_img_size
            # Width AND height are scaled (the height used to be left unscaled).
            temp_anchors = [[round(anchor_box[0] * scale_factor), round(anchor_box[1] * scale_factor)]
                            for anchor_box in anchor_boxes]
            train_dataSet.setInputSize(temp_input_size, temp_anchors)
            val_dataSet.setInputSize(temp_input_size, temp_anchors)
            loss_function.setImgSize(temp_input_size, temp_anchors)

        for name, layer in YOLO.named_parameters():
            # Parameters that received no gradient have grad None; skip their gradient histogram.
            if layer.grad is not None:
                writer.add_histogram(name + '_grad', layer.grad.cpu().data.numpy(), epoch)
            writer.add_histogram(name + '_data', layer.cpu().data.numpy(), epoch)
        writer.add_scalar('Train/Loss_sum', epoch_train_loss, epoch)
        writer.add_scalar('Train/Loss_coord', train_totals["coord"], epoch)
        writer.add_scalar('Train/Loss_confidenct', train_totals["confidence"], epoch)
        writer.add_scalar('Train/Loss_classes', train_totals["classes"], epoch)
        writer.add_scalar('Train/Epoch_iou', mean_iou(train_totals["iou"], train_totals["objects"]), epoch)
        writer.add_scalar('Val/Loss_sum', epoch_val_loss, epoch)
        writer.add_scalar('Val/Loss_coord', val_totals["coord"], epoch)
        writer.add_scalar('Val/Loss_confidenct', val_totals["confidence"], epoch)
        writer.add_scalar('Val/Loss_classes', val_totals["classes"], epoch)
        writer.add_scalar('Val/Epoch_iou', mean_iou(val_totals["iou"], val_totals["objects"]), epoch)
    writer.close()
4. Loss设计
A.正负样本的选取
YOLOv3与YOLOv1对于正负样本的选取是非常不同的,YOLOv1是在训练的过程中,如果某一个grid cell含有物体的中心,那么让这个grid cell所预测的两个bounding box中,那个与真实框拥有更大IoU的box来预测真实框。
然而YOLOv3的方案是在制作ground truth的时候就已经分配好正负样本了,具体步骤如下:对于每一个物体,在浅、中、深三个层次下会对应3种尺度的grid cell,每一个grid cell拥有3个anchor尺度,此时让这总共9个anchor与ground truth计算IoU(注意这里的IoU计算只考虑形状不考虑中心点,因此应该将anchor与gt左上角对齐或者是中心点对齐后再计算IoU,其实这两种对齐方式得到的IoU值是一样的),将其分配给拥有最大IoU值的anchor,其他的如果IoU值大于阈值,则直接忽略。除了正样本和忽略样本以外的其他都是负样本。
B.损失函数
YOLOv3并没有官方的损失函数,这边贴一下大佬们整理的损失函数:
$\lambda_{coord}$:是定位损失的权重,类别和置信度损失的收敛是以定位准确为前提的,因此需要让网络优先学会定位。
$\lambda_{noobj}$:负样本置信度损失的权重,在YOLOv3中,由于引入了9种Anchor,以416为例,总的输出预测结果共有(13 × 13 + 26 × 26 + 52 × 52) × 3 = 10647个预测框,但是实际上一幅图片中真实物体是远没有这么多的,也就是说,正样本实际上远没有这么多,我们训练目标检测的最终目标是检出物体(对正样本的训练)而不是能够判断是不是背景(负样本),因此不能让过多的负样本淹没了正样本。
$(2 - w_i \times h_i)$:此处的 $w_i$ 表示的是真实框的宽相对于整幅图像的相对值,$h_i$ 类似。这个权重是为了提高小物体占所有损失的权重,对于小物体来说,$(2 - w_i \times h_i)$ 比大物体大。
注意:笔者注意到网上很多地方都有人说,对于没有分配到正样本的预测框,让其向Anchor靠齐,笔者暂时理解不能,并提出一些问题,如有大佬解答将万分感谢
1.Anchor只是一个形状锚框,本身是不具有任何位置信息的,那么Anchor的中心点来自哪里?
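2.我们知道最终的预测框是通过对Anchor进行畸变得到的,畸变公式为
$$b_x = \sigma(t_x) + c_x,\quad b_y = \sigma(t_y) + c_y,\quad b_w = p_w e^{t_w},\quad b_h = p_h e^{t_h}$$
其中 $(c_x, c_y)$ 是grid cell左上角的坐标,$(p_w, p_h)$ 是Anchor的宽和高。那么向Anchor靠齐指的是 $t_w = t_h = 0$(即 $b_w = p_w,\ b_h = p_h$)吗?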
3.笔者明白12800表明这种方案只用于网络迭代的初期,但是12800这个设定本身的争议性有多大?需要这么多次迭代让网络记忆Anchor吗?
YOLO_V3_Loss.py
import time
import torch.nn as nn
import math
import torch
class YOLO_V3_Loss(nn.Module):
def __init__(self, anchor_boxes, small_downsample=8, middle_downsample=16, big_downsample=32, class_num=80, B=3, l_coord=50, l_noobj=0.5):
# 有物体的box损失权重设为l_coord,没有物体的box损失权重设置为l_noobj
super(YOLO_V3_Loss, self).__init__()
self.B = B
self.class_num = class_num
self.l_coord = l_coord
self.l_noobj = l_noobj
self.anchor_boxes = anchor_boxes
self.small_downsmaple = small_downsample
self.middle_downsmaple = middle_downsample
self.big_downsmaple = big_downsample
def iou(self, predict_coord, ground_coord):
    """Return the IoU of two boxes given as ``[xmin, ymin, xmax, ymax]``.

    :param predict_coord: predicted box
    :param ground_coord: ground-truth box
    :return: intersection over union; 0 when the boxes do not overlap or when
             their union has no area (two degenerate boxes), which previously
             divided by zero
    """
    predict_area = (predict_coord[2] - predict_coord[0]) * (predict_coord[3] - predict_coord[1])
    ground_area = (ground_coord[2] - ground_coord[0]) * (ground_coord[3] - ground_coord[1])
    # Corners of the overlap rectangle.
    cross_left = max(predict_coord[0], ground_coord[0])
    cross_right = min(predict_coord[2], ground_coord[2])
    cross_top = max(predict_coord[1], ground_coord[1])
    cross_bottom = min(predict_coord[3], ground_coord[3])
    if cross_right < cross_left or cross_bottom < cross_top:  # no overlap
        return 0
    intersection = (cross_right - cross_left) * (cross_bottom - cross_top)
    union = predict_area + ground_area - intersection
    if union <= 0:
        # Both boxes are degenerate (zero area): define the IoU as 0.
        return 0
    return intersection / union
def forward(self, samll_bounding_boxes, middle_bounding_boxes, big_bounding_boxes, small_ground_truth, small_positive_modulus, small_anchor_mark_positive, small_anchor_mark_negative, small_positive_modulus_mark, middle_ground_truth, middle_positive_modulus, middle_anchor_mark_positive, middle_anchor_mark_negative, middle_positive_modulus_mark, big_ground_truth, big_positive_modulus, big_anchor_mark_positive, big_anchor_mark_negative, big_positive_modulus_mark): # 输入是 S * S * ( 2 * B + Classes)
# 定义三个计算损失的变量 正样本定位损失 样本置信度损失 样本类别损失
batch_size = len(samll_bounding_boxes[0])
loss = 0
loss_coord = 0
loss_confidence = 0
loss_classes = 0
iou_sum = 0
object_num = 0
#mse_loss = nn.MSELoss()
#bce_loss = nn.BCELoss()
positives_num = 0
negatives_num = 0
bce_loss = nn.BCEWithLogitsLoss()
small_grid_feature_size = round(self.img_size / self.small_downsmaple)
middle_grid_feature_size = round(self.img_size / self.middle_downsmaple)
big_grid_feature_size = round(self.img_size / self.big_downsmaple)
time_start = time.time()
# ground_size, batch_size, width, height, 3个anchor
# small_ground_truth = small_ground_truth.permute(4, 0, 1, 2, 3)
# samll_bounding_boxes = samll_bounding_boxes.permute(4, 0, 1, 2, 3)
#<=================small loss==============>
small_ground_positive = torch.masked_select(small_ground_truth, small_anchor_mark_positive)
object_num = object_num + len(small_ground_positive)
if len(small_ground_positive) > 0:
small_predict_positive = torch.masked_select(samll_bounding_boxes, small_anchor_mark_positive)
small_box_param = torch.masked_select(small_positive_modulus, small_positive_modulus_mark)
small_ground_positive = small_ground_positive.view([-1, 5 + self.class_num])
small_predict_positive = small_predict_positive.view([-1, 5 + self.class_num])
small_box_param = small_box_param.view([-1, 6])
for ground_index in range(len(small_ground_positive)):
ground_box = small_box_param[ground_index][1:5]
grid_x = int((ground_box[0] + ground_box[2]) / 2 / self.small_downsmaple)
grid_y = int((ground_box[1] + ground_box[3]) / 2 / self.small_downsmaple)
anchor_index = small_box_param[ground_index][5].int().item()
anchor_width, anchor_height = self.anchors_size[anchor_index]
predict_center_x = (grid_x + small_predict_positive[ground_index][0].item()) * self.small_downsmaple
predict_center_y = (grid_y + small_predict_positive[ground_index][1].item()) * self.small_downsmaple
predict_width = anchor_width * math.pow(math.e, small_predict_positive[ground_index][2].item())
predict_height = anchor_height * math.pow(math.e, small_predict_positive[ground_index][3].item())
predict_box = [round(predict_center_x - predict_width / 2),
round(predict_center_y - predict_height / 2),
round(predict_center_x + predict_width - predict_width / 2),
round(predict_center_y + predict_height - predict_height / 2)]
iou_sum = iou_sum + self.iou(predict_box, ground_box)
#print("iou:{}".format(self.iou(predict_box, ground_box)))
# positive samples
coord = self.l_coord * (torch.pow(small_ground_positive[:, 0:2] - small_predict_positive[:, 0:2], 2).sum() / batch_size + \
(torch.pow(small_ground_positive[:, 2] - small_predict_positive[:, 2], 2) * small_box_param[:,0]).sum() / batch_size + \
(torch.pow(small_ground_positive[:, 3] - small_predict_positive[:, 3], 2) * small_box_param[:, 0]).sum() / batch_size)
loss = loss + coord
loss_coord = loss_coord + coord.item()
confidence = torch.pow(small_ground_positive[:, 4] - small_predict_positive[:, 4], 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#small_predict_classes = torch.clamp(small_predict_positive[:, 5:].clone(), min=1e-5, max=1-1e-5)
classify = bce_loss(small_predict_positive[:, 5:], small_ground_positive[:, 5:])
loss = loss + classify
loss_classes = loss_classes + classify.item()
# negative
small_ground_negative = torch.masked_select(small_ground_truth, small_anchor_mark_negative)
if len(small_ground_negative) > 0:
small_predict_negative = torch.masked_select(samll_bounding_boxes, small_anchor_mark_negative)
confidence = self.l_noobj * torch.pow(small_ground_negative - small_predict_negative, 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#print("loss-1:{} coord:{} conf:{} class:{}".format(loss, coord, confidence, classify))
#<================middle loss==============>
middle_ground_positive = torch.masked_select(middle_ground_truth, middle_anchor_mark_positive)
object_num = object_num + len(middle_ground_positive)
if len(middle_ground_positive) > 0:
middle_predict_positive = torch.masked_select(middle_bounding_boxes, middle_anchor_mark_positive)
middle_box_param = torch.masked_select(middle_positive_modulus, middle_positive_modulus_mark)
middle_ground_positive = middle_ground_positive.view([-1, 5 + self.class_num])
middle_predict_positive = middle_predict_positive.view([-1, 5 + self.class_num])
middle_box_param = middle_box_param.view([-1, 6])
# positive samples
for ground_index in range(len(middle_ground_positive)):
ground_box = middle_box_param[ground_index][1:5]
grid_x = int((ground_box[0] + ground_box[2]) / 2 / self.middle_downsmaple)
grid_y = int((ground_box[1] + ground_box[3]) / 2 / self.middle_downsmaple)
anchor_index = middle_box_param[ground_index][5].int().item()
anchor_width, anchor_height = self.anchors_size[anchor_index]
predict_center_x = (grid_x + middle_predict_positive[ground_index][0].item()) * self.middle_downsmaple
predict_center_y = (grid_y + middle_predict_positive[ground_index][1].item()) * self.middle_downsmaple
predict_width = anchor_width * math.pow(math.e, middle_predict_positive[ground_index][2].item())
predict_height = anchor_height * math.pow(math.e, middle_predict_positive[ground_index][3].item())
predict_box = [round(predict_center_x - predict_width / 2),
round(predict_center_y - predict_height / 2),
round(predict_center_x + predict_width - predict_width / 2),
round(predict_center_y + predict_height - predict_height / 2)]
iou_sum = iou_sum + self.iou(predict_box, ground_box)
#print("iou:{}".format(self.iou(predict_box, ground_box)))
coord = self.l_coord * (torch.pow(middle_ground_positive[:, 0:2] - middle_predict_positive[:, 0:2], 2).sum() / batch_size + \
(torch.pow(middle_ground_positive[:, 2] - middle_predict_positive[:, 2], 2) * middle_box_param[:, 0]).sum() / batch_size + \
(torch.pow(middle_ground_positive[:, 3] - middle_predict_positive[:, 3], 2) * middle_box_param[:, 0]).sum() / batch_size)
loss = loss + coord
loss_coord = loss_coord + coord.item()
confidence = torch.pow(middle_ground_positive[:, 4] - middle_predict_positive[:, 4], 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#middle_predict_classes = torch.clamp(middle_predict_positive[:, 5:], min=1e-5, max=1 - 1e-5)
classify = bce_loss(middle_predict_positive[:, 5:], middle_ground_positive[:, 5:])
loss = loss + classify
loss_classes = loss_classes + classify.item()
# negative
middle_ground_negative = torch.masked_select(middle_ground_truth, middle_anchor_mark_negative)
if len(middle_ground_negative) > 0:
middle_predict_negative = torch.masked_select(middle_bounding_boxes, middle_anchor_mark_negative)
confidence = self.l_noobj * torch.pow(middle_ground_negative - middle_predict_negative, 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#print("loss-2:{} coord:{} conf:{} class:{}".format(loss, coord, confidence, classify))
#<=================big loss==============>
big_ground_positive = torch.masked_select(big_ground_truth, big_anchor_mark_positive)
big_predict_positive = torch.masked_select(big_bounding_boxes, big_anchor_mark_positive)
big_box_param = torch.masked_select(big_positive_modulus, big_positive_modulus_mark)
big_ground_positive = big_ground_positive.view([-1, 5 + self.class_num])
object_num = object_num + len(big_ground_positive)
if len(big_ground_positive) > 0:
big_predict_positive = big_predict_positive.view([-1, 5 + self.class_num])
big_box_param = big_box_param.view([-1, 6])
# positive samples
for ground_index in range(len(big_ground_positive)):
ground_box = big_box_param[ground_index][1:5]
grid_x = int((ground_box[0] + ground_box[2]) / 2 / self.big_downsmaple)
grid_y = int((ground_box[1] + ground_box[3]) / 2 / self.big_downsmaple)
anchor_index = big_box_param[ground_index][5].int().item()
anchor_width, anchor_height = self.anchors_size[anchor_index]
predict_center_x = (grid_x + big_predict_positive[ground_index][0].item()) * self.big_downsmaple
predict_center_y = (grid_y + big_predict_positive[ground_index][1].item()) * self.big_downsmaple
predict_width = anchor_width * math.pow(math.e, big_predict_positive[ground_index][2].item())
predict_height = anchor_height * math.pow(math.e, big_predict_positive[ground_index][3].item())
predict_box = [round(predict_center_x - predict_width / 2),
round(predict_center_y - predict_height / 2),
round(predict_center_x + predict_width - predict_width / 2),
round(predict_center_y + predict_height - predict_height / 2)]
iou_sum = iou_sum + self.iou(predict_box, ground_box)
#print("iou:{}".format(self.iou(predict_box, ground_box)))
coord = self.l_coord * (torch.pow(big_ground_positive[:, 0:2] - big_predict_positive[:, 0:2], 2).sum() / batch_size + \
(torch.pow(big_ground_positive[:, 2] - big_predict_positive[:, 2], 2) * big_box_param[:,0]).sum() / batch_size + \
(torch.pow(big_ground_positive[:, 3] - big_predict_positive[:, 3], 2) * big_box_param[:,0]).sum() / batch_size)
loss = loss + coord
loss_coord = loss_coord + coord.item()
confidence = torch.pow(big_ground_positive[:, 4] - big_predict_positive[:, 4], 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#big_predict_classes = torch.clamp(big_predict_positive[:, 5:], min=1e-7, max=1 - 1e-7)
classify = bce_loss(big_predict_positive[:, 5:], big_ground_positive[:, 5:])
loss = loss + classify
loss_classes = loss_classes + classify.item()
# negative
big_ground_negative = torch.masked_select(big_ground_truth, big_anchor_mark_negative)
if len(big_ground_negative) > 0:
big_predict_negative = torch.masked_select(big_bounding_boxes, big_anchor_mark_negative)
confidence = self.l_noobj * torch.pow(big_ground_negative - big_predict_negative, 2).sum() / batch_size
loss = loss + confidence
loss_confidence = loss_confidence + confidence.item()
#print("loss-3:{} coord:{} conf:{} class:{}".format(loss, coord, confidence, classify))
#time_end = time.time()
#print('loss_middle:totally cost:{} loss:{}'.format(time_end - time_start, loss))
#print("iou:{} num:{}".format(iou_sum, object_num))
return loss, loss_coord, loss_confidence, loss_classes, iou_sum.item(), object_num
def setImgSize(self, img_size, anchors_size):
    """Replace the image size and anchor sizes stored on this object.

    Args:
        img_size: New value stored as ``self.img_size``.
        anchors_size: New value stored as ``self.anchors_size``.
    """
    # Both attributes are swapped together so they always describe the
    # same input resolution.
    self.img_size, self.anchors_size = img_size, anchors_size
5.踩坑实况:
A.尝试使用ImageNet-mini进行预训练
离谱的是,网络迭代到后期直接发散了=-=
数据示例:ImageNet-mini数据集中的数据如下所示(示例图片:一条被人拿在手里的鱼,其类别标签对应的是鱼)
原因分析如下:
1.全局池化层带来的信息损失:实际上这个类别对应的物体是那条鱼。我们这里进行预训练的模型是Darknet-53,很关键的一点在于,这个backbone含有全局平均池化层(GAP),而这条鱼在图片中所占的比例太小,经过GAP之后信息损失太严重。试想另一个类别的物体也是被人拿在手里,经过全局池化后,这两幅图片保留下来的更多是人的信息,而我们却要把它们分到两个都不是“人”的细粒度类别中(即分为不同的类别),这对网络来说是很困难的,因此也不容易训练。
2.数据量不足:ImageNet-mini是细粒度分类数据集,但每个类别只有约30~50张图片,样本量难以支撑细粒度分类的训练。
验证猜想:使用VOC2007+2012数据集,结合标注的数据,拿出宽高均不小于200的bbox,resize到256尺度下后进行训练
实验验证:可以看到,使用这种方案后,网络并没有发散,并且训练得很快。
[注]:笔者这里用VOC数据集做预训练只是为了验证自己的猜想,最终的实验笔者还是选择用COCO2014进行预训练,再用VOC07+12混合数据集进行目标检测训练。