一、训练部分相关代码详解
tools/train.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import random
import warnings
from loguru import logger
import torch
import torch.backends.cudnn as cudnn
from yolox.core import Trainer, launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl, configure_omp, get_num_devices
def make_parser():
    """Build the command-line argument parser for YOLOX training."""
    parser = argparse.ArgumentParser("YOLOX train parser")
    add = parser.add_argument

    # Output name; results are written under YOLOX_outputs/<experiment-name>.
    add("-expn", "--experiment-name", type=str, default='Dark')
    # Model variant (s, m, l, x, ...).
    add("-n", "--name", type=str, default=None, help="model name")
    add("-b", "--batch-size", type=int, default=8, help="batch size")
    add("-d", "--devices", default=0, type=int, help="device for training")
    # Experiment description file (dataset paths, hyper-parameters, ...).
    add("-f", "--exp_file", default='../exps/yolox_voc_s.py', type=str,
        help="plz input your experiment description file")
    add("--resume", default=False, action="store_true", help="resume training")
    # ---- checkpoint / resume-related options ----
    add("-c", "--ckpt", default=None, type=str, help="checkpoint file")
    add("-e", "--start_epoch", default=None, type=int,
        help="resume training start epoch")
    # ---- multi-GPU / multi-node options ----
    add("--num_machines", default=1, type=int, help="num of node for training")
    add("--machine_rank", default=0, type=int,
        help="node rank for multi-node training")
    add("--fp16", dest="fp16", default=False, action="store_true",
        help="Adopting mix precision training.")
    # Cache images in RAM to speed up data loading.
    add("--cache", dest="cache", default=False, action="store_true",
        help="Caching imgs to RAM for fast training.")
    # NOTE(review): default=True combined with store_true means this flag can
    # never be switched off from the command line — presumably intentional.
    add("-o", "--occupy", dest="occupy", default=True, action="store_true",
        help="occupy GPU memory first for training.")
    # Remaining tokens are treated as "key value" config overrides.
    add("opts", help="Modify config options using the command-line",
        default=None, nargs=argparse.REMAINDER)
    return parser
@logger.catch
def main(exp, args):
    """Per-process training entry point invoked by `launch`.

    Configures the distributed/cudnn environment, then builds a Trainer
    from the experiment config `exp` and CLI args `args` and runs it.
    """
    # Deterministic seeding is disabled on purpose here: enabling it turns on
    # cudnn.deterministic, which can slow training down considerably.
    # if exp.seed is not None:
    #     random.seed(exp.seed)
    #     torch.manual_seed(exp.seed)
    #     cudnn.deterministic = True

    # set environment variables for distributed training
    configure_nccl()
    configure_omp()
    cudnn.benchmark = True

    trainer = Trainer(exp, args)
    trainer.train()
if __name__ == "__main__":
    args = make_parser().parse_args()
    # Build the experiment object from -f/--exp_file and -n/--name.
    exp = get_exp(args.exp_file, args.name)
    # Apply any "key value" overrides passed after the named options.
    exp.merge(args.opts)
    if not args.experiment_name:
        args.experiment_name = exp.exp_name
    # NOTE(review): --devices defaults to 0 (not None), so auto-detection via
    # get_num_devices() never triggers and num_gpu is 0 unless -d is passed
    # explicitly — confirm that `launch` handles num_gpu == 0 as intended.
    num_gpu = get_num_devices() if args.devices is None else args.devices
    assert num_gpu <= get_num_devices()
    launch(
        main,
        num_gpu,
        args.num_machines,
        args.machine_rank,
        args=(exp, args),
    )
yolox/core/trainer.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import datetime
import os
import time
from loguru import logger
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from yolox.data import DataPrefetcher
from yolox.utils import (
MeterBuffer,
ModelEMA,
all_reduce_norm,
get_local_rank,
get_model_info,
get_rank,
get_world_size,
gpu_mem_usage,
is_parallel,
load_ckpt,
occupy_mem,
save_checkpoint,
setup_logger,
synchronize
)
# (reading guide) jump 1: constructed from tools/train.py
class Trainer:
    """Drives YOLOX training end-to-end.

    Builds the model / optimizer / dataloader in `before_train`, runs the
    epoch and iteration loops with optional AMP and EMA, logs progress,
    evaluates periodically, and saves checkpoints.
    """

    def __init__(self, exp, args):
        # init function only defines some basic attr, other attrs like model,
        # optimizer are built in before_train methods.
        self.exp = exp    # experiment config (dataset, model, schedule)
        self.args = args  # command-line training config

        # training related attr
        self.max_epoch = exp.max_epoch
        self.amp_training = args.fp16
        # GradScaler is a no-op when fp16 is disabled.
        self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
        self.is_distributed = get_world_size() > 1
        self.rank = get_rank()
        self.local_rank = get_local_rank()
        self.device = "cuda:{}".format(self.local_rank)
        # EMA weights are only used at evaluation time (no gradient updates);
        # averaging the last n steps of weights makes the model more robust.
        self.use_model_ema = exp.ema

        # data/dataloader related attr
        self.data_type = torch.float16 if args.fp16 else torch.float32
        self.input_size = exp.input_size
        self.best_ap = 0

        # metric record: running averages / latest values per key
        self.meter = MeterBuffer(window_size=exp.print_interval)
        # Output directory for logs and checkpoints.
        self.file_name = os.path.join(exp.output_dir, args.experiment_name)

        if self.rank == 0:
            os.makedirs(self.file_name, exist_ok=True)

        setup_logger(
            self.file_name,
            distributed_rank=self.rank,
            filename="train_log.txt",
            mode="a",
        )

    def train(self):
        """Top-level entry: setup, run all epochs, then clean up."""
        self.before_train()
        try:
            self.train_in_epoch()
        except Exception:
            raise
        finally:
            self.after_train()

    def train_in_epoch(self):
        # Loop over epochs; before/after hooks handle augmentation switching,
        # checkpointing and evaluation.
        for self.epoch in range(self.start_epoch, self.max_epoch):
            self.before_epoch()
            self.train_in_iter()
            self.after_epoch()

    def train_in_iter(self):
        # Loop over iterations within one epoch.
        for self.iter in range(self.max_iter):
            self.before_iter()
            self.train_one_iter()
            self.after_iter()

    def train_one_iter(self):
        """One optimization step: fetch batch, forward, backward, EMA/lr update."""
        iter_start_time = time.time()

        inps, targets = self.prefetcher.next()  # asynchronously pre-loaded batch
        inps = inps.to(self.data_type)          # fp16 or fp32 depending on --fp16
        targets = targets.to(self.data_type)
        targets.requires_grad = False
        # Resize batch to the current (possibly multiscale) input size.
        inps, targets = self.exp.preprocess(inps, targets, self.input_size)
        data_end_time = time.time()

        # Mixed-precision forward pass (autocast is a no-op when disabled).
        with torch.cuda.amp.autocast(enabled=self.amp_training):
            outputs = self.model(inps, targets)

        loss = outputs["total_loss"]

        self.optimizer.zero_grad()           # clear gradients from previous step
        self.scaler.scale(loss).backward()   # scaled backward pass
        self.scaler.step(self.optimizer)     # unscale and apply optimizer step
        self.scaler.update()                 # adjust loss scale for next iter

        if self.use_model_ema:
            self.ema_model.update(self.model)

        # Per-iteration learning-rate schedule.
        lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1)
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

        iter_end_time = time.time()
        self.meter.update(
            iter_time=iter_end_time - iter_start_time,
            data_time=data_end_time - iter_start_time,
            lr=lr,
            **outputs,
        )

    def before_train(self):
        """Build model, optimizer, dataloader, scheduler, EMA and evaluator."""
        logger.info("args: {}".format(self.args))
        logger.info("exp value:\n{}".format(self.exp))

        # model related init
        torch.cuda.set_device(self.local_rank)
        # Model entry point: yolox_base.get_model -> YOLOPAFPN + YOLOXHead.
        model = self.exp.get_model()
        logger.info(
            "Model Summary: {}".format(get_model_info(model, self.exp.test_size))
        )
        model.to(self.device)

        # solver related init: bias / weight / bn parameter groups, SGD
        self.optimizer = self.exp.get_optimizer(self.args.batch_size)

        # value of epoch will be set in `resume_train`
        model = self.resume_train(model)

        # data related init
        # Mosaic/mixup are disabled for the last `no_aug_epochs` epochs, e.g.
        # max_epoch=20, no_aug_epochs=15 -> augmentation only in epochs 0..4.
        self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
        self.train_loader = self.exp.get_data_loader(
            batch_size=self.args.batch_size,
            is_distributed=self.is_distributed,
            no_aug=self.no_aug,
            cache_img=self.args.cache,
        )
        logger.info("init prefetcher, this might take one minute or less...")
        # DataPrefetcher copies the next batch to GPU on a side CUDA stream
        # while the model is busy, shrinking the gap between iterations.
        self.prefetcher = DataPrefetcher(self.train_loader)
        # max_iter means iters per epoch
        self.max_iter = len(self.train_loader)

        self.lr_scheduler = self.exp.get_lr_scheduler(
            self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
        )  # base lr = 0.01 / 64.0 * batch_size
        if self.args.occupy:
            occupy_mem(self.local_rank)  # grab GPU memory up-front

        if self.is_distributed:
            model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)

        if self.use_model_ema:
            self.ema_model = ModelEMA(model, 0.9998)
            # `updates` feeds the EMA decay ramp-up; set it so a resumed run
            # continues from the right point.
            self.ema_model.updates = self.max_iter * self.start_epoch

        self.model = model
        self.model.train()

        # Evaluator (for VOC: yolox_voc_s.py -> yolox/evaluators/voc_evaluator.py).
        self.evaluator = self.exp.get_evaluator(
            batch_size=self.args.batch_size, is_distributed=self.is_distributed
        )
        # Tensorboard logger (main process only).
        if self.rank == 0:
            self.tblogger = SummaryWriter(self.file_name)

        logger.info("Training start...")
        logger.info("\n{}".format(model))

    def after_train(self):
        """Final log line after training finishes (or is interrupted)."""
        logger.info(
            "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100)
        )

    def before_epoch(self):
        """Log epoch start; switch off mosaic and enable L1 loss near the end."""
        logger.info("---> start train epoch{}".format(self.epoch + 1))

        if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug:
            logger.info("--->No mosaic aug now!")
            self.train_loader.close_mosaic()
            logger.info("--->Add additional L1 loss now!")  # last no_aug epochs
            if self.is_distributed:
                self.model.module.head.use_l1 = True
            else:
                self.model.head.use_l1 = True
            # Evaluate every epoch once augmentation is off.
            self.exp.eval_interval = 1
            if not self.no_aug:
                self.save_ckpt(ckpt_name="last_mosaic_epoch")

    def after_epoch(self):
        """Save 'latest' checkpoint and periodically evaluate."""
        self.save_ckpt(ckpt_name="latest")

        if (self.epoch + 1) % self.exp.eval_interval == 0:
            all_reduce_norm(self.model)
            self.evaluate_and_save_model()

    def before_iter(self):
        # Intentionally empty hook.
        pass

    def after_iter(self):
        """
        `after_iter` contains two parts of logic:
            * log information
            * reset setting of resize
        """
        # log needed information
        if (self.iter + 1) % self.exp.print_interval == 0:
            # TODO check ETA logic
            left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1)
            eta_seconds = self.meter["iter_time"].global_avg * left_iters
            eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds)))

            progress_str = "epoch: {}/{}, iter: {}/{}".format(
                self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter
            )
            loss_meter = self.meter.get_filtered_meter("loss")
            loss_str = ", ".join(
                ["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()]
            )

            time_meter = self.meter.get_filtered_meter("time")
            time_str = ", ".join(
                ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
            )

            logger.info(
                "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format(
                    progress_str,
                    gpu_mem_usage(),
                    time_str,
                    loss_str,
                    self.meter["lr"].latest,
                )
                + (", size: {:d}, {}".format(self.input_size[0], eta_str))
            )
            self.meter.clear_meters()

        # random resizing: every 10 global iterations pick a new multiscale size
        if (self.progress_in_iter + 1) % 10 == 0:
            self.input_size = self.exp.random_resize(
                self.train_loader, self.epoch, self.rank, self.is_distributed
            )

    @property
    def progress_in_iter(self):
        # Global iteration index across all epochs.
        return self.epoch * self.max_iter + self.iter

    def resume_train(self, model):
        """Load weights.

        With --resume, restore full training state (model, optimizer, epoch);
        with only --ckpt, load just the model weights for fine-tuning.
        """
        if self.args.resume:
            logger.info("resume training")
            if self.args.ckpt is None:
                ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth")
            else:
                ckpt_file = self.args.ckpt

            ckpt = torch.load(ckpt_file, map_location=self.device)
            # resume the model/optimizer state dict
            model.load_state_dict(ckpt["model"])
            self.optimizer.load_state_dict(ckpt["optimizer"])
            # resume the training states variables
            start_epoch = (
                self.args.start_epoch - 1
                if self.args.start_epoch is not None
                else ckpt["start_epoch"]
            )
            self.start_epoch = start_epoch
            logger.info(
                "loaded checkpoint '{}' (epoch {})".format(
                    self.args.resume, self.start_epoch
                )
            )  # noqa
        else:
            if self.args.ckpt is not None:
                logger.info("loading checkpoint for fine tuning")
                ckpt_file = self.args.ckpt
                ckpt = torch.load(ckpt_file, map_location=self.device)["model"]
                model = load_ckpt(model, ckpt)
            self.start_epoch = 0

        return model

    def evaluate_and_save_model(self):
        """Run evaluation (on EMA weights when enabled) and save checkpoints."""
        if self.use_model_ema:
            evalmodel = self.ema_model.ema
        else:
            evalmodel = self.model
            if is_parallel(evalmodel):
                evalmodel = evalmodel.module

        ap50_95, ap50, summary = self.exp.eval(
            evalmodel, self.evaluator, self.is_distributed
        )
        self.model.train()
        if self.rank == 0:
            self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
            self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1)
            logger.info("\n" + summary)
        synchronize()

        self.save_ckpt("last_epoch", ap50_95 > self.best_ap)
        self.best_ap = max(self.best_ap, ap50_95)

    def save_ckpt(self, ckpt_name, update_best_ckpt=False):
        """Write a checkpoint (EMA weights when enabled); main process only."""
        if self.rank == 0:
            save_model = self.ema_model.ema if self.use_model_ema else self.model
            logger.info("Save weights to {}".format(self.file_name))
            ckpt_state = {
                "start_epoch": self.epoch + 1,
                "model": save_model.state_dict(),
                "optimizer": self.optimizer.state_dict(),
            }
            save_checkpoint(
                ckpt_state,
                update_best_ckpt,
                self.file_name,
                ckpt_name,
            )
二、参数初始化设定相关代码详解
yolox/exp/yolox_base.py
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import random
import torch
import torch.distributed as dist
import torch.nn as nn
from .base_exp import BaseExp
# This file holds the complete default parameter set for YOLOX — change
# defaults here first. yolox_voc_s.py builds on this class and adds the
# dataset paths.
class Exp(BaseExp):
    """Default YOLOX experiment: model size, dataloader, augmentation,
    optimization schedule and evaluation settings (COCO-oriented defaults;
    VOC overrides live in yolox_voc_s.py)."""

    def __init__(self):
        super().__init__()

        # ---------------- model config ---------------- #
        self.num_classes = 12  # number of dataset classes
        self.depth = 1.00
        self.width = 1.00
        self.act = 'silu'

        # ---------------- dataloader config ---------------- #
        # set worker to 4 for shorter dataloader init time
        self.data_num_workers = 4
        self.input_size = (640, 640)  # (height, width)
        # Actual multiscale ranges: [640-5*32, 640+5*32].
        # To disable multiscale training, set the
        # self.multiscale_range to 0.
        self.multiscale_range = 5
        # You can uncomment this line to specify a multiscale range
        # self.random_size = (14, 26)
        # COCO dataset settings (unused for VOC; see yolox_voc_s.py) ----------
        self.data_dir = None
        self.train_ann = "instances_train2017.json"
        self.val_ann = "instances_val2017.json"
        self.test_ann = "instances_test2017.json"

        # --------------- augmentation config ----------------- #
        self.mosaic_prob = 1.0
        self.mixup_prob = 1.0
        self.hsv_prob = 1.0
        self.flip_prob = 0.5
        self.degrees = 10.0
        self.translate = 0.1
        self.mosaic_scale = (0.1, 2)
        self.mixup_scale = (0.5, 1.5)
        self.shear = 2.0
        self.enable_mixup = True

        # -------------- training config --------------------- #
        self.warmup_epochs = 5
        self.max_epoch = 20  # total epochs
        self.warmup_lr = 0
        self.basic_lr_per_img = 0.01 / 64.0  # scaled by batch size in trainer.py
        self.scheduler = "yoloxwarmcos"
        self.no_aug_epochs = 15  # augmentation off for the last 15 epochs
        self.min_lr_ratio = 0.05
        self.ema = True  # keep an exponential moving average of the weights
        self.weight_decay = 5e-4
        self.momentum = 0.9
        self.print_interval = 1  # log every N iterations
        self.eval_interval = 1   # evaluate every N epochs
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

        # ----------------- testing config ------------------ #
        self.test_size = (640, 640)
        self.test_conf = 0.01
        self.nmsthre = 0.65

    def get_model(self):
        """Build (once) and return the YOLOX model: YOLOPAFPN backbone + head."""
        from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead

        def init_yolo(M):
            # Relax BatchNorm eps/momentum for detection training.
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if getattr(self, "model", None) is None:
            in_channels = [256, 512, 1024]
            backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act)
            head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act)
            self.model = YOLOX(backbone, head)

        self.model.apply(init_yolo)              # re-init BN hyper-parameters
        self.model.head.initialize_biases(1e-2)  # detection-head bias init
        return self.model

    # COCO-only dataloader; the VOC variant in yolox_voc_s.py overrides this.
    def get_data_loader(
        self, batch_size, is_distributed, no_aug=False, cache_img=False
    ):
        """Build the COCO training dataloader with mosaic/mixup augmentation."""
        from yolox.data import (
            COCODataset,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
            worker_init_reset_seed,
        )
        from yolox.utils import (
            wait_for_the_master,
            get_local_rank,
        )

        local_rank = get_local_rank()

        # Under DDP this barrier lets rank 0 prepare the dataset first.
        with wait_for_the_master(local_rank):
            dataset = COCODataset(
                data_dir=self.data_dir,
                json_file=self.train_ann,
                img_size=self.input_size,
                preproc=TrainTransform(
                    max_labels=50,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        # Wrap the dataset with mosaic + mixup augmentation.
        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            # Split the global batch across processes.
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0)

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler

        # Make sure each process has different random seed, especially for 'fork' method.
        # Check https://github.com/pytorch/pytorch/issues/63311 for more details.
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed

        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader

    def random_resize(self, data_loader, epoch, rank, is_distributed):
        """Pick a new random multiscale input size (rank 0 chooses, then
        broadcasts the (h, w) pair to all other ranks)."""
        tensor = torch.LongTensor(2).cuda()  # holds (height, width)

        if rank == 0:
            size_factor = self.input_size[1] * 1.0 / self.input_size[0]
            if not hasattr(self, 'random_size'):
                min_size = int(self.input_size[0] / 32) - self.multiscale_range
                max_size = int(self.input_size[0] / 32) + self.multiscale_range
                self.random_size = (min_size, max_size)
            size = random.randint(*self.random_size)
            # Sizes are multiples of 32 (network stride).
            size = (int(32 * size), 32 * int(size * size_factor))
            tensor[0] = size[0]
            tensor[1] = size[1]

        if is_distributed:
            dist.barrier()
            dist.broadcast(tensor, 0)

        input_size = (tensor[0].item(), tensor[1].item())
        return input_size

    def preprocess(self, inputs, targets, tsize):
        """Bilinearly resize a batch to `tsize` and rescale box targets."""
        scale_y = tsize[0] / self.input_size[0]
        scale_x = tsize[1] / self.input_size[1]
        if scale_x != 1 or scale_y != 1:
            inputs = nn.functional.interpolate(
                inputs, size=tsize, mode="bilinear", align_corners=False
            )
            # Targets layout: index 0 is class; odd columns are x-like,
            # even columns (from 2) are y-like coordinates.
            targets[..., 1::2] = targets[..., 1::2] * scale_x
            targets[..., 2::2] = targets[..., 2::2] * scale_y
        return inputs, targets

    def get_optimizer(self, batch_size):
        """Build (once) SGD with three param groups: BN weights (no decay),
        conv/linear weights (decay), and biases (no decay)."""
        if "optimizer" not in self.__dict__:
            if self.warmup_epochs > 0:
                lr = self.warmup_lr
            else:
                lr = self.basic_lr_per_img * batch_size

            pg0, pg1, pg2 = [], [], []  # optimizer parameter groups

            for k, v in self.model.named_modules():
                if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
                    pg2.append(v.bias)  # biases
                if isinstance(v, nn.BatchNorm2d) or "bn" in k:
                    pg0.append(v.weight)  # no decay
                elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
                    pg1.append(v.weight)  # apply decay

            optimizer = torch.optim.SGD(
                pg0, lr=lr, momentum=self.momentum, nesterov=True
            )
            optimizer.add_param_group(
                {"params": pg1, "weight_decay": self.weight_decay}
            )  # add pg1 with weight_decay
            optimizer.add_param_group({"params": pg2})
            self.optimizer = optimizer

        return self.optimizer

    def get_lr_scheduler(self, lr, iters_per_epoch):
        """Build the warmup-cosine LR scheduler used by the trainer."""
        from yolox.utils import LRScheduler

        scheduler = LRScheduler(
            self.scheduler,
            lr,
            iters_per_epoch,
            self.max_epoch,
            warmup_epochs=self.warmup_epochs,
            warmup_lr_start=self.warmup_lr,
            no_aug_epochs=self.no_aug_epochs,
            min_lr_ratio=self.min_lr_ratio,
        )
        return scheduler

    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the COCO validation (or test-dev) dataloader."""
        from yolox.data import COCODataset, ValTransform

        valdataset = COCODataset(
            data_dir=self.data_dir,
            json_file=self.val_ann if not testdev else self.test_ann,
            name="val2017" if not testdev else "test2017",
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)

        return val_loader

    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the COCO mAP evaluator over the validation loader."""
        from yolox.evaluators import COCOEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = COCOEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
            testdev=testdev,
        )
        return evaluator

    def eval(self, model, evaluator, is_distributed, half=False):
        """Delegate evaluation to the evaluator; returns (ap50_95, ap50, summary)."""
        return evaluator.evaluate(model, is_distributed, half)
# coco数据集----------------------------------------
yolox/exp/base_exp.py部分代码
from abc import ABCMeta, abstractmethod
from typing import Dict
from tabulate import tabulate
import torch
from torch.nn import Module
from yolox.utils import LRScheduler
class BaseExp(metaclass=ABCMeta):
    """Basic class for any experiment.

    Holds the file-level defaults that concrete experiment classes
    (yolox_base.Exp, yolox_voc_s.Exp) override in their own __init__.
    """

    def __init__(self):
        # No fixed RNG seed by default; subclasses may set one.
        self.seed = None
        # Directory where training artifacts (logs, checkpoints) are written.
        self.output_dir = "./YOLOX_outputs"
        # Logging frequency (iterations); re-set in yolox_base.py.
        self.print_interval = 100
        # Evaluation frequency (epochs); re-set in yolox_base.py.
        self.eval_interval = 10
exp/yolox_voc_s.py
# encoding: utf-8
import os
import torch
import torch.distributed as dist
from yolox.data import get_yolox_datadir
from yolox.exp import Exp as MyExp
# This file provides the parameters actually used at training time; it is
# based on (and overrides) yolox_base.py.
class Exp(MyExp):
    """VOC training experiment: YOLOX-S sized model (depth 0.33 / width 0.50)
    reading a local VOC2007 dataset."""

    def __init__(self):
        super(Exp, self).__init__()
        # ---------------- model config ---------------- #
        self.num_classes = 12  # number of dataset classes
        self.depth = 0.33
        self.width = 0.50
        self.warmup_epochs = 1

        # ---------- augmentation config ------------ #
        self.mosaic_prob = 1.0
        self.mixup_prob = 1.0
        self.hsv_prob = 1.0
        self.flip_prob = 0.5
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
        """Build the VOC training dataloader (mosaic/mixup augmentation,
        infinite sampling, per-worker reseeding)."""
        from yolox.data import (
            VOCDetection,            # VOC dataset wrapper
            TrainTransform,          # flip + HSV augmentation
            YoloBatchSampler,        # yields (mosaic flag, index) mini-batches
            DataLoader,              # data loader
            InfiniteSampler,         # endless stream of indices
            MosaicDetection,         # applies mosaic / mixup
            worker_init_reset_seed,  # reseeds each dataloader worker
        )
        from yolox.utils import (
            wait_for_the_master,  # barrier helper for distributed runs
            get_local_rank,       # local GPU index
        )

        local_rank = get_local_rank()

        # Under DDP this calls dist.barrier() so all processes stay in step.
        with wait_for_the_master(local_rank):
            dataset = VOCDetection(
                data_dir="D:/darkyolo/YOLOX-main/datasets/VOCdevkit/VOC2007/",
                # NOTE: ('train') is just the string 'train', not a tuple —
                # the author adapted voc.py (image_set indexing) accordingly.
                image_sets=[('train')],
                img_size=self.input_size,
                # TrainTransform defines __call__, so this instance is used
                # as the per-sample preprocessing callable.
                preproc=TrainTransform(
                    max_labels=100,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        # Wrap the dataset with mosaic + mixup augmentation.
        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            # Split the global batch across processes.
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(
            len(self.dataset), seed=self.seed if self.seed else 0
        )

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler

        # Make sure each process has different random seed, especially for 'fork' method
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed

        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader

    # Validation loader: same structure as training, without augmentation.
    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the VOC validation dataloader."""
        from yolox.data import VOCDetection, ValTransform

        valdataset = VOCDetection(
            data_dir="D:/darkyolo/YOLOX-main/datasets/VOCdevkit/VOC2007/",
            image_sets=[('val')],
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),  # resize/normalize only
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)

        return val_loader

    # Model evaluation, executed after training (and at eval_interval epochs).
    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        """Build the VOC mAP evaluator over the validation loader."""
        from yolox.evaluators import VOCEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = VOCEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
        )
        return evaluator