YOLOv8 trainer.py Source Code Walkthrough (Part 1)

trainer.py defines the base class BaseTrainer. The walkthrough follows below.

1. Initialization:
    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Initializes the BaseTrainer class.

        Args:
            cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
            overrides (dict, optional): Configuration overrides. Defaults to None.
        """
        self.args = get_cfg(cfg, overrides)     # get the trainer configuration
        self.check_resume(overrides)        # check the resume argument (whether this run resumes a previous training)
        self.device = select_device(self.args.device, self.args.batch)      # select the training device
        self.validator = None   # validator
        self.metrics = None     # training metrics
        self.plots = {}
        init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)    # initialize the random seeds for training

        # Dirs
        self.save_dir = get_save_dir(self.args)  # directory where run results are saved
        self.args.name = self.save_dir.name  # update name for loggers
        self.wdir = self.save_dir / "weights"  # directory where model weights are saved
        if RANK in {-1, 0}:
            self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
            self.args.save_dir = str(self.save_dir)
            yaml_save(self.save_dir / "args.yaml", vars(self.args))  # save run args
        self.last, self.best = self.wdir / "last.pt", self.wdir / "best.pt"  # paths of the last checkpoint (last.pt) and the best checkpoint (best.pt)
        self.save_period = self.args.save_period    # save a checkpoint every N epochs (values < 1 disable periodic checkpoints)

        self.batch_size = self.args.batch   # training batch size
        self.epochs = self.args.epochs      # number of training epochs
        self.start_epoch = 0
        if RANK == -1:
            print_args(vars(self.args))

        # Device
        if self.device.type in {"cpu", "mps"}:
            self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading

        # Model and Dataset
        # initialize the model and the datasets
        self.model = check_model_file_from_stem(self.args.model)  # add the file suffix to the model stem, e.g. yolov8n -> yolov8n.pt
        self.trainset, self.testset = self.get_dataset()  # get the training and test (validation) sets
        self.ema = None     # Exponential Moving Average (EMA) of the model weights

        # Optimization utils init
        # optimizer-related initialization
        self.lf = None      # learning rate lambda function (used by the scheduler)
        self.scheduler = None       # learning rate scheduler

        # Epoch level metrics
        # per-epoch metrics
        self.best_fitness = None    # best fitness so far
        self.fitness = None     # current fitness
        self.loss = None    # current loss
        self.tloss = None   # running mean of the loss items over the epoch
        self.loss_names = ["Loss"]
        self.csv = self.save_dir / "results.csv"  # metrics file results.csv
        self.plot_idx = [0, 1, 2]

        # Callbacks
        self.callbacks = _callbacks or callbacks.get_default_callbacks()  # callback functions
        if RANK in {-1, 0}:
            callbacks.add_integration_callbacks(self)
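
A typical way to drive BaseTrainer is through a task-specific subclass such as DetectionTrainer. The snippet below is only an illustrative sketch of that usage (the model, data and epochs values are placeholders); the overrides dict is exactly what feeds get_cfg(cfg, overrides) above.

from ultralytics.models.yolo.detect import DetectionTrainer

# Illustrative only: overrides are merged into DEFAULT_CFG by get_cfg()
overrides = {"model": "yolov8n.pt", "data": "coco8.yaml", "epochs": 3, "imgsz": 640}
trainer = DetectionTrainer(overrides=overrides)
trainer.train()  # enters BaseTrainer.train(), discussed in section 2
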
2. The training function train()

2.1 First, determine the number of devices used for training (world_size), handling the different forms the device argument can take.

if isinstance(self.args.device, str) and len(self.args.device):  # device given as a string, e.g. device='0' or device='0,1,2,3'
    world_size = len(self.args.device.split(","))
elif isinstance(self.args.device, (tuple, list)):  # device given as a tuple or list, e.g. device=[0, 1, 2, 3]
    world_size = len(self.args.device)
elif torch.cuda.is_available():  # CUDA is available but no device was specified
    world_size = 1  # default to device 0
else:  # none of the above: train on CPU or another non-GPU device
    world_size = 0
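
To make the branches concrete, the toy snippet below (a standalone helper, not part of the trainer) replays the same logic on a few sample device arguments:

# Hypothetical helper mirroring the world_size logic above
def count_devices(device, cuda_available=True):
    if isinstance(device, str) and len(device):   # e.g. '0' or '0,1,2,3'
        return len(device.split(","))
    elif isinstance(device, (tuple, list)):       # e.g. [0, 1, 2, 3]
        return len(device)
    elif cuda_available:                          # CUDA present, nothing specified
        return 1
    return 0                                      # CPU / MPS / other

print(count_devices("0,1,2,3"))  # -> 4, handled by the DDP branch in 2.2
print(count_devices([0, 1]))     # -> 2
print(count_devices(""))         # -> 1 if CUDA is available, otherwise 0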

2.2 For multi-GPU training, run argument checks and launch DDP training in a subprocess; otherwise train in the normal single-process way.

if world_size > 1 and "LOCAL_RANK" not in os.environ:
    # Argument checks
    if self.args.rect:
        LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
        self.args.rect = False
    if self.args.batch == -1:
        LOGGER.warning(
            "WARNING ⚠️ 'batch=-1' for AutoBatch is incompatible with Multi-GPU training, setting "
            "default 'batch=16'"
        )
        self.args.batch = 16

    cmd, file = generate_ddp_command(world_size, self)  # generate the subprocess command for DDP training
    try:
        LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
        subprocess.run(cmd, check=True)    # run the subprocess command
    except Exception as e:
        raise e
    finally:
        ddp_cleanup(self, str(file))    # clean up files and temporary resources created for DDP training

else:
    self._do_train(world_size)
3. The _do_train() function
if world_size > 1:  # more than one device: use multi-GPU (DDP) training
    self._setup_ddp(world_size)
self._setup_train(world_size)

3.1 The _setup_train() function

    def _setup_train(self, world_size):
        """Builds dataloaders and optimizer on correct rank process."""

        # set up the model
        self.run_callbacks("on_pretrain_routine_start")
        ckpt = self.setup_model()
        self.model = self.model.to(self.device)
        self.set_model_attributes()

        # freeze the specified layers; frozen layers receive no gradient updates
        freeze_list = (
            self.args.freeze
            if isinstance(self.args.freeze, list)
            else range(self.args.freeze)
            if isinstance(self.args.freeze, int)
            else []
        )
        always_freeze_names = [".dfl"]  # always freeze these layers
        freeze_layer_names = [f"model.{x}." for x in freeze_list] + always_freeze_names
        for k, v in self.model.named_parameters():
            # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
            if any(x in k for x in freeze_layer_names):
                LOGGER.info(f"Freezing layer '{k}'")
                v.requires_grad = False
            elif not v.requires_grad and v.dtype.is_floating_point:  # only floating point Tensor can require gradients
                LOGGER.info(
                    f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
                    "See ultralytics.engine.trainer for customization of frozen layers."
                )
                v.requires_grad = True

        # check whether Automatic Mixed Precision (AMP) is enabled
        self.amp = torch.tensor(self.args.amp).to(self.device)  # True or False
        if self.amp and RANK in {-1, 0}:  # Single-GPU and DDP
            callbacks_backup = callbacks.default_callbacks.copy()  # backup callbacks as check_amp() resets them
            self.amp = torch.tensor(check_amp(self.model), device=self.device)
            callbacks.default_callbacks = callbacks_backup  # restore callbacks
        if RANK > -1 and world_size > 1:  # DDP
            dist.broadcast(self.amp, src=0)  # broadcast the tensor from rank 0 to all other ranks (returns None)
        self.amp = bool(self.amp)  # as boolean
        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
        if world_size > 1:
            self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])

        # check the image size
        gs = max(int(self.model.stride.max() if hasattr(self.model, "stride") else 32), 32)  # grid size (max stride)
        self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
        self.stride = gs  # for multiscale training

        # if batch_size is -1 and RANK is -1 (single-GPU), estimate the best batch size from the model, image size and AMP setting
        if self.batch_size == -1 and RANK == -1:  # single-GPU only, estimate best batch size
            self.args.batch = self.batch_size = check_train_batch_size(self.model, self.args.imgsz, self.amp)

        # build the training dataloader and the validation dataloader
        batch_size = self.batch_size // max(world_size, 1)
        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
        if RANK in {-1, 0}:
            # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
            self.test_loader = self.get_dataloader(
                self.testset, batch_size=batch_size if self.args.task == "obb" else batch_size * 2, rank=-1, mode="val"
            )
            self.validator = self.get_validator()
            metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix="val")
            self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
            self.ema = ModelEMA(self.model)
            if self.args.plots:
                self.plot_training_labels()

        # build the optimizer and scale its hyperparameters accordingly
        self.accumulate = max(round(self.args.nbs / self.batch_size), 1)  # accumulate loss before optimizing
        weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs  # scale weight_decay
        iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
        self.optimizer = self.build_optimizer(
            model=self.model,
            name=self.args.optimizer,
            lr=self.args.lr0,
            momentum=self.args.momentum,
            decay=weight_decay,
            iterations=iterations,
        )
        # set up the learning rate scheduler
        self._setup_scheduler()
        self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
        self.resume_training(ckpt)
        self.scheduler.last_epoch = self.start_epoch - 1  # do not move
        self.run_callbacks("on_pretrain_routine_end")

3.2 Initialization at the start of training

nb = len(self.train_loader)  # number of batches in the training set
nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1  # number of warmup iterations (-1 disables warmup)
last_opt_step = -1  # step index of the last optimizer update
self.epoch_time = None
self.epoch_time_start = time.time()     # epoch start time
self.train_time_start = time.time()     # training start time
self.run_callbacks("on_train_start")

# log the training setup
LOGGER.info(
    f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
    f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
    f"Logging results to {colorstr('bold', self.save_dir)}\n"
    f'Starting training for ' + (f"{self.args.time} hours..." if self.args.time else f"{self.epochs} epochs...")
)

# check whether mosaic augmentation should be turned off for the last close_mosaic epochs
if self.args.close_mosaic:
    base_idx = (self.epochs - self.args.close_mosaic) * nb
    self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
epoch = self.start_epoch
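
A quick worked example with assumed values (warmup_epochs=3, epochs=100, close_mosaic=10, nb=200) shows what these quantities come out to:

nw = max(round(3 * 200), 100)   # -> 600 warmup iterations
base_idx = (100 - 10) * 200     # -> 18000: first iteration of the epoch where mosaic is turned off
# plot_idx becomes [0, 1, 2, 18000, 18001, 18002], so training samples are also plotted right after mosaic closes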

3.3 Preparation before each epoch:

self.epoch = epoch
self.run_callbacks("on_train_epoch_start")
with warnings.catch_warnings():     # catch warnings
    warnings.simplefilter("ignore")  # ignore the known scheduler warning raised here
    self.scheduler.step()   # update the learning rate
if RANK != -1:
    self.train_loader.sampler.set_epoch(epoch)
pbar = enumerate(self.train_loader)     # plain enumerator; wrapped in a TQDM progress bar on rank -1/0 below
# Update dataloader attributes (optional)
if epoch == (self.epochs - self.args.close_mosaic):     # is this the epoch at which mosaic should be turned off?
    self._close_dataloader_mosaic()     # disable mosaic augmentation
    self.train_loader.reset()       # reset the dataloader state

Mosaic is a data augmentation technique used in YOLO: several images are stitched into one large mosaic image, and the model is trained on this combined image. The mosaic contains four source images, each occupying one region of the combined canvas.
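
The minimal sketch below illustrates the idea only; it is not the Ultralytics implementation, which also remaps the labels, randomizes the mosaic center and rescales the tiles:

import numpy as np

def toy_mosaic(imgs, size=640):
    """imgs: list of 4 HxWx3 uint8 arrays; returns one (2*size)x(2*size)x3 mosaic."""
    canvas = np.full((2 * size, 2 * size, 3), 114, dtype=np.uint8)  # gray background
    corners = [(0, 0), (0, size), (size, 0), (size, size)]          # top-left corner of each quadrant
    for img, (y, x) in zip(imgs, corners):
        h, w = min(img.shape[0], size), min(img.shape[1], size)
        canvas[y:y + h, x:x + w] = img[:h, :w]                      # paste the (cropped) image into its quadrant
    return canvas

imgs = [np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) for _ in range(4)]
mosaic = toy_mosaic(imgs)  # bounding boxes would have to be shifted into the same coordinate frame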

if RANK in {-1, 0}:
    LOGGER.info(self.progress_string())     # print the header of the progress table
    pbar = TQDM(enumerate(self.train_loader), total=nb)     # create a progress bar with nb total iterations
self.tloss = None   # reset the running mean loss for this epoch
self.optimizer.zero_grad()     # zero the optimizer gradients

3.4 Training within each epoch

        Iterating over the batches of the epoch, the warmup adjustments are applied first:

for i, batch in pbar:
    self.run_callbacks("on_train_batch_start")
    # Warmup
    ni = i + nb * epoch
    if ni <= nw:
        xi = [0, nw]  # x interp
        self.accumulate = max(1, int(np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round()))
        for j, x in enumerate(self.optimizer.param_groups):
            # Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
            x["lr"] = np.interp(
                ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x["initial_lr"] *                 self.lf(epoch)]
            )
            if "momentum" in x:
                x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

ni is the current global iteration index: the batch index (i) plus the number of batches per epoch (nb) times the current epoch. While ni is less than or equal to the total number of warmup iterations (nw), self.accumulate is obtained by interpolating over the range xi; this value controls how many batches of gradients are accumulated before an optimizer step. The optimizer's parameter groups (self.optimizer.param_groups) are then traversed, and the learning rate and momentum are likewise warmed up by interpolation.
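
Plugging in some assumed numbers (nb=200 and warmup_epochs=3 so nw=600, nbs=64, batch_size=16, warmup_bias_lr=0.1, lr0=0.01, and using lr0 directly in place of initial_lr * lf(epoch) for simplicity) makes the warmup ramps visible:

import numpy as np

nw = 600   # warmup iterations
ni = 150   # current iteration (epoch 0, batch 150)
xi = [0, nw]

accumulate = max(1, int(np.interp(ni, xi, [1, 64 / 16]).round()))  # ramps 1 -> 4, here 2
bias_lr = np.interp(ni, xi, [0.1, 0.01])                           # bias lr falls from 0.1 towards lr0 (0.0775 here)
other_lr = np.interp(ni, xi, [0.0, 0.01])                          # other lrs rise from 0.0 towards lr0 (0.0025 here)
print(accumulate, bias_lr, other_lr)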

        Forward and backward passes:

# Forward
with torch.cuda.amp.autocast(self.amp):    # run the forward pass under automatic mixed precision (enabled when self.amp is True)
    batch = self.preprocess_batch(batch)    # preprocess the batch
    self.loss, self.loss_items = self.model(batch)  # compute the loss and the individual loss items
    if RANK != -1:
        self.loss *= world_size
    self.tloss = (    # update the running mean of the loss items
        (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items
    )

# Backward
self.scaler.scale(self.loss).backward()    # backpropagate, with the loss scaled by the GradScaler
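
The tloss update in the forward block above is just an incremental (running) mean of the per-batch loss items; a toy version with made-up losses:

import torch

losses = [torch.tensor([4.0]), torch.tensor([2.0]), torch.tensor([3.0])]  # assumed per-batch loss items
tloss = None
for i, loss_items in enumerate(losses):
    tloss = (tloss * i + loss_items) / (i + 1) if tloss is not None else loss_items
print(tloss)  # tensor([3.]) == mean of 4, 2 and 3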

        Optimizer step and time-limit check:

# Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
if ni - last_opt_step >= self.accumulate:
    self.optimizer_step()    # take an optimizer step
    last_opt_step = ni       # record the step index

    # Timed stopping (time budget)
    if self.args.time:
        self.stop = (time.time() - self.train_time_start) > (self.args.time * 3600)   # has the time budget been exceeded?
        if RANK != -1:  # in DDP training
            broadcast_list = [self.stop if RANK == 0 else None]
            dist.broadcast_object_list(broadcast_list, 0)  # broadcast the flag to all ranks
            self.stop = broadcast_list[0]
        if self.stop:  # time budget exceeded
            break
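
optimizer_step() itself is not shown in this excerpt; in the Ultralytics source it roughly follows the standard AMP recipe (unscale, clip, step, update, zero the gradients, update the EMA). The sketch below is written from memory rather than quoted verbatim:

def optimizer_step(self):
    """Rough sketch of BaseTrainer.optimizer_step(), not a verbatim copy."""
    self.scaler.unscale_(self.optimizer)                                    # unscale gradients so they can be clipped
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
    self.scaler.step(self.optimizer)                                        # optimizer step through the GradScaler
    self.scaler.update()                                                    # update the loss-scaling factor
    self.optimizer.zero_grad()
    if self.ema:
        self.ema.update(self.model)                                         # update the EMA weights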

        Logging and plotting of training samples:

# Log
mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if RANK in {-1, 0}:
    pbar.set_description(
        ("%11s" * 2 + "%11.4g" * (2 + loss_len))
        % (f"{epoch + 1}/{self.epochs}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
    )
    self.run_callbacks("on_batch_end")
    if self.args.plots and ni in self.plot_idx:
        self.plot_training_samples(batch, ni)
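
The format string builds one right-aligned row per batch. With assumed values (loss_len=3, e.g. box/cls/dfl losses) it behaves like this:

fmt = "%11s" * 2 + "%11.4g" * (2 + 3)
row = fmt % ("1/100", "3.2G", 1.234, 0.876, 1.012, 16, 640)
print(row)  # epoch, GPU memory, the three mean losses, instances in the batch, image size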

3.7 Operations after each epoch

if RANK in {-1, 0}:
    final_epoch = epoch + 1 >= self.epochs  # is this the final epoch?
    self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"])    # update the EMA attributes

    # run validation
    if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
        self.metrics, self.fitness = self.validate()
    self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})    # save metrics
    self.stop |= self.stopper(epoch + 1, self.fitness) or final_epoch    # update the stop flag
    if self.args.time:
        self.stop |= (time.time() - self.train_time_start) > (self.args.time * 3600)

    # save the model
    if self.args.save or final_epoch:
        self.save_model()
        self.run_callbacks("on_model_save")

    # Scheduler
    t = time.time()
    self.epoch_time = t - self.epoch_time_start     # duration of this epoch
    self.epoch_time_start = t       # update the epoch start time
    if self.args.time:
        mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1)      # mean time per epoch so far
        self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time)         # recompute the total number of epochs from the time budget
        self._setup_scheduler()     # rebuild the scheduler for the new epoch count
        self.scheduler.last_epoch = self.epoch  # do not move
        self.stop |= epoch >= self.epochs  # stop if the current epoch already reaches the recomputed total
    self.run_callbacks("on_fit_epoch_end")
    gc.collect()    # run garbage collection
    torch.cuda.empty_cache()  # clear the GPU cache

# Early Stopping
if RANK != -1:  # if DDP training
    broadcast_list = [self.stop if RANK == 0 else None]    # build the broadcast list
    dist.broadcast_object_list(broadcast_list, 0)  # broadcast the stop flag to synchronize all ranks
    self.stop = broadcast_list[0]
if self.stop:
    break  # stop training
epoch += 1
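
The time-budget branch above effectively converts args.time (in hours) into an epoch count on the fly. With assumed values, say args.time=2 hours and 30 epochs completed in exactly one hour so far:

import math

mean_epoch_time = 3600 / 30                       # 120 s per epoch on average
epochs = math.ceil(2 * 3600 / mean_epoch_time)    # -> 60: the total number of epochs that fit into the budget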

After training finishes, best.pt is validated one more time and the resulting metrics are returned.

if RANK in {-1, 0}:
    # Do final val with best.pt
    LOGGER.info(
        f"\n{epoch - self.start_epoch + 1} epochs completed in "
        f"{(time.time() - self.train_time_start) / 3600:.3f} hours."
    )
    self.final_eval()    # call final_eval() for the final validation with best.pt
    if self.args.plots:
        self.plot_metrics()
    self.run_callbacks("on_train_end")
gc.collect()
torch.cuda.empty_cache()
self.run_callbacks("teardown")

That's it for now; further details will follow in a later part.
