trainer.py defines the base class BaseTrainer(). A walkthrough follows.
1. Initialization:
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""
Initializes the BaseTrainer class.
Args:
cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
overrides (dict, optional): Configuration overrides. Defaults to None.
"""
self.args = get_cfg(cfg, overrides) # get the trainer configuration
self.check_resume(overrides) # check the resume argument (whether this run resumes a previous training)
self.device = select_device(self.args.device, self.args.batch) # select the training device
self.validator = None # validator
self.metrics = None # training metrics
self.plots = {}
init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic) # initialize the training random seeds
# Dirs
self.save_dir = get_save_dir(self.args) # directory where run results are saved
self.args.name = self.save_dir.name # update name for loggers
self.wdir = self.save_dir / "weights" # directory where model weights are saved
if RANK in {-1, 0}:
self.wdir.mkdir(parents=True, exist_ok=True) # make dir
self.args.save_dir = str(self.save_dir)
yaml_save(self.save_dir / "args.yaml", vars(self.args)) # save run args
self.last, self.best = self.wdir / "last.pt", self.wdir / "best.pt" # paths of the last (last.pt) and best (best.pt) checkpoints
self.save_period = self.args.save_period # save a checkpoint every N epochs (values < 1 disable this feature)
self.batch_size = self.args.batch # training batch size
self.epochs = self.args.epochs # number of training epochs
self.start_epoch = 0
if RANK == -1:
print_args(vars(self.args))
# Device
if self.device.type in {"cpu", "mps"}:
self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading
# Model and Dataset
self.model = check_model_file_from_stem(self.args.model) # add the file suffix if missing, e.g. yolov8n -> yolov8n.pt
self.trainset, self.testset = self.get_dataset() # get the training and test sets
self.ema = None # Exponential Moving Average (EMA) of the model weights
# Optimization utils init
self.lf = None # learning-rate lambda function used by the scheduler
self.scheduler = None # learning-rate scheduler
# Epoch level metrics
self.best_fitness = None # best fitness so far
self.fitness = None # current fitness
self.loss = None # current loss value
self.tloss = None # running mean of the loss over the current epoch
self.loss_names = ["Loss"]
self.csv = self.save_dir / "results.csv" # metrics file results.csv
self.plot_idx = [0, 1, 2]
# Callbacks
self.callbacks = _callbacks or callbacks.get_default_callbacks() # callback functions
if RANK in {-1, 0}:
callbacks.add_integration_callbacks(self)
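For context, a trainer with these arguments is usually constructed through the high-level API; a minimal sketch (the dataset YAML and epoch count are just example values):

from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # load a pretrained model; its task selects the trainer subclass
model.train(data="coco8.yaml", epochs=3, imgsz=640)  # the kwargs become the `overrides` dict above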
2. The training function train()
2.1 First, determine how many devices will be used during training (world_size), covering the different ways the device argument can be specified.
if isinstance(self.args.device, str) and len(self.args.device): # device given as a string, e.g. device='0' or device='0,1,2,3'
world_size = len(self.args.device.split(","))
elif isinstance(self.args.device, (tuple, list)): # device given as a tuple or list, e.g. device=[0, 1, 2, 3]
world_size = len(self.args.device)
elif torch.cuda.is_available(): # check whether CUDA is available
world_size = 1 # default to device 0
else: # none of the above: train on CPU or another non-GPU device
world_size = 0
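The branching above can be exercised in isolation; a standalone sketch (not the library's code) that mirrors it:

import torch

def world_size_from_device(device):
    """Mirror the branching above: count training processes for a device spec."""
    if isinstance(device, str) and len(device):  # '0' or '0,1,2,3'
        return len(device.split(","))
    if isinstance(device, (tuple, list)):  # [0, 1, 2, 3]
        return len(device)
    return 1 if torch.cuda.is_available() else 0  # default GPU 0, else CPU/MPS

assert world_size_from_device("0,1,2,3") == 4
assert world_size_from_device([0, 1]) == 2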
2.2 For multi-GPU runs, launch a DDP subprocess after checking the arguments; if the conditions are not met, training proceeds in the normal single-process way.
if world_size > 1 and "LOCAL_RANK" not in os.environ:
# Argument checks
if self.args.rect:
LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
self.args.rect = False
if self.args.batch == -1:
LOGGER.warning(
"WARNING ⚠️ 'batch=-1' for AutoBatch is incompatible with Multi-GPU training, setting "
"default 'batch=16'"
)
self.args.batch = 16
cmd, file = generate_ddp_command(world_size, self) # generate the DDP training subprocess command
try:
LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
subprocess.run(cmd, check=True) # run the subprocess command
except Exception as e:
raise e
finally:
ddp_cleanup(self, str(file)) # clean up the files and temporary resources created during DDP training
else:
self._do_train(world_size)
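Under the hood the generated cmd invokes torch.distributed.run; it is roughly of the form below (the port and temp-file path are illustrative), and each spawned process then re-enters training with its own LOCAL_RANK set:

python -m torch.distributed.run --nproc_per_node 2 --master_port 12345 /tmp/_temp_xxxxx.py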
3. The _do_train() function
if world_size > 1: # more than one device: use multi-GPU training
self._setup_ddp(world_size)
self._setup_train(world_size)
3.1 The _setup_train() function
def _setup_train(self, world_size):
"""Builds dataloaders and optimizer on correct rank process."""
# Set up the model
self.run_callbacks("on_pretrain_routine_start")
ckpt = self.setup_model()
self.model = self.model.to(self.device)
self.set_model_attributes()
# Freeze the specified layers; frozen layers receive no gradient updates
freeze_list = (
self.args.freeze
if isinstance(self.args.freeze, list)
else range(self.args.freeze)
if isinstance(self.args.freeze, int)
else []
)
always_freeze_names = [".dfl"] # always freeze these layers
freeze_layer_names = [f"model.{x}." for x in freeze_list] + always_freeze_names
for k, v in self.model.named_parameters():
# v.register_hook(lambda x: torch.nan_to_num(x)) # NaN to 0 (commented for erratic training results)
if any(x in k for x in freeze_layer_names):
LOGGER.info(f"Freezing layer '{k}'")
v.requires_grad = False
elif not v.requires_grad and v.dtype.is_floating_point: # only floating point Tensor can require gradients
LOGGER.info(
f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
"See ultralytics.engine.trainer for customization of frozen layers."
)
v.requires_grad = True
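As an illustration of the expansion above, freeze=10 freezes the first ten modules (a small standalone sketch of the same logic):

freeze = 10  # e.g. passed as train(..., freeze=10)
freeze_list = range(freeze) if isinstance(freeze, int) else (freeze or [])
freeze_layer_names = [f"model.{x}." for x in freeze_list] + [".dfl"]
print(freeze_layer_names[:3])  # ['model.0.', 'model.1.', 'model.2.']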
# Check whether automatic mixed precision (AMP) is enabled
self.amp = torch.tensor(self.args.amp).to(self.device) # True or False
if self.amp and RANK in {-1, 0}: # Single-GPU and DDP
callbacks_backup = callbacks.default_callbacks.copy() # backup callbacks as check_amp() resets them
self.amp = torch.tensor(check_amp(self.model), device=self.device)
callbacks.default_callbacks = callbacks_backup # restore callbacks
if RANK > -1 and world_size > 1: # DDP
dist.broadcast(self.amp, src=0) # broadcast the tensor from rank 0 to all other ranks (returns None)
self.amp = bool(self.amp) # as boolean
self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
if world_size > 1:
self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])
# Check the image size
gs = max(int(self.model.stride.max() if hasattr(self.model, "stride") else 32), 32) # grid size (max stride)
self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
self.stride = gs # for multiscale training
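check_imgsz rounds the requested size up to a multiple of the stride; a sketch of the arithmetic only (the library call also validates bounds and may emit a warning):

import math

gs, imgsz = 32, 641
adjusted = math.ceil(imgsz / gs) * gs
print(adjusted)  # 672, the nearest stride multiple at or above the request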
# If batch_size is -1 on a single-process run (RANK == -1), estimate the best batch size from the model, image size, and AMP setting
if self.batch_size == -1 and RANK == -1: # single-GPU only, estimate best batch size
self.args.batch = self.batch_size = check_train_batch_size(self.model, self.args.imgsz, self.amp)
# Build the training and test dataloaders
batch_size = self.batch_size // max(world_size, 1)
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
if RANK in {-1, 0}:
# Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
self.test_loader = self.get_dataloader(
self.testset, batch_size=batch_size if self.args.task == "obb" else batch_size * 2, rank=-1, mode="val"
)
self.validator = self.get_validator()
metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix="val")
self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
self.ema = ModelEMA(self.model)
if self.args.plots:
self.plot_training_labels()
# Create the optimizer and scale its settings accordingly
self.accumulate = max(round(self.args.nbs / self.batch_size), 1) # accumulate loss before optimizing
weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs # scale weight_decay
iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
self.optimizer = self.build_optimizer(
model=self.model,
name=self.args.optimizer,
lr=self.args.lr0,
momentum=self.args.momentum,
decay=weight_decay,
iterations=iterations,
)
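With the nominal batch size nbs = 64 and the assumed default values below, a real batch of 16 accumulates gradients over 4 batches, so the effective batch equals nbs and weight_decay is left unchanged; a worked example:

nbs, batch_size, base_decay = 64, 16, 0.0005  # assumed default values
accumulate = max(round(nbs / batch_size), 1)  # 4 batches per optimizer step
weight_decay = base_decay * batch_size * accumulate / nbs  # 0.0005: effective batch == nbs
print(accumulate, weight_decay)  # 4 0.0005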
# Set up the learning-rate scheduler
self._setup_scheduler()
self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
self.resume_training(ckpt)
self.scheduler.last_epoch = self.start_epoch - 1 # do not move
self.run_callbacks("on_pretrain_routine_end")
3.2 Initialization at the start of training
nb = len(self.train_loader) # number of batches in the training set
nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1 # number of warmup iterations (-1 disables warmup)
last_opt_step = -1 # step index of the last optimizer update
self.epoch_time = None
self.epoch_time_start = time.time() # start time of the current epoch
self.train_time_start = time.time() # start time of training
self.run_callbacks("on_train_start")
# Log training information
LOGGER.info(
f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
f"Logging results to {colorstr('bold', self.save_dir)}\n"
f'Starting training for ' + (f"{self.args.time} hours..." if self.args.time else f"{self.epochs} epochs...")
)
# Check whether mosaic augmentation should be disabled for the final epochs
if self.args.close_mosaic:
base_idx = (self.epochs - self.args.close_mosaic) * nb
self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
epoch = self.start_epoch
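For example, with warmup_epochs = 3 and nb = 50 batches per epoch, nw = max(round(3 * 50), 100) = 150 warmup iterations; and with epochs = 100 and close_mosaic = 10, base_idx = (100 - 10) * 50 = 4500, so the first three batches of epoch 90 (where mosaic turns off) are also plotted.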
3.3 Preparation before each epoch:
self.epoch = epoch
self.run_callbacks("on_train_epoch_start")
with warnings.catch_warnings(): # catch warnings
warnings.simplefilter("ignore") # suppress specific known warnings
self.scheduler.step() # update the learning rate
if RANK != -1:
self.train_loader.sampler.set_epoch(epoch)
pbar = enumerate(self.train_loader) # create the iterable used for the progress bar
# Update dataloader attributes (optional)
if epoch == (self.epochs - self.args.close_mosaic): # is this the epoch at which mosaic should be switched off?
self._close_dataloader_mosaic() # disable mosaic augmentation
self.train_loader.reset() # reset the dataloader state
Mosaic is a data augmentation technique used in YOLO: four images are stitched into one large mosaic image, and training runs on the combined image, each quadrant of which is taken from a different source image; see the sketch after this code block.
if RANK in {-1, 0}:
LOGGER.info(self.progress_string()) # print the training progress header
pbar = TQDM(enumerate(self.train_loader), total=nb) # create a progress bar with nb total iterations
self.tloss = None # reset the running mean loss
self.optimizer.zero_grad() # zero the optimizer gradients
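A minimal sketch of the mosaic idea with NumPy (illustrative only; the real implementation also randomizes the center point, scales the images, and remaps the labels):

import numpy as np

def mosaic4(imgs):
    """Stitch four equally sized HxWx3 images into one 2Hx2W mosaic."""
    top = np.concatenate([imgs[0], imgs[1]], axis=1)     # top-left | top-right
    bottom = np.concatenate([imgs[2], imgs[3]], axis=1)  # bottom-left | bottom-right
    return np.concatenate([top, bottom], axis=0)

imgs = [np.random.randint(0, 256, (320, 320, 3), dtype=np.uint8) for _ in range(4)]
print(mosaic4(imgs).shape)  # (640, 640, 3)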
3.4 Training within each epoch
For each epoch, the warmup logic below runs first:
for i, batch in pbar:
self.run_callbacks("on_train_batch_start")
# Warmup
ni = i + nb * epoch
if ni <= nw:
xi = [0, nw] # x interp
self.accumulate = max(1, int(np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round()))
for j, x in enumerate(self.optimizer.param_groups):
# Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
x["lr"] = np.interp(
ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x["initial_lr"] * self.lf(epoch)]
)
if "momentum" in x:
x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
ni is the global iteration count: the batch index (i) plus the number of batches per epoch (nb) times the current epoch. While ni is at most the warmup iteration total (nw), interpolation over the range xi yields self.accumulate, which controls how many batches of gradients are accumulated before an optimizer step; the loop then walks the optimizer's parameter groups (self.optimizer.param_groups) and interpolates their learning rates and momentum.
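Concretely, with assumed defaults (warmup_bias_lr = 0.1, lr0 = 0.01, warmup_momentum = 0.8, momentum = 0.937), nw = 300 warmup iterations, and the endpoints simplified to lr0 (the real code scales by the scheduler's lf(epoch)), the halfway point looks like this standalone sketch:

import numpy as np

nw, ni = 300, 150  # assumed warmup length, current global iteration
xi = [0, nw]
accumulate = max(1, int(np.interp(ni, xi, [1, 4]).round()))  # ramps 1 -> nbs / batch_size
bias_lr = np.interp(ni, xi, [0.1, 0.01])    # bias group: falls from warmup_bias_lr to lr0
other_lr = np.interp(ni, xi, [0.0, 0.01])   # other groups: rise from 0.0 to lr0
momentum = np.interp(ni, xi, [0.8, 0.937])  # warmup_momentum -> momentum
print(accumulate, bias_lr, other_lr, momentum)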
Forward and backward passes:
# Forward
with torch.cuda.amp.autocast(self.amp): # run the forward pass under mixed precision (AMP)
batch = self.preprocess_batch(batch) # preprocess the batch
self.loss, self.loss_items = self.model(batch) # compute the loss value and individual loss items
if RANK != -1:
self.loss *= world_size
self.tloss = ( # update the running mean loss
(self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items
)
# Backward
self.scaler.scale(self.loss).backward() # backward pass, with the loss scaled automatically by GradScaler
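The tloss update is an incremental mean: after batch i, (tloss * i + loss_items) / (i + 1) equals the average of all i + 1 loss vectors seen so far. A scalar check:

tloss = None
for i, loss in enumerate([2.0, 4.0, 6.0]):
    tloss = (tloss * i + loss) / (i + 1) if tloss is not None else loss
print(tloss)  # 4.0, the mean of the three values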
Optimizer step and time-based stopping:
# Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
if ni - last_opt_step >= self.accumulate:
self.optimizer_step() # take an optimizer step
last_opt_step = ni # record the step index
# Timed stopping
if self.args.time:
self.stop = (time.time() - self.train_time_start) > (self.args.time * 3600) # has the time budget run out?
if RANK != -1: # in distributed training
broadcast_list = [self.stop if RANK == 0 else None]
dist.broadcast_object_list(broadcast_list, 0) # broadcast to all processes
self.stop = broadcast_list[0]
if self.stop: # time budget exceeded
break
Log training information and plot training samples:
# Log
mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G" # (GB)
loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if RANK in {-1, 0}:
pbar.set_description(
("%11s" * 2 + "%11.4g" * (2 + loss_len))
% (f"{epoch + 1}/{self.epochs}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
)
self.run_callbacks("on_batch_end")
if self.args.plots and ni in self.plot_idx:
self.plot_training_samples(batch, ni)
3.7 Operations after each training epoch
if RANK in {-1, 0}:
final_epoch = epoch + 1 >= self.epochs # is this the last epoch?
self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"]) # update the EMA attributes
# Validation
if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
self.metrics, self.fitness = self.validate()
self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr}) # save the metrics
self.stop |= self.stopper(epoch + 1, self.fitness) or final_epoch # update the stop flag
if self.args.time:
self.stop |= (time.time() - self.train_time_start) > (self.args.time * 3600)
# Save the model
if self.args.save or final_epoch:
self.save_model()
self.run_callbacks("on_model_save")
# Scheduler
t = time.time()
self.epoch_time = t - self.epoch_time_start # duration of this epoch
self.epoch_time_start = t # update the epoch start time
if self.args.time:
mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1) # mean time per epoch so far
self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time) # recompute the total epochs from the time budget
self._setup_scheduler() # reconfigure the scheduler
self.scheduler.last_epoch = self.epoch # do not move
self.stop |= epoch >= self.epochs # stop once the epoch count reaches the (possibly updated) total
self.run_callbacks("on_fit_epoch_end")
gc.collect() # run garbage collection
torch.cuda.empty_cache() # free cached GPU memory
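As an illustration of the time-budget logic above: with time = 2 (a two-hour budget) and a mean epoch time of 240 seconds so far, the total is recomputed as math.ceil(2 * 3600 / 240) = 30 epochs, and the scheduler is rebuilt for that new horizon.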
# Early Stopping
if RANK != -1: # if DDP training
broadcast_list = [self.stop if RANK == 0 else None] # build the broadcast payload
dist.broadcast_object_list(broadcast_list, 0) # broadcast the stop flag so all processes agree
self.stop = broadcast_list[0]
if self.stop:
break # stop training
epoch += 1
After training finishes, the best.pt model is validated and the results and metrics are returned:
if RANK in {-1, 0}:
# Do final val with best.pt
LOGGER.info(
f"\n{epoch - self.start_epoch + 1} epochs completed in "
f"{(time.time() - self.train_time_start) / 3600:.3f} hours."
)
self.final_eval() # run the final validation with best.pt
if self.args.plots:
self.plot_metrics()
self.run_callbacks("on_train_end")
gc.collect()
torch.cuda.empty_cache()
self.run_callbacks("teardown")
That is all for now; further details will be added in a later update.