现在工作中目标检测视觉算法基本就锁定Yolo了(以前用SSD,后来是SSD和Yolo混着用),由于每隔一段时间回来看项目老是忘记某些代码当初为什么要这样写或者新开项目就要把Yolo知识点重新梳理一边,所以这次下定决心好好做个笔记,查了很多大神的笔记和视频,对YoloV5的源码写了比较详细的注释,贴出来跟大家分享一下,我本人是偏向于工程应用的,看源码主要是为了帮助我去开发项目,更多的是看别人对源码的理解,所以这个理解对与错有的我也不是很清楚,不对的地方大家留言改正。
篇幅有限,完整的工程文件会放到附件中,大家自行下载,已通过测试,可以直接使用。
Train.py源码注释
"""
Train a YOLOv5 model on a custom dataset
Usage:
数据集 预训练模型权重 img大小
$ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640
"""
# 导入一些常规的库
import argparse
import math
import os
import random
import sys
import time
from copy import deepcopy
from datetime import datetime
from pathlib import Path
# 导入训练需要的库
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp # 提高模型的训练速度
from torch.nn.parallel import DistributedDataParallel as DDP # 单机多卡
from torch.optim import SGD, Adam, lr_scheduler # 几个优化器
from tqdm import tqdm # 进度条显示
FILE = Path(__file__).resolve() # 解析当前文件的路径
ROOT = FILE.parents[0] # 当前工程文件的根目录
if str(ROOT) not in sys.path:
sys.path.append(str(ROOT)) # add ROOT to PATH,根目录不在的话就加入到环境变量中,
ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative,root的相对路径
import val # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.datasets import create_dataloader
from utils.downloads import attempt_download
from utils.general import (LOGGER, NCOLS, check_dataset, check_file, check_git_status, check_img_size,
check_requirements, check_suffix, check_yaml, colorstr, get_latest_run, increment_path,
init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods,
one_cycle, print_args, print_mutation, strip_optimizer)
from utils.loggers import Loggers
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import plot_evolve, plot_labels
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, select_device, torch_distributed_zero_first
'''
以下三个变量用于分布式训练
一般就一个cpu或者gpu用不上这些参数
'''
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
def train(hyp, # path/to/hyp.yaml or hyp dictionary
opt,
device,
callbacks
):
# 从opt中拿到参数
save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze, = \
Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze
# Directories
w = save_dir / 'weights' # weights dir,保存权重文件
(w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir,判断weights文件夹是否存在
last, best = w / 'last.pt', w / 'best.pt' # last.pt最后一轮的权重文件,best.pt最好一轮的权重文件
# Hyperparameters
# 加载训练过程中使用到的超参数
if isinstance(hyp, str):
with open(hyp, errors='ignore') as f:
hyp = yaml.safe_load(f) # load hyps dict
LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))
# Save run settings
# 运行过程中的配置环境保存下来
with open(save_dir / 'hyp.yaml', 'w') as f: # 保存超参数
yaml.safe_dump(hyp, f, sort_keys=False)
with open(save_dir / 'opt.yaml', 'w') as f: # 保存命令行中的参数
yaml.safe_dump(vars(opt), f, sort_keys=False)
data_dict = None
# Loggers
if RANK in [-1, 0]:
loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance,训练过程可视化
if loggers.wandb:
data_dict = loggers.wandb.data_dict
if resume:
weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp
# Register actions
# 遍历日志记录器所有的方法
for k in methods(loggers):
callbacks.register_action(k, callback=getattr(loggers, k))
# Config
plots = not evolve # create plots,设置是否将训练过程中的图标画出来
cuda = device.type != 'cpu' # 判断电脑是否支持cuda
init_seeds(1 + RANK) # 初始化随机化种子
with torch_distributed_zero_first(LOCAL_RANK): # 判断是否分布式训练
data_dict = data_dict or check_dataset(data) # check if None,检查数据集信息
train_path, val_path = data_dict['train'], data_dict['val'] # 从数据集中取出训练集和测试集的路径
nc = 1 if single_cls else int(data_dict['nc']) # number of classes,数据集中一共有多少类
names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names,取出所有80个类名
assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check,判断80个类型和预测类型的数量是否一致
is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt') # COCO dataset,判断验证集是否是val2017.txt
# Model,以下跟模型加载有关
check_suffix(weights, '.pt') # check weights,传进来的weights参数是否以.pt结尾
pretrained = weights.endswith('.pt')
if pretrained:
with torch_distributed_zero_first(LOCAL_RANK):
weights = attempt_download(weights) # download if not found locally,检测有没有,没有就去下载
# 加载预训练模型权重
ckpt = torch.load(weights, map_location=device) # load checkpoint,加载权重文件
# 新建一个模型,比如自己定义的模型跟原始模型不一样的情况
model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create,创建模型
exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys
csd = ckpt['model'].float().state_dict() # checkpoint state_dict as FP32
csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect
model.load_state_dict(csd, strict=False) # load
LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}') # report
else:
# 通道
model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create
# 配置文件 标签数量
# Freeze
# 冻结模型中的某些层
freeze = [f'model.{x}.' for x in range(freeze)] # layers to freeze
for k, v in model.named_parameters():
v.requires_grad = True # train all layers
if any(x in k for x in freeze):
LOGGER.info(f'freezing {k}')
v.requires_grad = False
# Image size
gs = max(int(model.stride.max()), 32) # grid size (max stride),获取最高层的图形输入特征,相比较于原始图像缩小了多少倍
imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple,检查输入图像的尺寸满不满足32的倍数
# Batch size
# batch size是-1的话会自动选择一个合适的batch大小
if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size
batch_size = check_train_batch_size(model, imgsz)
# Optimizer
# 创建深度学习的优化器
nbs = 64 # nominal batch size,名义上的batchsize,比如gpu只支持最大8的batchsize,想要实现64的batchsize的效果,就送8批8的batchsize
accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing,存放累计次数
hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay,对weight_decay超参数进行缩放,可以防止训练过程中过拟合
LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")
# 以下把模型的参数划分到三个不同的组中
g0, g1, g2 = [], [], [] # optimizer parameter groups,定义三个列表来存储参数
# 遍历网络中的所有层
for v in model.modules():
if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias,如果有偏置则放到g2
g2.append(v.bias)
if isinstance(v, nn.BatchNorm2d): # weight (no decay)如果是bn,就放到g0层
g0.append(v.weight)
elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): # weight (with decay)
g1.append(v.weight)
# 判断使用优化器的类型,没有指定的话就会用sgd
if opt.adam:
optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum
else:
# 随机梯度下降算法
optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) # 把g0加入到优化器中去
# 把g0和g1加到优化器中去
optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']}) # add g1 with weight_decay,权重衰减加到g1层,这就是为什么到对参数做分组的原因
optimizer.add_param_group({'params': g2}) # add g2 (biases)
# 打印用到的优化器,需要权重衰减的w有哪些,不需要权重衰减的w有哪些
LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
f"{len(g0)} weight, {len(g1)} weight (no decay), {len(g2)} bias")
del g0, g1, g2 # 创建完后g变量没什么用了,就删掉
# Scheduler,模型训练过程中,学习率变化的策略,随着训练的深入,减少学习率有利于找到局部最优解
if opt.linear_lr:
lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear,线性的变化策略
else:
lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'],
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs)
# EMA
# 指数平均的算法,能在每次计算参数的时候,考虑历史值对参数的影响
ema = ModelEMA(model) if RANK in [-1, 0] else None
# Resume,从预训练的权重文件中加载一些信息
start_epoch, best_fitness = 0, 0.0
if pretrained:
# Optimizer,如果有优化器的信息,就把优化器的权重加载进来,,当时训练的模型的拟合程度加载进来
if ckpt['optimizer'] is not None:
optimizer.load_state_dict(ckpt['optimizer'])
best_fitness = ckpt['best_fitness']
# EMA
if ema and ckpt.get('ema'):
ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
ema.updates = ckpt['updates']
# Epochs,根据epochs的值来决定从哪里开始训练,比如100轮,训练完了,则是-1,-1+1=0就从头开始训练,如果是第51轮,就51+1从第52轮开始训练
start_epoch = ckpt['epoch'] + 1
if resume:
assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
if epochs < start_epoch:
LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
epochs += ckpt['epoch'] # finetune additional epochs
del ckpt, csd
# DP mode,是否使用多张显卡,多张显卡会执行数据并行操作
if cuda and RANK == -1 and torch.cuda.device_count() > 1:
LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
model = torch.nn.DataParallel(model)
# SyncBatchNorm,跟分布式训练相关
if opt.sync_bn and cuda and RANK != -1:
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
LOGGER.info('Using SyncBatchNorm()')
# Trainloader,加载训练集的数据
train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK,
workers=workers, image_weights=opt.image_weights, quad=opt.quad,
prefix=colorstr('train: '), shuffle=True)
mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class,计算标签的最大类别号
nb = len(train_loader) # number of batches,计算一共有多少个batch
assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'
# Process 0
# 加载验证集的数据
if RANK in [-1, 0]:
val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
workers=workers, pad=0.5,
prefix=colorstr('val: '))[0]
if not resume:
labels = np.concatenate(dataset.labels, 0) # 取出数据集所有的标签
# c = torch.tensor(labels[:, 0]) # classes
# cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency
# model._initialize_biases(cf.to(device))
if plots:
plot_labels(labels, names, save_dir) # 绘制标签的分布图
# Anchors
if not opt.noautoanchor:
check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # 检查配置文件中的anchor是不是合适
model.half().float() # pre-reduce anchor precision
# callbacks.run,去查看类中有没有这个方法,有的话就执行,没有就不执行
callbacks.run('on_pretrain_routine_end')
# DDP mode,多卡训练内容
if cuda and RANK != -1:
model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)
# Model attributes
nl = de_parallel(model).model[-1].nl # number of detection layers (to scale hyps),模型中检测层的数量
# 利用层数对下面几个超参数进行缩放
# yolov5损失值包含3个部分,一个是框回归的损失,一部分是类别损失,一部分是置信度损失,以下三个超参数就是这三个损失的系数
hyp['box'] *= 3 / nl # scale to layers
hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers
hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl # scale to image size and layers
hyp['label_smoothing'] = opt.label_smoothing # 做标签平滑的超参数
# 类别数,超参数,类别权重,标签名,四个东西写入到model变量里
model.nc = nc # attach number of classes to model
model.hyp = hyp # attach hyperparameters to model
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights,数据一般是不均衡的,对数据给个权重
model.names = names
# Start training
# 开始训练,第一部分初始化动作
t0 = time.time()
# warmup的迭代次数
nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations)
# nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training
last_opt_step = -1 # 上一次更新参数的值
maps = np.zeros(nc) # mAP per class,存放训练过程中每一类的map值
results = (0, 0, 0, 0, 0, 0, 0) # 存放P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
scheduler.last_epoch = start_epoch - 1
scaler = amp.GradScaler(enabled=cuda) # 使用自动混合精度去训练
stopper = EarlyStopping(patience=opt.patience) # 训练过程中连续几轮模型没有提升就结束
compute_loss = ComputeLoss(model) # init loss class,定义损失函数
LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
f"Logging results to {colorstr('bold', save_dir)}\n"
f'Starting training for {epochs} epochs...')
# 遍历训练过程
for epoch in range(start_epoch, epochs): # epoch
model.train() # 模型切换到训练状态
# Update image weights (optional, single-GPU only),更新图片的权重,给每一张图片分配采样权重
# 就是说训练过程中难识别的物体就给高权重,多训练几次
if opt.image_weights:
cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights
iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights
dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # 根据图片的权重,进行重采样
# Update mosaic border (optional)
# b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
# dataset.mosaic_border = [b - imgsz, -b] # height, width borders
mloss = torch.zeros(3, device=device) # mean losses,初始化变量,存放损失值
if RANK != -1:
train_loader.sampler.set_epoch(epoch)
# 下面的一对bar用来在训练过程中显示进度条
pbar = enumerate(train_loader)
# 第几轮 gpu显存占用 类别损失 平均每批样本中标注框的个数
LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
# 框损失 置信度损失 图片尺寸
if RANK in [-1, 0]:
pbar = tqdm(pbar, total=nb, ncols=NCOLS, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar
optimizer.zero_grad() # 在取数前先把梯度归零
for i, (imgs, targets, paths, _) in pbar: # batch ,imgs图片数据,targets标注框,paths图片路径
ni = i + nb * epoch # number integrated batches (since train start),一共训练了多少批数据
imgs = imgs.to(device, non_blocking=True).float() / 255 # uint8 to float32, 0-255 to 0.0-1.0,图片移动到gpu上进行归一化的操作
# Warmup,是一种训练技巧,刚开始用比较小的学习率,慢慢的提升到高的学习率
if ni <= nw:
xi = [0, nw] # x interp
# compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou)
accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
for j, x in enumerate(optimizer.param_groups): # 遍历优化器中的所有参数组
# bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
if 'momentum' in x:
x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
# Multi-scale,多尺度训练,训练过程中随机得到一个比例因子
if opt.multi_scale:
sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size
sf = sz / max(imgs.shape[2:]) # scale factor
if sf != 1:
ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple)
imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
# Forward,前向传播
with amp.autocast(enabled=cuda):
pred = model(imgs) # forward,图像传给模型得到预测框
loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size,利用预测框和标注框得到损失值
if RANK != -1:
loss *= WORLD_SIZE # gradient averaged between devices in DDP mode
if opt.quad:
loss *= 4.
# Backward,反向传播
scaler.scale(loss).backward()
# Optimize,更新参数
# ni表示取到第几批数据了
# last_opt_step上一次更新参数的批次
if ni - last_opt_step >= accumulate:
scaler.step(optimizer) # optimizer.step
scaler.update()
optimizer.zero_grad()
if ema:
ema.update(model)
last_opt_step = ni
# Log
if RANK in [-1, 0]:
mloss = (mloss * i + loss_items) / (i + 1) # update mean losses
mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) # 前三批会有额外保存的log记录效果
# end batch
# Scheduler
lr = [x['lr'] for x in optimizer.param_groups] # for loggers
scheduler.step() # 根据学习率变化策略更一下学习率
if RANK in [-1, 0]:
# mAP,在验证集上验证
callbacks.run('on_train_epoch_end', epoch=epoch)
ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights']) # 给ema添加属性
final_epoch = (epoch + 1 == epochs) or stopper.possible_stop # 判断当前是不是最终一轮
if not noval or final_epoch: # Calculate mAP,如果不是最终轮,会把这一轮的模型在验证集上跑一遍,得到results和maps
results, maps, _ = val.run(data_dict,
batch_size=batch_size // WORLD_SIZE * 2,
imgsz=imgsz,
model=ema.ema,
single_cls=single_cls,
dataloader=val_loader,
save_dir=save_dir,
plots=False,
callbacks=callbacks,
compute_loss=compute_loss)
# Update best mAP
fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95],定义拟合度指标
# 拟合度情况指标
if fi > best_fitness:
best_fitness = fi
log_vals = list(mloss) + list(results) + lr
callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)
# Save model,保存模型
if (not nosave) or (final_epoch and not evolve): # if save
ckpt = {'epoch': epoch,
'best_fitness': best_fitness,
'model': deepcopy(de_parallel(model)).half(),
'ema': deepcopy(ema.ema).half(),
'updates': ema.updates,
'optimizer': optimizer.state_dict(),
'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None,
'date': datetime.now().isoformat()}
# Save last, best and delete
torch.save(ckpt, last)
if best_fitness == fi:
torch.save(ckpt, best)
if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
torch.save(ckpt, w / f'epoch{epoch}.pt')
del ckpt
callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)
# Stop Single-GPU,需不需要预先停止训练
if RANK == -1 and stopper(epoch=epoch, fitness=fi):
break
# Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
# stop = stopper(epoch=epoch, fitness=fi)
# if RANK == 0:
# dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks
# Stop DPP
# with torch_distributed_zero_first(RANK):
# if stop:
# break # must break all DDP ranks
# end epoch
# end training
if RANK in [-1, 0]:
LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
for f in last, best:
if f.exists():
strip_optimizer(f) # strip optimizers
if f is best:
LOGGER.info(f'\nValidating {f}...')
results, _, _ = val.run(data_dict,
batch_size=batch_size // WORLD_SIZE * 2,
imgsz=imgsz,
model=attempt_load(f, device).half(),
iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65
single_cls=single_cls,
dataloader=val_loader,
save_dir=save_dir,
save_json=is_coco,
verbose=True,
plots=True,
callbacks=callbacks,
compute_loss=compute_loss) # val best model with plots
if is_coco:
callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)
callbacks.run('on_train_end', last, best, plots, epoch, results)
LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")
torch.cuda.empty_cache()
return results
# 以下方法把配置项加载进来
def parse_opt(known=False):
parser = argparse.ArgumentParser()
# 权重
parser.add_argument('--weights', type=str, default=ROOT / 'pretrained/yolov5s.pt', help='initial weights path')
# 模型的配置文件
parser.add_argument('--cfg', type=str, default=ROOT / 'models/yolov5s.yaml', help='model.yaml path')
# 加载的数据
parser.add_argument('--data', type=str, default=ROOT / 'data/data.yaml', help='dataset.yaml path')
parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch.yaml', help='hyperparameters path')
parser.add_argument('--epochs', type=int, default=300)
parser.add_argument('--batch-size', type=int, default=4, help='total batch size for all GPUs, -1 for autobatch')
parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)')
#
parser.add_argument('--rect', action='store_true', help='rectangular training')
parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
parser.add_argument('--noval', action='store_true', help='only validate final epoch')
# 自动计算anchor
parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
# 自动计算超参数,很花算力和时间
parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
# dataloader中数据的缓存
parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
# 可以给某些数据集设置权重,比较难训练的就设置的高一些,这样模型在训练的时候会有意的多挑一些这类数据
parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
# parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
parser.add_argument('--multi-scale', default=True, help='vary img-size +/- 50%%')
parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
# 多卡训练参数
parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
# dataloader有多少个小的进程
parser.add_argument('--workers', type=int, default=0, help='max dataloader workers (per RANK in DDP mode)')
# 模型保存输出的一些东西的路径
parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
parser.add_argument('--name', default='exp', help='save to project/name')
parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
parser.add_argument('--quad', action='store_true', help='quad dataloader')
parser.add_argument('--linear-lr', action='store_true', help='linear LR')
parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
parser.add_argument('--freeze', type=int, default=0, help='Number of layers to freeze. backbone=10, all=24')
parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
# Weights & Biases arguments
parser.add_argument('--entity', default=None, help='W&B: Entity')
parser.add_argument('--upload_dataset', action='store_true', help='W&B: Upload dataset as artifact table')
parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')
opt = parser.parse_known_args()[0] if known else parser.parse_args()
return opt
def main(opt, callbacks=Callbacks()):
# Checks
# 校验工作
if RANK in [-1, 0]: # rank是-1
print_args(FILE.stem, opt) # 打印传入的参数信息
check_git_status() # 检查yolov5在github的代码仓库是否更新
check_requirements(exclude=['thop']) # 检查requirement文件中的库有没有安装
# Resume参数表示从中断处恢复
# 根据是否传入resume执行不同的操作
if opt.resume and not check_wandb_resume(opt) and not opt.evolve: # resume an interrupted run
ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:
opt = argparse.Namespace(**yaml.safe_load(f)) # replace
opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate
LOGGER.info(f'Resuming training from {ckpt}')
else:
# 检查一堆需要的文件,data数据,cfg配置,hyp超参数(在data/hyps中),weights预训练权重,project训练结果保存的路径
opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) # checks
# 判断cfg和weight是否为空
assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
if opt.evolve: # 是否传入evolve参数,改变保存路径
opt.project = str(ROOT / 'runs/evolve')
opt.exist_ok, opt.resume = opt.resume, False # pass resume to exist_ok and disable resume
opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))
# DDP mode,判断是否使用ddp训练方法
device = select_device(opt.device, batch_size=opt.batch_size) # 选择使用cpu还是gpu
if LOCAL_RANK != -1: # 加入使用分布式的话会额外执行一些操作
assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
assert not opt.evolve, '--evolve argument is not compatible with DDP training'
torch.cuda.set_device(LOCAL_RANK)
device = torch.device('cuda', LOCAL_RANK)
dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
# Train,正式开始训练
if not opt.evolve: # 是否用到evolve参数,没有用到,则调用train训练
train(opt.hyp, opt, device, callbacks)
if WORLD_SIZE > 1 and RANK == 0:
LOGGER.info('Destroying process group... ')
dist.destroy_process_group()
# Evolve hyperparameters (optional),可以自动调整超参数
else:
# Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3)
'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf)
'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1
'weight_decay': (1, 0.0, 0.001), # optimizer weight decay
'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok)
'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum
'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr
'box': (1, 0.02, 0.2), # box loss gain
'cls': (1, 0.2, 4.0), # cls loss gain
'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight
'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels)
'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight
'iou_t': (0, 0.1, 0.7), # IoU training threshold
'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold
'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore)
'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5)
'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction)
'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction)
'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction)
'degrees': (1, 0.0, 45.0), # image rotation (+/- deg)
'translate': (1, 0.0, 0.9), # image translation (+/- fraction)
'scale': (1, 0.0, 0.9), # image scale (+/- gain)
'shear': (1, 0.0, 10.0), # image shear (+/- deg)
'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001
'flipud': (1, 0.0, 1.0), # image flip up-down (probability)
'fliplr': (0, 0.0, 1.0), # image flip left-right (probability)
'mosaic': (1, 0.0, 1.0), # image mixup (probability)
'mixup': (1, 0.0, 1.0), # image mixup (probability)
'copy_paste': (1, 0.0, 1.0)} # segment copy-paste (probability)
with open(opt.hyp, errors='ignore') as f:
hyp = yaml.safe_load(f) # load hyps dict
if 'anchors' not in hyp: # anchors commented in hyp.yaml
hyp['anchors'] = 3
opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir) # only val/save final epoch
# ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
if opt.bucket:
os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}') # download evolve.csv if exists
for _ in range(opt.evolve): # generations to evolve
if evolve_csv.exists(): # if evolve.csv exists: select best hyps and mutate
# Select parent(s)
parent = 'single' # parent selection method: 'single' or 'weighted'
x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
n = min(5, len(x)) # number of previous results to consider
x = x[np.argsort(-fitness(x))][:n] # top n mutations
w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0)
if parent == 'single' or len(x) == 1:
# x = x[random.randint(0, n - 1)] # random selection
x = x[random.choices(range(n), weights=w)[0]] # weighted selection
elif parent == 'weighted':
x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination
# Mutate
mp, s = 0.8, 0.2 # mutation probability, sigma
npr = np.random
npr.seed(int(time.time()))
g = np.array([meta[k][0] for k in hyp.keys()]) # gains 0-1
ng = len(meta)
v = np.ones(ng)
while all(v == 1): # mutate until a change occurs (prevent duplicates)
v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300)
hyp[k] = float(x[i + 7] * v[i]) # mutate
# Constrain to limits
for k, v in meta.items():
hyp[k] = max(hyp[k], v[1]) # lower limit
hyp[k] = min(hyp[k], v[2]) # upper limit
hyp[k] = round(hyp[k], 5) # significant digits
# Train mutation
results = train(hyp.copy(), opt, device, callbacks)
# Write mutation results
print_mutation(results, hyp.copy(), save_dir, opt.bucket)
# Plot results
plot_evolve(evolve_csv)
LOGGER.info(f'Hyperparameter evolution finished\n'
f"Results saved to {colorstr('bold', save_dir)}\n"
f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')
def run(**kwargs):
# Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
opt = parse_opt(True)
for k, v in kwargs.items():
setattr(opt, k, v)
main(opt)
# python train.py --data mask_data.yaml --cfg mask_yolov5s.yaml --weights pretrained/yolov5s.pt --epoch 100 --batch-size 4 --device cpu
# python train.py --data mask_data.yaml --cfg mask_yolov5l.yaml --weights pretrained/yolov5l.pt --epoch 100 --batch-size 4
# python train.py --data mask_data.yaml --cfg mask_yolov5m.yaml --weights pretrained/yolov5m.pt --epoch 100 --batch-size 4
if __name__ == "__main__":
opt = parse_opt() # 解析输入的参数
main(opt)