mmcv
mmcv的github
mmcv的文章概述
mmcv讲解: mmcv
关于上文一些补充
- 装饰器
- FileClient
关于FileClient:LoadImageFromFile类读取图片时使用了disk后端的FileClient。之前我们读取图片都是直接使用opencv,而mmcv是先通过FileClient读取图片的字节内容,再对字节内容进行解码。
@PIPELINES.register_module()
class LoadImageFromFile(object):  # load an image into memory
    """Pipeline step that reads image bytes via a file client and decodes them.

    This listing is an excerpt: part of ``__call__`` is elided (``...``).

    Args:
        to_float32: Stored on the instance; its use is not shown in this
            excerpt.
        color_type: Passed as ``flag`` to ``mmcv.imfrombytes``.
        file_client_args: Keyword arguments used to build
            ``mmcv.FileClient``. Defaults to the ``disk`` backend.
    """

    def __init__(self,
                 to_float32=False,
                 color_type='color',
                 file_client_args=dict(backend='disk')):
        self.to_float32 = to_float32
        self.color_type = color_type
        # The default is the disk backend. A copy is stored rather than the
        # dict object that was passed in.
        self.file_client_args = file_client_args.copy()
        # The client itself is created lazily on the first call.
        self.file_client = None

    def __call__(self, results):
        """Read and decode the image, then return ``results``."""
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)
        # NOTE(review): `filename` is not defined anywhere in this excerpt;
        # presumably it is derived from `results` in the full source.
        # Read the raw bytes of the image.
        img_bytes = self.file_client.get(filename)
        # Decode the bytes into an image.
        img = mmcv.imfrombytes(img_bytes, flag=self.color_type)
        ...
        return results
mmdetection
在阅读下面之前,确保已经看过register机制和Hook机制
数据集
对于想要在mmdetection上训练自己的数据集,最简单的办法就是将自己的数据集转化成COCO格式,然后修改mmdetection的一部分代码,这部分网上的代码很多,就不一一介绍了。
首先将自己的数据集放在data/coco目录下
定义自己的数据集时,需要新写一个继承CustomDataset的Dataset类,然后重写load_annotations()函数和get_ann_info()函数。
CustomDataset位于mmdet/datasets/custom.py文件内。
官方文档上说,用户如果要使用CustomDataset,需要先将现有数据集转换成MMDetection兼容的格式(COCO格式或中间格式)。但我看了一下底层代码,并没有发现这个限制:只要你的数据格式能和你实现的load_annotations()和get_ann_info()对应上即可。
coco.py中实现了对于coco数据集的读取,将CLASSES更改为自己的数据类别就可以。
tools/train.py
parse_args
1.config:训练配置文件
2.work-dir:训练日志和模型的保存地方
3.auto-scale-lr:开启学习率(lr)自动缩放
4.resume:恢复训练
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import logging
import os
import os.path as osp
from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from mmdet.utils import setup_cache_size_limit_of_dynamo
def parse_args():
    """Parse the command-line arguments of the training script.

    After parsing, the local rank is exported through the ``LOCAL_RANK``
    environment variable unless that variable is already set.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    cli = argparse.ArgumentParser(description='Train a detector')

    cli.add_argument('config', help='train config file path')
    cli.add_argument('--work-dir', help='the dir to save logs and models')
    # `store_true` already defaults to False when the flag is absent.
    cli.add_argument(
        '--amp',
        action='store_true',
        help='enable automatic-mixed-precision training')
    cli.add_argument(
        '--auto-scale-lr',
        action='store_true',
        help='enable automatically scaling LR.')
    # A bare `--resume` yields the constant 'auto'; `--resume PATH` yields
    # the given path; omitting the flag yields None.
    cli.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    cli.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    cli.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # With PyTorch >= 2.0.0, `torch.distributed.launch` passes
    # `--local-rank` to `tools/train.py` instead of `--local_rank`,
    # so both spellings are accepted.
    cli.add_argument('--local_rank', '--local-rank', type=int, default=0)

    parsed = cli.parse_args()
    # Only fill in LOCAL_RANK when the environment has not set it already.
    os.environ.setdefault('LOCAL_RANK', str(parsed.local_rank))
    return parsed
def main():
    """Build a runner from the CLI arguments and config, then train.

    Steps: parse arguments, load the config file, apply command-line
    overrides (work dir, AMP, LR auto-scaling, resume), build the runner
    and call ``runner.train()``.

    Raises:
        RuntimeError: If ``--auto-scale-lr`` is given but the config lacks
            ``auto_scale_lr`` with ``enable`` and ``base_batch_size``.
        AssertionError: If ``--amp`` is given and the optimizer wrapper type
            is neither ``AmpOptimWrapper`` nor ``OptimWrapper``.
    """
    # Read the command-line arguments.
    args = parse_args()

    # Reduce the number of repeated compilations and improve
    # training speed.
    setup_cache_size_limit_of_dynamo()

    # Load the config file, then merge any `--cfg-options` overrides.
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir priority: CLI > value in the config file > config filename.
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        config_stem = osp.splitext(osp.basename(args.config))[0]
        cfg.work_dir = osp.join('./work_dirs', config_stem)

    # Enable automatic-mixed-precision training.
    if args.amp is True:
        optim_wrapper = cfg.optim_wrapper.type
        if optim_wrapper != 'AmpOptimWrapper':
            assert optim_wrapper == 'OptimWrapper', (
                '`--amp` is only supported when the optimizer wrapper type is '
                f'`OptimWrapper` but got {optim_wrapper}.')
            cfg.optim_wrapper.type = 'AmpOptimWrapper'
            cfg.optim_wrapper.loss_scale = 'dynamic'
        else:
            print_log(
                'AMP training is already enabled in your config.',
                logger='current',
                level=logging.WARNING)

    # Enable automatic LR scaling; the config must already carry the keys.
    if args.auto_scale_lr:
        has_auto_scale_cfg = 'auto_scale_lr' in cfg and all(
            key in cfg.auto_scale_lr for key in ('enable', 'base_batch_size'))
        if not has_auto_scale_cfg:
            raise RuntimeError('Can not find "auto_scale_lr" or '
                               '"auto_scale_lr.enable" or '
                               '"auto_scale_lr.base_batch_size" in your'
                               ' configuration file.')
        cfg.auto_scale_lr.enable = True

    # Resume priority: explicit checkpoint path > automatic resume.
    if args.resume is not None:
        cfg.resume = True
        cfg.load_from = None if args.resume == 'auto' else args.resume

    # Use the registry only when the config names a custom runner type;
    # otherwise build the default runner.
    runner = (
        RUNNERS.build(cfg) if 'runner_type' in cfg else Runner.from_cfg(cfg))

    # Start training.
    runner.train()
# Run training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
mmdet 2.x版本依赖mmcv 1.x中的runner;mmdet 3.x版本不再依赖mmcv的runner——mmcv 2.x版本已经没有runner了,新的runner由mmengine提供。
- mmdet2.x----->mmcv1.x
在mmcv的1.x版本中,mmcv/mmcv/runner/epoch_based_runner.py中
def run(self,
        data_loaders: List[DataLoader],
        workflow: List[Tuple[str, int]],
        max_epochs: Optional[int] = None,
        **kwargs) -> None:
    """Execute the workflow until the epoch counter reaches ``max_epochs``.

    Args:
        data_loaders (list[:obj:`DataLoader`]): One dataloader per workflow
            entry, for training and validation.
        workflow (list[tuple]): ``(phase, epochs)`` pairs giving the running
            order, e.g. ``[('train', 2), ('val', 1)]`` runs two training
            epochs, then one validation epoch, repeatedly.
        max_epochs (int, optional): Deprecated override of the epoch limit.
        **kwargs: Forwarded to the per-epoch method of each phase.

    Raises:
        TypeError: If a phase in ``workflow`` is not a string.
        ValueError: If the runner has no method named after a phase.
    """
    assert isinstance(data_loaders, list)
    assert mmcv.is_list_of(workflow, tuple)
    assert len(data_loaders) == len(workflow)

    if max_epochs is not None:
        warnings.warn(
            'setting max_epochs in run is deprecated, '
            'please set max_epochs in runner_config', DeprecationWarning)
        self._max_epochs = max_epochs

    assert self._max_epochs is not None, (
        'max_epochs must be specified during instantiation')

    # The first 'train' entry of the workflow fixes the iteration limit.
    for idx, (mode, epochs) in enumerate(workflow):
        if mode == 'train':
            self._max_iters = self._max_epochs * len(data_loaders[idx])
            break

    work_dir = 'NONE' if self.work_dir is None else self.work_dir
    self.logger.info('Start running, host: %s, work_dir: %s',
                     get_host_info(), work_dir)
    self.logger.info('Hooks will be executed in the following order:\n%s',
                     self.get_hook_info())
    self.logger.info('workflow: %s, max: %d epochs', workflow,
                     self._max_epochs)

    # Hook fired once before anything runs.
    self.call_hook('before_run')

    while self.epoch < self._max_epochs:
        for idx, (mode, epochs) in enumerate(workflow):
            if not isinstance(mode, str):
                raise TypeError(
                    'mode in workflow must be a str, but got {}'.format(
                        type(mode)))
            if not hasattr(self, mode):
                raise ValueError(
                    f'runner has no method named "{mode}" to run an '
                    'epoch')
            # The phase name is also the method name, e.g. self.train().
            epoch_runner = getattr(self, mode)

            for _ in range(epochs):
                if mode == 'train' and self.epoch >= self._max_epochs:
                    break
                # Run one epoch of this phase.
                epoch_runner(data_loaders[idx], **kwargs)

    time.sleep(1)  # wait for some hooks like loggers to finish
    # Hook fired once after everything has run.
    self.call_hook('after_run')
# 训练的代码
def train(self, data_loader, **kwargs):
    """Run one training epoch over ``data_loader``.

    Args:
        data_loader: Iterable of batches; it must also support ``len()``.
        **kwargs: Forwarded to ``run_iter``.
    """
    self.model.train()
    self.mode = 'train'
    self.data_loader = data_loader
    # Recompute the iteration limit from the length of this loader.
    self._max_iters = self._max_epochs * len(self.data_loader)
    self.call_hook('before_train_epoch')
    time.sleep(2)  # Prevent possible deadlock during epoch transition
    for i, data_batch in enumerate(self.data_loader):
        # Store the current batch and its in-epoch index on the runner
        # (presumably so hooks can read them; confirm against the hooks).
        self.data_batch = data_batch
        self._inner_iter = i
        # Hook fired before the training iteration.
        self.call_hook('before_train_iter')
        self.run_iter(data_batch, train_mode=True, **kwargs)
        # Hook fired after the training iteration.
        self.call_hook('after_train_iter')
        del self.data_batch
        self._iter += 1
    self.call_hook('after_train_epoch')
    self._epoch += 1
# run_iter的代码
def run_iter(self, data_batch: Any, train_mode: bool, **kwargs) -> None:
    """Run a single iteration and store its outputs on the runner.

    The step is computed by ``self.batch_processor`` when one is set;
    otherwise by ``self.model.train_step`` (training) or
    ``self.model.val_step`` (validation).

    Args:
        data_batch: One batch taken from the dataloader.
        train_mode: Whether this is a training iteration.
        **kwargs: Forwarded to the batch processor or the model step.

    Raises:
        TypeError: If the step result is not a dict.
    """
    if self.batch_processor is not None:
        outputs = self.batch_processor(
            self.model, data_batch, train_mode=train_mode, **kwargs)
    elif train_mode:
        # Per the accompanying notes, this resolves to the detector's own
        # ``train_step`` (e.g. the single-stage detector's).
        outputs = self.model.train_step(data_batch, self.optimizer,
                                        **kwargs)
    else:
        outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)

    if not isinstance(outputs, dict):
        # The adjacent literals need an explicit separating space, otherwise
        # the message reads `"model.train_step()"and "model.val_step()"`.
        raise TypeError('"batch_processor()" or "model.train_step()" '
                        'and "model.val_step()" must return a dict')

    if 'log_vars' in outputs:
        self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
    self.outputs = outputs
- mmdet3.x----->mmengine
在mmengine中,runner作为loop中的一个对象参与训练过程
@LOOPS.register_module()
class EpochBasedTrainLoop(BaseLoop):
    """Training loop that advances one full epoch at a time.

    Args:
        runner (Runner): The runner this loop reports to.
        dataloader (Dataloader or dict): A dataloader, or a dict from which
            one is built.
        max_epochs (int): Number of epochs to train for.
        val_begin (int): First epoch at which validation may run.
            Defaults to 1.
        val_interval (int): Validate every this many epochs. Defaults to 1.
        dynamic_intervals (List[Tuple[int, int]], optional): Pairs of
            ``(milestone, interval)``; each interval applies once its
            milestone has been reached. Defaults to None.
    """

    def __init__(
            self,
            runner,
            dataloader: Union[DataLoader, Dict],
            max_epochs: int,
            val_begin: int = 1,
            val_interval: int = 1,
            dynamic_intervals: Optional[List[Tuple[int, int]]] = None) -> None:
        super().__init__(runner, dataloader)

        # Reject values such as 2.5 that do not survive the int() cast.
        self._max_epochs = int(max_epochs)
        assert self._max_epochs == max_epochs, \
            f'`max_epochs` should be a integer number, but get {max_epochs}.'
        self._max_iters = self._max_epochs * len(self.dataloader)

        self._epoch = 0
        self._iter = 0
        self.val_begin = val_begin
        self.val_interval = val_interval

        # Hand the dataset meta information to the visualizer when the
        # dataset provides it; otherwise only warn.
        if not hasattr(self.dataloader.dataset, 'metainfo'):
            warnings.warn(
                f'Dataset {self.dataloader.dataset.__class__.__name__} has no '
                'metainfo. ``dataset_meta`` in visualizer will be '
                'None.')
        else:
            self.runner.visualizer.dataset_meta = \
                self.dataloader.dataset.metainfo

        milestones, intervals = calc_dynamic_intervals(
            self.val_interval, dynamic_intervals)
        self.dynamic_milestones = milestones
        self.dynamic_intervals = intervals

    @property
    def max_epochs(self):
        """int: Number of epochs the loop trains for."""
        return self._max_epochs

    @property
    def max_iters(self):
        """int: Number of iterations the loop trains for."""
        return self._max_iters

    @property
    def epoch(self):
        """int: Index of the current epoch."""
        return self._epoch

    @property
    def iter(self):
        """int: Index of the current iteration."""
        return self._iter

    def run(self) -> torch.nn.Module:
        """Train until ``max_epochs`` is reached and return the model."""
        self.runner.call_hook('before_train')

        while self._epoch < self._max_epochs:
            # One epoch of training.
            self.run_epoch()
            self._decide_current_val_interval()

            # Validation runs only when a validation loop exists, the
            # starting epoch has been reached and the interval divides the
            # epoch counter.
            if self.runner.val_loop is None:
                continue
            if (self._epoch >= self.val_begin
                    and self._epoch % self.val_interval == 0):
                self.runner.val_loop.run()

        self.runner.call_hook('after_train')
        return self.runner.model

    def run_epoch(self) -> None:
        """Train over every batch of the dataloader once."""
        self.runner.call_hook('before_train_epoch')
        # Switch the model to training mode.
        self.runner.model.train()

        for batch_idx, batch in enumerate(self.dataloader):
            self.run_iter(batch_idx, batch)

        self.runner.call_hook('after_train_epoch')
        self._epoch += 1

    def run_iter(self, idx, data_batch: Sequence[dict]) -> None:
        """Train on a single mini-batch.

        Args:
            idx: Index of the batch within the current epoch.
            data_batch (Sequence[dict]): Batch of data from dataloader.
        """
        self.runner.call_hook(
            'before_train_iter', batch_idx=idx, data_batch=data_batch)

        # Enable gradient accumulation mode and avoid unnecessary gradient
        # synchronization during gradient accumulation process.
        # outputs should be a dict of loss.
        step_outputs = self.runner.model.train_step(
            data_batch, optim_wrapper=self.runner.optim_wrapper)

        self.runner.call_hook(
            'after_train_iter',
            batch_idx=idx,
            data_batch=data_batch,
            outputs=step_outputs)
        self._iter += 1

    def _decide_current_val_interval(self) -> None:
        """Pick the ``val_interval`` that applies to the upcoming epoch."""
        upcoming_epoch = self.epoch + 1
        slot = bisect.bisect(self.dynamic_milestones, upcoming_epoch)
        self.val_interval = self.dynamic_intervals[slot - 1]