1.前言
最近在研究maptr,因为代码实在复杂,不记录一下很容易就忘记,因此来开个坑写一写自己的研究历程。
2.运行
学习一个项目的必经之路首先是要将这个项目运行起来,建议完全按照官方的安装环境的方式,避免发生问题,装完环境后,按照官方的命令运行即可。那么如何进行debug来逐行进行查看呢?我使用的是pycharm进行debug,可以分为如下两个步骤
- pycharm连接服务器
这一步网上有很多讲解,可以参考如下博客:
PyCharm连接远程服务器配置过程_pycharm将代码同步到远程服务器-CSDN博客
- 链接数据集
运行时,有可能报错说找不到data数据集,或者出现路径错误,这就需要我们进行地址映射相关操作:
pycharm:远程连接服务器调试代码(保姆级详细步骤)_pycharm远程连接服务器跑代码-CSDN博客
3.网络结构
在正式开始调试之前,我们非常有必要熟悉一下网络的整体结构
从图上可以看出整体的关键就两步:1)利用 Map Encoder提取BEV feature;2)将query 和 BEV feature输入Map Decoder 输出预测结果。
4.模型配置文件
本文采用的是nano模型进行测试。几个模型之间的不同点主要在于bev_query的大小、输入数据的类型以及FPN的多尺度特征个数。配置文件为projects/configs/maptr/maptr_nano_r18_110e.py,模型的网络结构在此进行定义;运行时,首先会对下面的模块进行注册,从上到下基本上就是forward的步骤了。
# Base configs merged into this one (paths are relative to this file).
_base_ = [
    '../datasets/custom_nus-3d.py',  # custom dataset settings
    '../_base_/default_runtime.py',  # default runtime settings
]

# Plugin settings: the training script imports the package named by plugin_dir.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Perception range and voxel size.
# Presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- TODO confirm.
point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4]

# Image normalisation.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}

# Object class names.
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

# Map element classes.
map_classes = ['divider', 'ped_crossing', 'boundary']
fixed_ptsnum_per_gt_line = 20    # fixed number of points per ground-truth line
fixed_ptsnum_per_pred_line = 20  # fixed number of points per predicted line
eval_use_same_gt_sample_num_flag = True  # evaluate with the same GT sample count
num_map_classes = len(map_classes)       # number of map classes

# Input modalities.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': True,
}

_dim_ = 256             # model (embedding) dimension
_pos_dim_ = _dim_ // 2  # positional-encoding dimension
_ffn_dim_ = _dim_ * 2   # feed-forward dimension
# NOTE(review): the name suggests "number of feature levels" rather than a
# transformer layer count -- confirm where it is consumed.
_num_levels_ = 1
bev_h_ = 80       # BEV grid height
bev_w_ = 40       # BEV grid width
queue_length = 1  # frames per sequence
# Model definition (only the image backbone is shown in this excerpt).
model = {
    'type': 'MapTR',
    'use_grid_mask': True,
    'video_test_mode': False,
    # pretrained weights for the image branch
    'pretrained': {'img': '/root/autodl-fs/MapTR1/ckpts/resnet18-f37072fd.pth'},
    'img_backbone': {
        'type': 'ResNet',
        'depth': 18,
        'num_stages': 4,
        'out_indices': (3,),
        'frozen_stages': -1,
        'norm_cfg': {'type': 'BN', 'requires_grad': True},
        'norm_eval': False,
        'style': 'pytorch',
    },
    # remaining model settings are elided here
}
# Dataset settings.
dataset_type = 'CustomNuScenesLocalMapDataset'
data_root = '/root/autodl-fs/MapTR1/data/nuscenes/'
file_client_args = {'backend': 'disk'}

train_pipeline = [
    # preprocessing / augmentation steps are elided here
]
test_pipeline = [
    # preprocessing / augmentation steps are elided here
]

data = {
    'samples_per_gpu': 6,
    'workers_per_gpu': 2,
    # training split
    'train': {
        'type': dataset_type,
        'data_root': data_root,
        'ann_file': f'{data_root}nuscenes_infos_temporal_train.pkl',
        'pipeline': train_pipeline,
        'classes': class_names,
        'modality': input_modality,
        'test_mode': False,
        'use_valid_flag': True,
        'bev_size': (bev_h_, bev_w_),
        'pc_range': point_cloud_range,
        'fixed_ptsnum_per_line': fixed_ptsnum_per_gt_line,
        'eval_use_same_gt_sample_num_flag': eval_use_same_gt_sample_num_flag,
        'padding_value': -10000,
        'map_classes': map_classes,
        'queue_length': queue_length,
        'box_type_3d': 'LiDAR',
    },
    # validation split (one sample per GPU)
    'val': {
        'type': dataset_type,
        'data_root': data_root,
        'ann_file': f'{data_root}nuscenes_infos_temporal_val.pkl',
        'map_ann_file': f'{data_root}nuscenes_map_anns_val.json',
        'pipeline': test_pipeline,
        'bev_size': (bev_h_, bev_w_),
        'pc_range': point_cloud_range,
        'fixed_ptsnum_per_line': fixed_ptsnum_per_gt_line,
        'eval_use_same_gt_sample_num_flag': eval_use_same_gt_sample_num_flag,
        'padding_value': -10000,
        'map_classes': map_classes,
        'classes': class_names,
        'modality': input_modality,
        'samples_per_gpu': 1,
    },
    # test split (reuses the validation annotation files)
    'test': {
        'type': dataset_type,
        'data_root': data_root,
        'ann_file': f'{data_root}nuscenes_infos_temporal_val.pkl',
        'map_ann_file': f'{data_root}nuscenes_map_anns_val.json',
        'pipeline': test_pipeline,
        'bev_size': (bev_h_, bev_w_),
        'pc_range': point_cloud_range,
        'fixed_ptsnum_per_line': fixed_ptsnum_per_gt_line,
        'eval_use_same_gt_sample_num_flag': eval_use_same_gt_sample_num_flag,
        'padding_value': -10000,
        'map_classes': map_classes,
        'classes': class_names,
        'modality': input_modality,
    },
    # sampler types handed to the dataloader builder
    'shuffler_sampler': {'type': 'DistributedGroupSampler'},
    'nonshuffler_sampler': {'type': 'DistributedSampler'},
}
# Optimizer: AdamW, with the image backbone trained at 0.1x the base lr.
optimizer = {
    'type': 'AdamW',
    'lr': 4e-3,
    'paramwise_cfg': {
        'custom_keys': {
            'img_backbone': {'lr_mult': 0.1},
        },
    },
    'weight_decay': 0.01,
}

# Gradient clipping by L2 norm.
optimizer_config = {'grad_clip': {'max_norm': 50, 'norm_type': 2}}

# Cosine-annealing schedule with a linear warm-up.
lr_config = {
    'policy': 'CosineAnnealing',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 1.0 / 3,
    'min_lr_ratio': 1e-3,
}
# NOTE(review): the config file name ends in "110e" -- confirm 100 is intended.
total_epochs = 100

# Evaluate every 2 epochs with the chamfer metric.
evaluation = {'interval': 2, 'pipeline': test_pipeline, 'metric': 'chamfer'}
runner = {'type': 'EpochBasedRunner', 'max_epochs': total_epochs}

# Log every 50 iterations to text and TensorBoard.
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        {'type': 'TensorboardLoggerHook'},
    ],
}

fp16 = {'loss_scale': 512.}
checkpoint_config = {'interval': 5}  # save a checkpoint every 5 epochs
以上是对配置文件的注释,其中包括了模型配置、数据集配置、优化器配置、学习率配置、训练周期配置等。每个配置项都有相应的注释说明其作用和含义。
5.forward流程
能够进行debug后,就可以逐行代码进行查看变量的shape了,由于该项目涉及了很多模块,而且是用openmmlab实现的,刚接触时会有点绕,于是通过多次调试,我记录了推理的大致流程,基本上可以按下面的数字依次进行。
5.1构建模型
1.autodl-fs/MapTR1/tools/train.py
def main():
    """Training entry point (first part): load the config and build the model.

    Steps shown here: parse the CLI arguments and load/merge the config,
    import custom and plugin modules so the registries are updated, resolve
    ``work_dir`` / ``resume_from`` / ``gpu_ids``, optionally initialise
    distributed training, set up the logger and the ``meta`` dict, seed the
    random generators, then build the model and initialise its weights.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # Import the modules listed under `custom_imports`, if any.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])

    # Import the plugin package; the registries are updated as a side effect.
    if hasattr(cfg, 'plugin'):
        if cfg.plugin:
            import importlib
            if hasattr(cfg, 'plugin_dir'):
                plugin_dir = cfg.plugin_dir
                _module_dir = os.path.dirname(plugin_dir)
            else:
                # Fall back to the directory containing the config file.
                _module_dir = os.path.dirname(args.config)
            # Turn a relative '/'-separated directory into a dotted module
            # path, e.g. 'projects/mmdet3d_plugin/' -> 'projects.mmdet3d_plugin'.
            # NOTE: an absolute path (leading '/') or a different path
            # separator does not produce an importable module name.
            _module_dir = _module_dir.split('/')
            _module_path = '.'.join(_module_dir)
            print(_module_path)
            plg_lib = importlib.import_module(_module_path)

            from projects.mmdet3d_plugin.bevformer.apis.train import custom_train_model

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    # Only resume when the given checkpoint path is an existing file.
    if args.resume_from is not None and osp.isfile(args.resume_from):
        cfg.resume_from = args.resume_from

    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW':
        cfg.optimizer['type'] = 'AdamW2'  # fix bug in Adamw

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        # re-set gpu_ids with distributed training mode
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # dump config
    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    # specify logger name, if we still use 'mmdet', the output info will be
    # filtered and won't be saved in the log_file
    # TODO: ugly workaround to judge whether we are training det or seg model
    if cfg.model.type in ['EncoderDecoder3D']:
        logger_name = 'mmseg'
    else:
        logger_name = 'mmdet'
    logger = get_root_logger(
        log_file=log_file, log_level=cfg.log_level, name=logger_name)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info
    meta['config'] = cfg.pretty_text

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    # The seed (possibly None) is recorded on the config and in meta either way.
    cfg.seed = args.seed
    meta['seed'] = args.seed
    meta['exp_name'] = osp.basename(args.config)

    # Build the model from the config and initialise its weights.
    model = build_model(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg'))
    model.init_weights()
......
当执行到构建模型这一步时,就会调用mmdetection3d中的builder.py进行模型的构建。(mmdetection3d在3d领域检测,分割等任务这一块真的很厉害)
2.miniconda3/envs/maptr/lib/python3.8/site-packages/mmdet3d/models/builder.py
def build_model(cfg, train_cfg=None, test_cfg=None):
    """A function wrapper for building 3D detector or segmentor according to
    cfg.

    Dispatches on ``cfg.type``: ``'EncoderDecoder3D'`` goes to
    ``build_segmentor``, everything else to ``build_detector``.

    Should be deprecated in the future.
    """
    if cfg.type in ['EncoderDecoder3D']:
        return build_segmentor(cfg, train_cfg=train_cfg, test_cfg=test_cfg)
    else:
        return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg)
......
# 会在上一步执行后转入下面函数构建detector
def build_detector(cfg, train_cfg=None, test_cfg=None):
    """Build detector.

    Args:
        cfg: Model config; looked up with ``cfg.get`` and passed to
            ``DETECTORS.build``.
        train_cfg: Deprecated outer training config; a ``UserWarning`` is
            issued when it is given.
        test_cfg: Deprecated outer testing config; a ``UserWarning`` is
            issued when it is given.

    Returns:
        Whatever ``DETECTORS.build`` returns for ``cfg``.

    Raises:
        AssertionError: If ``train_cfg`` (or ``test_cfg``) is given both as an
            argument and inside ``cfg``.
    """
    if train_cfg is not None or test_cfg is not None:
        warnings.warn(
            'train_cfg and test_cfg is deprecated, '
            'please specify them in model', UserWarning)
    # Each of the two configs may come from at most one place.
    assert cfg.get('train_cfg') is None or train_cfg is None, \
        'train_cfg specified in both outer field and model field '
    assert cfg.get('test_cfg') is None or test_cfg is None, \
        'test_cfg specified in both outer field and model field '
    return DETECTORS.build(
        cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg))
3.miniconda3/envs/maptr/lib/python3.8/site-packages/mmcv/utils/registry.py
def build(self, *args, **kwargs):
    """Forward all arguments to ``self.build_func`` with ``registry=self`` added."""
    return self.build_func(*args, **kwargs, registry=self)
......
def build_from_cfg(cfg, registry, default_args=None):
    """Build a module from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        registry (:obj:`Registry`): The registry to search the type from.
        default_args (dict, optional): Default initialization arguments;
            only fills keys that ``cfg`` does not already have.

    Returns:
        object: The constructed object.

    Raises:
        TypeError: If ``cfg``, ``registry`` or ``default_args`` has the wrong
            type, or "type" is neither a str nor a class.
        KeyError: If no "type" is given, or it is not found in ``registry``.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        if default_args is None or 'type' not in default_args:
            raise KeyError(
                '`cfg` or `default_args` must contain the key "type", '
                f'but got {cfg}\n{default_args}')
    if not isinstance(registry, Registry):
        raise TypeError('registry must be an mmcv.Registry object, '
                        f'but got {type(registry)}')
    if not (isinstance(default_args, dict) or default_args is None):
        raise TypeError('default_args must be a dict or None, '
                        f'but got {type(default_args)}')

    # Work on a copy so the caller's config is not modified.
    args = cfg.copy()

    if default_args is not None:
        for name, value in default_args.items():
            args.setdefault(name, value)

    obj_type = args.pop('type')
    if isinstance(obj_type, str):
        obj_cls = registry.get(obj_type)
        if obj_cls is None:
            raise KeyError(
                f'{obj_type} is not in the {registry.name} registry')
    elif inspect.isclass(obj_type):
        obj_cls = obj_type
    else:
        raise TypeError(
            f'type must be a str or valid type, but got {type(obj_type)}')
    try:
        return obj_cls(**args)
    except Exception as e:
        # Normal TypeError does not print class name.
        raise type(e)(f'{obj_cls.__name__}: {e}')
注意上述代码中的obj_cls,它其实是自己所写的MapTR类,在调试页面中可以看到obj_cls值如下:
所以上述代码在return obj_cls(**args)时,会进入autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/detectors/maptr.py中MapTR类的__init__函数
4.autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/detectors/maptr.py
def __init__(self,
             use_grid_mask=False,
             pts_voxel_layer=None,
             pts_voxel_encoder=None,
             pts_middle_encoder=None,
             pts_fusion_layer=None,
             img_backbone=None,
             pts_backbone=None,
             img_neck=None,
             pts_neck=None,
             pts_bbox_head=None,
             img_roi_head=None,
             img_rpn_head=None,
             train_cfg=None,
             test_cfg=None,
             pretrained=None,
             video_test_mode=False,
             modality='vision',
             lidar_encoder=None,
             ):
    """Initialise the MapTR model.

    All arguments from ``pts_voxel_layer`` through ``pretrained`` are
    forwarded positionally, in order, to the parent constructor. The other
    arguments are not used in the part of the constructor shown here.

    Args:
        use_grid_mask (bool, optional): Whether to use GridMask
            augmentation. Defaults to False.
        pts_voxel_layer (optional): Point-cloud voxelisation layer.
        pts_voxel_encoder (optional): Point-cloud voxel encoder.
        pts_middle_encoder (optional): Point-cloud middle encoder.
        pts_fusion_layer (optional): Point-cloud fusion layer.
        img_backbone (optional): Image backbone.
        pts_backbone (optional): Point-cloud backbone.
        img_neck (optional): Image neck.
        pts_neck (optional): Point-cloud neck.
        pts_bbox_head (optional): Bbox head.
        img_roi_head (optional): Image RoI head.
        img_rpn_head (optional): Image RPN head.
        train_cfg (dict, optional): Training config.
        test_cfg (dict, optional): Testing config.
        pretrained (optional): Pretrained weights.
        video_test_mode (bool, optional): Whether to run in video test
            mode. Defaults to False.
        modality (str, optional): Modality type. Defaults to 'vision'.
        lidar_encoder (dict, optional): LiDAR encoder config.
    """
    super(MapTR, self).__init__(pts_voxel_layer, pts_voxel_encoder,
                                pts_middle_encoder, pts_fusion_layer,
                                img_backbone, pts_backbone, img_neck, pts_neck,
                                pts_bbox_head, img_roi_head, img_rpn_head,
                                train_cfg, test_cfg, pretrained)
5.autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/dense_heads/maptr_head.py
# NOTE(review): the constructor forwards `transformer=...` to the parent, which
# a plain nn.Module does not accept -- the real base class is presumably a
# DETR-style head; confirm against the repository.
class MapTRHead(nn.Module):
    """Head of MapTR (constructor excerpt).

    The constructor stores the BEV size, the query layout
    (``num_vec * num_pts_per_vec`` queries) and the bbox-coder range on the
    instance, then hands the remaining arguments to the parent constructor.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 bbox_coder=None,
                 num_cls_fcs=2,
                 code_weights=None,
                 bev_h=30,
                 bev_w=30,
                 num_vec=20,
                 num_pts_per_vec=2,
                 num_pts_per_gt_vec=2,
                 query_embed_type='all_pts',
                 transform_method='minmax',
                 gt_shift_pts_pattern='v0',
                 dir_interval=1,
                 loss_pts=dict(type='ChamferDistance',
                               loss_src_weight=1.0,
                               loss_dst_weight=1.0),
                 loss_dir=dict(type='PtsDirCosLoss', loss_weight=2.0),
                 **kwargs):
        self.bev_h = bev_h
        self.bev_w = bev_w
        self.fp16_enabled = False

        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        # Remember which BEV encoder type the transformer config asks for.
        self.bev_encoder_type = transformer.encoder.type
        if self.as_two_stage:
            transformer['as_two_stage'] = self.as_two_stage

        # code_size: taken from kwargs when given, otherwise 10.
        if 'code_size' in kwargs:
            self.code_size = kwargs['code_size']
        else:
            self.code_size = 10
        if code_weights is not None:
            self.code_weights = code_weights
        else:
            self.code_weights = [1.0, 1.0, 1.0,
                                 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.pc_range = self.bbox_coder.pc_range
        # Extent of the range along the first two axes of pc_range.
        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]
        self.num_cls_fcs = num_cls_fcs - 1

        self.query_embed_type = query_embed_type
        self.transform_method = transform_method
        self.gt_shift_pts_pattern = gt_shift_pts_pattern

        # One query per point: num_vec elements times num_pts_per_vec points.
        num_query = num_vec * num_pts_per_vec
        self.num_query = num_query
        self.num_vec = num_vec
        self.num_pts_per_vec = num_pts_per_vec
        self.num_pts_per_gt_vec = num_pts_per_gt_vec
        self.dir_interval = dir_interval

        # The parent constructor receives the transformer config next.
        super(MapTRHead, self).__init__(
            *args, transformer=transformer, **kwargs)
......
6.autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/modules/transformer.py
class MapTRPerceptionTransformer(BaseModule):
    """Implements the Detr3D transformer.

    Args:
        num_feature_levels (int, optional): Number of FPN feature maps.
            Defaults to 4.
        num_cams (int, optional): Number of cameras. Defaults to 6.
        two_stage_num_proposals (int, optional): Number of proposals when
            as_two_stage is True. Defaults to 300.
        fuser (dict, optional): Fuser config; only built when ``modality``
            is 'fusion'. Defaults to None.
        encoder (dict, optional): Encoder config. Defaults to None.
        decoder (dict, optional): Decoder config. Defaults to None.
        embed_dims (int, optional): Embedding dimension. Defaults to 256.
        rotate_prev_bev (bool, optional): Whether to rotate the previous
            BEV feature. Defaults to True.
        use_shift (bool, optional): Whether to use the shift vector.
            Defaults to True.
        use_can_bus (bool, optional): Whether to use CAN bus data.
            Defaults to True.
        len_can_bus (int, optional): Length of the CAN bus data.
            Defaults to 18.
        can_bus_norm (bool, optional): Whether to normalise the CAN bus
            data. Defaults to True.
        use_cams_embeds (bool, optional): Whether to use camera embeddings.
            Defaults to True.
        rotate_center (list, optional): Rotation centre.
            Defaults to [100, 100].
        modality (str, optional): Modality type. Defaults to 'vision'.
    """

    def __init__(self,
                 num_feature_levels=4,
                 num_cams=6,
                 two_stage_num_proposals=300,
                 fuser=None,
                 encoder=None,
                 decoder=None,
                 embed_dims=256,
                 rotate_prev_bev=True,
                 use_shift=True,
                 use_can_bus=True,
                 len_can_bus=18,
                 can_bus_norm=True,
                 use_cams_embeds=True,
                 rotate_center=[100, 100],
                 modality='vision',
                 **kwargs):
        super(MapTRPerceptionTransformer, self).__init__(**kwargs)
        if modality == 'fusion':
            self.fuser = build_fuser(fuser)  # TODO
        # True only when the configured encoder is the BEVFormer encoder.
        self.use_attn_bev = encoder['type'] == 'BEVFormerEncoder'
        self.encoder = build_transformer_layer_sequence(encoder)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.num_feature_levels = num_feature_levels
        self.num_cams = num_cams
        self.fp16_enabled = False

        self.rotate_prev_bev = rotate_prev_bev
        self.use_shift = use_shift
        self.use_can_bus = use_can_bus
        self.len_can_bus = len_can_bus
        self.can_bus_norm = can_bus_norm
        self.use_cams_embeds = use_cams_embeds

        self.two_stage_num_proposals = two_stage_num_proposals
        # init_layers() runs before rotate_center is stored.
        self.init_layers()
        self.rotate_center = rotate_center
函数的实现逻辑如下:
- 根据参数配置构建融合器对象(如果模态为融合模态)。
- 判断编码器类型是否为BEVFormerEncoder(结果记录在use_attn_bev中)。
- 根据参数配置构建编码器和解码器的序列。
- 设置一些其他属性,如嵌入维度、特征图数量等。
- 设置一些标志位,如是否旋转上一帧BEV特征、是否使用平移向量等。
- 调用init_layers()初始化各层,至此完成MapTRPerceptionTransformer对象的初始化。
7.autodl-fs/MapTR1/projects/mmdet3d_plugin/bevformer/modules/encoder.py
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class BEVFormerEncoder(TransformerLayerSequence):
    """BEVFormer encoder: a registered ``TransformerLayerSequence``."""

    def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', **kwargs):
        """Store the encoder options and build the layer sequence.

        Args:
            *args: Forwarded to ``TransformerLayerSequence``.
            pc_range (optional): Stored as ``self.pc_range``.
                Defaults to None.
            num_points_in_pillar (int, optional): Stored as
                ``self.num_points_in_pillar``. Defaults to 4.
            return_intermediate (bool, optional): Whether to return
                intermediate outputs. Defaults to False.
            dataset_type (str, optional): Accepted but not used in this
                constructor. Defaults to 'nuscenes'.
            **kwargs: Forwarded to ``TransformerLayerSequence``.
        """
        super(BEVFormerEncoder, self).__init__(*args, **kwargs)
        self.return_intermediate = return_intermediate

        self.num_points_in_pillar = num_points_in_pillar
        self.pc_range = pc_range
        self.fp16_enabled = False
8.autodl-fs/MapTR1/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
@TRANSFORMER_LAYER.register_module()
class MyCustomBaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformers."""

    def __init__(self, attn_cfgs=None, ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True)), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs):
        """Build the attention, FFN and norm modules of one transformer layer.

        The layer can be built from `mmcv.ConfigDict` and supports flexible
        customisation: any number of `FFN` or `LN`, and different kinds of
        `attention` given as a list of configs in `attn_cfgs`. `prenorm` is
        supported when the first element of `operation_order` is `norm`.

        Args:
            attn_cfgs (list[`mmcv.ConfigDict`] | `mmcv.ConfigDict` | None):
                Configs for `self_attention` or `cross_attention` modules.
                A list must follow the order of the attentions in
                `operation_order`; a single dict is deep-copied for every
                attention. Defaults to None.
            ffn_cfgs (list[`mmcv.ConfigDict`] | `mmcv.ConfigDict` | None):
                Configs for the FFNs. A list must follow the order of the
                ffns in `operation_order`; a single dict is deep-copied for
                every ffn.
            operation_order (tuple[str]): Execution order of the operations,
                e.g. ('self_attn', 'norm', 'ffn', 'norm'). Defaults to None,
                but a value is required: it is iterated unconditionally.
            norm_cfg (dict): Config dict for the normalisation layers.
                Defaults to `dict(type='LN')`.
            init_cfg (obj:`mmcv.ConfigDict`): Initialisation config.
                Defaults to None.
            batch_first (bool): Whether Key, Query and Value are shaped
                (batch, n, embed_dim) rather than (n, batch, embed_dim).
                Defaults to True.
        """
        super(MyCustomBaseTransformerLayer, self).__init__(init_cfg)
        self.batch_first = batch_first

        # Every entry of operation_order must be one of the allowed types.
        # The list is bound to a name because an f-string cannot reuse its
        # own quote character before Python 3.12.
        allowed_ops = ['self_attn', 'norm', 'ffn', 'cross_attn']
        assert set(operation_order) & set(allowed_ops) == set(operation_order), (
            f'The operation_order of {self.__class__.__name__} should '
            f'contains all four operation type {allowed_ops}')

        num_attn = operation_order.count('self_attn') + operation_order.count('cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length of attn_cfg {num_attn} is not consistent with the number of attention in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'

        # Build one attention module per self_attn / cross_attn entry.
        self.attentions = ModuleList()
        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Tag the module with the role it plays in this layer.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        # Build the FFNs.
        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                # ffn_cfgs is a list here, so the missing key is set on the
                # individual config rather than on the list itself.
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(build_feedforward_network(ffn_cfgs[ffn_index]))

        # Build one norm layer per 'norm' entry.
        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
函数的实现逻辑如下:
- 调用父类的构造函数。
- 设置是否按批次优先的标志位。
- 检查`operation_order`中的每个操作是否都属于`self_attn`、`norm`、`ffn`、`cross_attn`这四种类型。
- 统计`self_attn`和`cross_attn`的数量,并根据配置构建相应的attention模块。
- 设置嵌入维度为第一个attention模块的嵌入维度。
- 根据配置构建相应数量的FFN模块。
- 根据配置构建相应数量的归一化层。
- 至此完成`MyCustomBaseTransformerLayer`对象的初始化。
9.autodl-fs/MapTR1/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py
@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
    """An attention module used in BEVFormer, based on Deformable-Detr."""

    def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, num_bev_queue=2, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None):
        """Validate the dimensions and create the projection layers.

        Args:
            embed_dims (int, optional): Embedding dimension of the attention.
                Defaults to 256.
            num_heads (int, optional): Number of parallel attention heads.
                Defaults to 8.
            num_levels (int, optional): Number of feature maps used in the
                attention. Defaults to 4.
            num_points (int, optional): Number of sampling points per query
                in each head. Defaults to 4.
            num_bev_queue (int, optional): Length of the BEV queue; this
                version uses one history BEV and one current BEV.
                Defaults to 2.
            im2col_step (int, optional): Step used in image_to_column.
                Defaults to 64.
            dropout (float, optional): Dropout rate. Defaults to 0.1.
            batch_first (bool, optional): Whether Key, Query and Value are
                shaped (batch, n, embed_dim) rather than
                (n, batch, embed_dim). Defaults to True.
            norm_cfg (dict, optional): Config dict for the normalisation
                layer. Defaults to None.
            init_cfg (obj:`mmcv.ConfigDict`, optional): Initialisation
                config. Defaults to None.

        Raises:
            ValueError: If ``embed_dims`` is not divisible by ``num_heads``.
        """
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        def _is_power_of_2(n):
            """Return True if n is a positive power of two."""
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError('invalid input for _is_power_of_2: {} (type: {})'.format(n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        # A non-power-of-2 head dimension only triggers a warning.
        if not _is_power_of_2(dim_per_head):
            warnings.warn("You'd better set embed_dims in MultiScaleDeformAttention to make the dimension of each attention head a power of 2 which is more efficient in our CUDA implementation.")

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.num_bev_queue = num_bev_queue
        # Both layers take embed_dims * num_bev_queue input features.
        self.sampling_offsets = nn.Linear(embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()
函数的实现逻辑如下:
- 调用父类的构造函数。
- 检查`embed_dims`是否可以被`num_heads`整除。
- 设置是否按批次优先的标志位。
- 设置是否启用FP16混合精度训练的标志位。
- 检查`dim_per_head`是否为2的幂次方(不是则给出警告)。
- 设置`im2col_step`、`embed_dims`、`num_levels`、`num_heads`、`num_points`和`num_bev_queue`等属性。
- 构建线性层用于计算采样偏移量和注意力权重。
- 构建线性层用于投影Value和输出。
- 初始化权重。
10.autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/modules/geometry_kernel_attention.py
@ATTENTION.register_module()
class GeometrySptialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Args:
        embed_dims (int): Embedding dimension of the attention.
            Default: 256.
        num_cams (int): Number of cameras. Default: 6.
        pc_range (optional): Stored as ``self.pc_range``. Default: None.
        dropout (float): Dropout rate. Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): Initialisation config.
            Default: None.
        batch_first (bool): Stored as ``self.batch_first``. Default: False.
        attention (dict): Config of the inner attention module, built with
            ``build_attention``.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs
                 ):
        super(GeometrySptialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        # Inner attention module built from its own config.
        self.attention = build_attention(attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()
上述代码是一个自定义的`GeometrySptialCrossAttention`类,用于BEVFormer中的注意力模块。
该类具有以下功能:
- 实现了BEVFormer中使用的注意力模块。
- 可以处理几何信息并计算注意力。
该类的主要方法和功能如下:
- `__init__()`: 初始化模块的参数和属性。
- `forward()`: 前向传播函数,计算注意力模块的输出。
- `init_weight()`: 初始化模块的权重。
此外,还包含了一些辅助函数和属性,如`dropout`、`pc_range`、`output_proj`等。
该类的作用是实现BEVFormer中的注意力模块,用于处理几何信息并计算注意力。
11.回到autodl-fs/MapTR1/projects/mmdet3d_plugin/maptr/detectors/maptr.py
# Continuation of the MapTR constructor body.
self.grid_mask = GridMask(
    True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
self.use_grid_mask = use_grid_mask
self.fp16_enabled = False

# temporal: state carried over from the previous frame
self.video_test_mode = video_test_mode
self.prev_frame_info = {
    'prev_bev': None,
    'scene_token': None,
    'prev_pos': 0,
    'prev_angle': 0,
}
self.modality = modality
# The LiDAR branch is only built for the fusion modality with a config given.
if self.modality == 'fusion' and lidar_encoder is not None:
    # A positive max_num_points selects Voxelization, otherwise DynamicScatter.
    if lidar_encoder["voxelize"].get("max_num_points", -1) > 0:
        voxelize_module = Voxelization(**lidar_encoder["voxelize"])
    else:
        voxelize_module = DynamicScatter(**lidar_encoder["voxelize"])
    self.lidar_modal_extractor = nn.ModuleDict(
        {
            "voxelize": voxelize_module,
            "backbone": builder.build_middle_encoder(lidar_encoder["backbone"]),
        }
    )
    self.voxelize_reduce = lidar_encoder.get("voxelize_reduce", True)
5.2构建数据集
1.接着回到autodl-fs/MapTR1/tools/train.py进一步构建数据集
# Build the training dataset from cfg.data.train.
datasets = [build_dataset(cfg.data.train)]
# A two-stage workflow also needs a validation dataset.
if len(cfg.workflow) == 2:
    val_dataset = copy.deepcopy(cfg.data.val)
    # in case we use a dataset wrapper
    if 'dataset' in cfg.data.train:
        val_dataset.pipeline = cfg.data.train.dataset.pipeline
    else:
        val_dataset.pipeline = cfg.data.train.pipeline
    # set test_mode=False here in deep copied config
    # which do not affect AP/AR calculation later
    # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow  # noqa
    val_dataset.test_mode = False
    datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
    # save mmdet version, config file content and class names in
    # checkpoints as meta data
    cfg.checkpoint_config.meta = dict(
        mmdet_version=mmdet_version,
        mmseg_version=mmseg_version,
        mmdet3d_version=mmdet3d_version,
        config=cfg.pretty_text,
        CLASSES=datasets[0].CLASSES,
        PALETTE=datasets[0].PALETTE  # for segmentors
        if hasattr(datasets[0], 'PALETTE') else None)
# add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES
2.miniconda3/envs/maptr/lib/python3.8/site-packages/mmdet3d/datasets/builder.py
def build_dataset(cfg, default_args=None):
    """Build a dataset, or a wrapper around datasets, from ``cfg``.

    Args:
        cfg (dict | list | tuple): Dataset config, or a sequence of configs
            that are built one by one and concatenated.
        default_args (dict, optional): Default arguments passed along to
            every nested build. Defaults to None.

    Returns:
        The built dataset object.
    """
    from mmdet3d.datasets.dataset_wrappers import CBGSDataset
    from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
                                                 ConcatDataset, RepeatDataset)
    if isinstance(cfg, (list, tuple)):
        # A sequence of configs: build each one and concatenate them.
        dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
    elif cfg['type'] == 'ConcatDataset':
        # Explicit ConcatDataset: build every entry of cfg['datasets'].
        dataset = ConcatDataset(
            [build_dataset(c, default_args) for c in cfg['datasets']],
            cfg.get('separate_eval', True))
    elif cfg['type'] == 'RepeatDataset':
        # Wrap the inner dataset so it is repeated cfg['times'] times.
        dataset = RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    elif cfg['type'] == 'ClassBalancedDataset':
        # Wrap the inner dataset with the given oversampling threshold.
        dataset = ClassBalancedDataset(
            build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
    elif cfg['type'] == 'CBGSDataset':
        # Wrap the inner dataset in a CBGSDataset.
        dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args))
    elif isinstance(cfg.get('ann_file'), (list, tuple)):
        # Several annotation files: merged by _concat_dataset.
        dataset = _concat_dataset(cfg, default_args)
    else:
        # Plain dataset: look it up in the DATASETS registry.
        dataset = build_from_cfg(cfg, DATASETS, default_args)
    return dataset
3.miniconda3/envs/maptr/lib/python3.8/site-packages/mmcv/utils/registry.py
def build_from_cfg(cfg, registry, default_args=None):
    """Build a module from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        registry (:obj:`Registry`): The registry to search the type from.
        default_args (dict, optional): Default initialization arguments;
            only fills keys that ``cfg`` does not already have.

    Returns:
        object: The constructed object.

    Raises:
        TypeError: If ``cfg``, ``registry`` or ``default_args`` has the wrong
            type, or "type" is neither a str nor a class.
        KeyError: If no "type" is given, or it is not found in ``registry``.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        if default_args is None or 'type' not in default_args:
            raise KeyError(
                '`cfg` or `default_args` must contain the key "type", '
                f'but got {cfg}\n{default_args}')
    if not isinstance(registry, Registry):
        raise TypeError('registry must be an mmcv.Registry object, '
                        f'but got {type(registry)}')
    if not (isinstance(default_args, dict) or default_args is None):
        raise TypeError('default_args must be a dict or None, '
                        f'but got {type(default_args)}')

    # Work on a copy so the caller's config is not modified.
    args = cfg.copy()

    if default_args is not None:
        for name, value in default_args.items():
            args.setdefault(name, value)

    obj_type = args.pop('type')
    if isinstance(obj_type, str):
        obj_cls = registry.get(obj_type)
        if obj_cls is None:
            raise KeyError(
                f'{obj_type} is not in the {registry.name} registry')
    elif inspect.isclass(obj_type):
        obj_cls = obj_type
    else:
        raise TypeError(
            f'type must be a str or valid type, but got {type(obj_type)}')
    try:
        return obj_cls(**args)
    except Exception as e:
        # Normal TypeError does not print class name.
        raise type(e)(f'{obj_cls.__name__}: {e}')
上述代码是一个用于构建对象的注册表(Registry)和构建对象的函数(build_from_cfg)。
`Registry`类是一个注册表,用于将字符串映射到类。它具有以下功能:
- 注册的对象可以从注册表中构建。
- 可以根据作用域搜索子注册表。
- 可以根据类名获取注册表记录。
`build_from_cfg`函数是一个从配置字典构建对象的函数。它具有以下功能:
- 从配置字典中获取对象的类型。
- 根据类型从注册表中获取对应的类。
- 使用配置字典中的参数构建对象
(构建dataset是一个极其复杂的过程,一时半会儿还难以讲清楚,下图可以看出生成的dataset非常复杂)
5.3开始训练
1.autodl-fs/MapTR1/tools/train.py
# Start training with the built model and datasets; validation is enabled
# unless args.no_validate is set.
custom_train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
2.autodl-fs/MapTR1/projects/mmdet3d_plugin/bevformer/apis/train.py
def custom_train_model(model,
                       dataset,
                       cfg,
                       distributed=False,
                       validate=False,
                       timestamp=None,
                       eval_model=None,
                       meta=None):
    """A function wrapper for launching model training according to cfg.

    Needed because a different eval_hook is used in the runner.
    Should be deprecated in the future.

    Args:
        model (nn.Module): The model to train.
        dataset (Dataset): The dataset(s) used for training.
        cfg (dict): Config; ``cfg.model.type`` selects the training routine.
        distributed (bool, optional): Whether to train in distributed mode.
            Defaults to False.
        validate (bool, optional): Whether to validate during training.
            Defaults to False.
        timestamp (str, optional): Timestamp used for checkpoints and logs.
            Defaults to None.
        eval_model (nn.Module, optional): Model used for evaluation.
            Defaults to None.
        meta (dict, optional): Additional metadata. Defaults to None.

    Raises:
        AssertionError: If ``cfg.model.type`` is 'EncoderDecoder3D'
            (segmentors are not supported by this wrapper).
    """
    if cfg.model.type in ['EncoderDecoder3D']:
        # Raised explicitly: a bare `assert False` is stripped under
        # `python -O`, which would turn this branch into a silent no-op.
        raise AssertionError
    else:
        custom_train_detector(
            model,
            dataset,
            cfg,
            distributed=distributed,
            validate=validate,
            timestamp=timestamp,
            eval_model=eval_model,
            meta=meta)
3.autodl-fs/MapTR1/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
def custom_train_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          validate=False,
                          timestamp=None,
                          eval_model=None,
                          meta=None):
    """Build data loaders, runner and hooks from ``cfg``, then run training.

    Steps, in order: build one data loader per training dataset, wrap the
    model(s) for (distributed) data-parallel execution on GPU, build the
    optimizer and runner, register training / evaluation / custom hooks,
    optionally resume or load a checkpoint, and finally call ``runner.run``.

    Args:
        model (nn.Module): Model to train; moved to GPU with ``.cuda()``.
        dataset: A single dataset or a list/tuple of datasets; a single
            dataset is wrapped into a one-element list.
        cfg: Config object. It is accessed by attribute, by ``in`` and by
            ``.get``; ``cfg.data.samples_per_gpu`` and ``cfg.runner`` may be
            written by this function.
        distributed (bool, optional): Selects ``MMDistributedDataParallel``
            over ``MMDataParallel`` and the distributed eval hook.
            Defaults to False.
        validate (bool, optional): Whether to register an evaluation hook.
            Defaults to False.
        timestamp (str, optional): Stored on ``runner.timestamp``.
            Defaults to None.
        eval_model (nn.Module, optional): If given, wrapped like ``model``
            and passed to the runner as ``eval_model``. Defaults to None.
        meta (dict, optional): Passed through to the runner.
            Defaults to None.

    Raises:
        AssertionError: If ``cfg.total_epochs`` disagrees with
            ``cfg.runner.max_epochs``, if validation is requested with
            ``samples_per_gpu > 1``, or if ``cfg.custom_hooks`` is not a
            list of dicts. All of these are ``assert`` statements and are
            therefore skipped under ``python -O``.
    """
    logger = get_root_logger(cfg.log_level)

    # Prepare the data loaders.
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    #assert len(dataset)==1s
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('在MMDet V2.0中,"imgs_per_gpu"已弃用。请使用"samples_per_gpu"代替')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'得到"imgs_per_gpu"={cfg.data.imgs_per_gpu}和'
                f'"samples_per_gpu"={cfg.data.samples_per_gpu},在这个实验中使用"imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu}')
        else:
            logger.warning(
                '在这个实验中,自动设置"samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu}')
        # The deprecated key wins whenever it is present.
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus is ignored when training is distributed.
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        ) for ds in dataset
    ]

    # Put the model(s) on GPU.
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` argument of
        # torch.nn.parallel.DistributedDataParallel.
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
        if eval_model is not None:
            eval_model = MMDistributedDataParallel(
                eval_model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
        if eval_model is not None:
            eval_model = MMDataParallel(
                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # Build the runner.
    optimizer = build_optimizer(model, cfg.optimizer)
    if 'runner' not in cfg:
        # Fall back to an epoch-based runner driven by `total_epochs`.
        cfg.runner = {
            'type': 'EpochBasedRunner',
            'max_epochs': cfg.total_epochs
        }
        warnings.warn(
            'config现在需要有一个`runner`部分,请在您的配置中设置`runner`。',
            UserWarning)
    else:
        if 'total_epochs' in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs

    # `eval_model` is only passed to the runner when one was supplied.
    if eval_model is not None:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                eval_model=eval_model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))
    else:
        runner = build_runner(
            cfg.runner,
            default_args=dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # An ugly workaround to make the .log and .log.json filenames the same.
    runner.timestamp = timestamp

    # fp16 setting.
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # Register the training hooks.
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # Register the profiler hook (disabled).
    #trace_config = dict(type='tb_trace', dir_name='work_dir')
    #profiler_config = dict(on_trace_ready=trace_config)
    #runner.register_profiler_hook(profiler_config)

    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # Register the evaluation hooks.
    if validate:
        # Support batch_size > 1 in validation.
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # NOTE(review): batched validation is rejected outright here, so
            # the pipeline replacement below is only reachable when asserts
            # are disabled (`python -O`). Nesting reconstructed from the
            # flattened listing - confirm against the repository.
            assert False
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'.
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
        )
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
        # NOTE(review): os.path.join discards 'val' when cfg.work_dir is an
        # absolute path - confirm that this is the intended prefix.
        eval_cfg['jsonfile_prefix'] = osp.join(
            'val', cfg.work_dir,
            time.ctime().replace(' ','_').replace(':','_'))
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # User-defined hooks.
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks期望是list类型,但得到了{type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'custom_hooks中的每个项期望是dict类型,但得到了' \
                f'{type(hook_cfg)}'
            # Copy first so popping 'priority' leaves the config untouched.
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    # Resuming takes precedence over loading a checkpoint.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
custom_train_detector 函数是一个用于训练检测器模型的函数。它接受模型、数据集、配置和其他可选参数,并执行模型训练的过程。
其实也就是定制训练过程,以满足不同的需求。
6.总结
经过上面的步骤,基本疏通了maptr的训练流程,但是里面还存在许多细节。由于还在看源码,并且有一些问题还没解决,后续的详解版本会对代码里面的变量及其维度、作用进行详细注解(已经在写了,如果没啥问题的话),一方面是加深对maptr的理解,另一方面是提高自己对mmlab类模型的认知。