Focal Loss for Dense Object Detection: RetinaNet
RetinaNet
mmdetection配置文件分析
1.retinanet_r50_fpn_1x_coco.py
# RetinaNet detector: ResNet-50 backbone + FPN neck + RetinaHead trained with Focal Loss.
model = dict(
    type='RetinaNet',  # detector class registered in mmdetection
    pretrained='torchvision://resnet50',  # ImageNet-pretrained backbone weights
    backbone=dict(
        type='ResNet',
        depth=50,  # ResNet-50 variant
        num_stages=4,  # number of residual stages in the backbone
        out_indices=(0, 1, 2, 3),  # stage indices whose feature maps feed the neck
        frozen_stages=1,  # freeze weights of the first stage during training
        norm_cfg=dict(type='BN', requires_grad=True),  # BatchNorm with trainable affine params
        norm_eval=True,  # keep BN running statistics frozen (eval mode)
        # 'pytorch' puts the stride-2 in the 3x3 conv; 'caffe' puts it in the 1x1 conv.
        style='pytorch'),
    neck=dict(
        type='FPN',  # feature pyramid; alternatives include NASFPN, PAFPN
        in_channels=[256, 512, 1024, 2048],  # must match channels of backbone.out_indices
        out_channels=256,  # channels of every pyramid level
        start_level=1,  # skip the first backbone output
        add_extra_convs='on_input',  # build extra levels from the last backbone feature
        num_outs=5),  # number of output scales
    bbox_head=dict(
        type='RetinaHead',  # box head config of the RetinaHead
        num_classes=80,  # COCO has 80 classes
        in_channels=256,  # input channels (FPN out_channels)
        stacked_convs=4,  # convs stacked in each head branch
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=4,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='FocalLoss',  # classification loss from the RetinaNet paper
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)))
# Training-time settings: how anchors are matched to ground-truth boxes.
train_cfg = dict(
    assigner=dict(
        type='MaxIoUAssigner',  # assign each anchor by its max IoU with GT boxes
        pos_iou_thr=0.5,   # IoU >= 0.5 -> positive anchor
        neg_iou_thr=0.4,   # IoU < 0.4 -> negative (background) anchor
        min_pos_iou=0,     # minimum IoU for a GT to claim its best-matching anchor
        ignore_iof_thr=-1),  # -1 disables ignoring anchors by IoF with ignore regions
    allowed_border=-1,  # -1: anchors may extend beyond the image border
    pos_weight=-1,      # -1: use the default weight for positive samples
    debug=False)
# Test-time settings: pre-NMS top-k, score filtering, NMS, and per-image cap.
test_cfg = dict(
    nms_pre=1000,      # keep the top-1000 scoring boxes per level before NMS
    min_bbox_size=0,   # drop boxes smaller than this (0 keeps everything)
    score_thr=0.05,    # discard detections scoring below this threshold
    nms=dict(type='nms', iou_threshold=0.5),  # standard NMS at IoU 0.5
    max_per_img=100)   # return at most 100 detections per image
# Dataset settings: COCO-format annotations rooted at ../data/coco/.
dataset_type = 'CocoDataset'
data_root = '../data/coco/'

# Per-channel ImageNet statistics used to normalize inputs; to_rgb converts BGR->RGB first.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True)
# Training pipeline: load -> annotate -> resize -> flip -> normalize -> pad -> collect.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),  # boxes only, no masks
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),  # keep aspect ratio
    dict(type='RandomFlip', flip_ratio=0.5),  # horizontal flip with probability 0.5
    dict(type='Normalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='Pad', size_divisor=32),  # pad so H and W are multiples of 32
    dict(type='DefaultFormatBundle'),   # to tensor + DataContainer wrapping
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# Test pipeline: single scale (1333, 800), no flip augmentation at inference.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',  # test-time aug wrapper; here one scale, no flip
         img_scale=(1333, 800),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='RandomFlip'),  # present for API symmetry; inert when flip=False
             dict(type='Normalize',
                  mean=[123.675, 116.28, 103.53],
                  std=[58.395, 57.12, 57.375],
                  to_rgb=True),
             dict(type='Pad', size_divisor=32),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img'])
         ])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=1,
train=dict(
type='CocoDataset',
ann_file='../data/coco/annotations/instances_val2017.json',
img_prefix='../data/coco/val2017/',
pipeline=[
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]),
val=dict(
type='CocoDataset',
ann_file='../data/coco/annotations/instances_val2017.json',
img_prefix='../data/coco/val2017/',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12,