前言
最近因为学习的需要,在mmdetection框架下进行网络的训练和model的修改,这里记录一下mmdetection中部分源码的学习和注释。注释有不对的地方还请批评指正。
###后续会逐步完善
SSDVGG model——backbone
SSD网络是经典的one-stage检测算法,可以在多个不同尺寸的特征图上进行目标框的预测。网络的backbone使用VGG结构,并在其后追加若干卷积层,
具体cfg配置文件为:
配置文件在mmdetection中的位置:~/mmdetection/configs/_base_/models/ssd300.py
model = dict(
type='SingleStageDetector',
backbone=dict(
type='SSDVGG',
depth=16,
with_last_pool=False,
ceil_mode=True,
out_indices=(3, 4),
out_feature_indices=(22, 34),
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')),
默认使用VGG-16作为网络结构,并加载 open-mmlab://vgg16_caffe 预训练模型。
depth:网络深度,有四种可选(11、13、16、19,即对应VGG网络的四种深度)
with_last_pool:Whether to add a pooling layer at the last of the model: 需不需要加个pool层,在model的最后。
ceil_mode: 池化时输出尺寸的取整方式。为True时向上取整,边界处不足一个完整池化窗口的部分也会被保留并参与池化;为False时向下取整,这部分会被丢弃。
out_indices=(3, 4) 输出的stage的索引
out_feature_indices=(22, 34), 输出的feature层的索引
根据cfg文件指向到SSDVGG的backbone.py,具体位置在:~/mmdetection/mmdet/models/backbones/ssd_vgg.py
引用部分中,值得注意的是mmcv.cnn。因为SSD的前部分框架是以VGG网络为基础的,所以要先扒一扒mmcv.cnn中的vgg。
def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:
    """Build a 3x3 convolution whose padding equals its dilation.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        dilation (int): Dilation of the kernel; 1 means an ordinary
            (non-dilated) 3x3 kernel. The same value is used as padding.

    Returns:
        nn.Module: The ``nn.Conv2d`` layer. The ``-> nn.Module`` annotation
        is only a type hint; Python does not enforce it at runtime.
    """
    # A 3x3 kernel dilated by d spans 2*d + 1 pixels, so padding by d on
    # each side keeps the spatial size unchanged at the default stride of 1.
    conv_kwargs = dict(kernel_size=3, padding=dilation, dilation=dilation)
    return nn.Conv2d(
        in_channels=in_planes, out_channels=out_planes, **conv_kwargs)
def make_vgg_layer(inplanes: int,
                   planes: int,
                   num_blocks: int,
                   dilation: int = 1,
                   with_bn: bool = False,
                   ceil_mode: bool = False) -> List[nn.Module]:
    """Assemble the modules of one VGG stage as a flat list.

    A stage is ``num_blocks`` repetitions of
    ``conv3x3 -> [BatchNorm2d] -> ReLU`` followed by a single 2x2 max-pool
    with stride 2.

    Args:
        inplanes (int): Input channels of the first convolution.
        planes (int): Output channels of every convolution in the stage.
        num_blocks (int): Number of convolution blocks in the stage.
        dilation (int): Dilation passed to every ``conv3x3``.
        with_bn (bool): Insert ``nn.BatchNorm2d`` after each convolution.
        ceil_mode (bool): ``ceil_mode`` of the closing ``nn.MaxPool2d``.

    Returns:
        List[nn.Module]: ``num_blocks * (2 + with_bn) + 1`` modules, in
        forward order.
    """
    stage: List[nn.Module] = []
    in_channels = inplanes
    for _ in range(num_blocks):
        block = [conv3x3(in_channels, planes, dilation)]
        if with_bn:
            block.append(nn.BatchNorm2d(planes))
        block.append(nn.ReLU(inplace=True))
        stage += block
        # Only the first convolution sees `inplanes`; every later one is
        # fed by the previous convolution's `planes` channels.
        in_channels = planes
    stage.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))
    return stage
class VGG(nn.Module):
    """VGG backbone.

    The convolutional part is stored as one ``nn.Sequential`` registered
    under the name ``features``; ``range_sub_modules`` records which slice
    of that sequence belongs to each stage.

    Args:
        depth (int): Depth of vgg, from {11, 13, 16, 19}.
        with_bn (bool): Use BatchNorm or not.
        num_classes (int): Number of classes for classification. The
            fully connected classifier is built only when this is > 0.
        num_stages (int): VGG stages, normally 5. Must be in [1, 5].
        dilations (Sequence[int]): Dilation of each stage; its length must
            equal ``num_stages``.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        ceil_mode (bool): ``ceil_mode`` used by the max-pool that closes
            each stage.
        with_last_pool (bool): Whether to keep the max-pool that closes the
            last stage.

    Raises:
        KeyError: If ``depth`` is not a key of ``arch_settings``.
        AssertionError: If ``num_stages``, ``dilations`` or ``out_indices``
            fail the consistency checks in ``__init__``.
    """

    # Number of conv blocks in each of the (up to) five stages, keyed by
    # depth. Every stage is closed by one max-pool. Written out in the flat
    # notation where 'M' stands for that max-pool, the settings read:
    #   11: [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
    #   13: [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M',
    #        512, 512, 'M']
    #   16: [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
    #        512, 512, 512, 'M', 512, 512, 512, 'M']
    #   19: [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M',
    #        512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M']
    arch_settings = {
        11: (1, 1, 2, 2, 2),
        13: (2, 2, 2, 2, 2),
        16: (2, 2, 3, 3, 3),
        19: (2, 2, 4, 4, 4)
    }

    def __init__(self,
                 depth: int,
                 with_bn: bool = False,
                 num_classes: int = -1,
                 num_stages: int = 5,
                 dilations: Sequence[int] = (1, 1, 1, 1, 1),
                 out_indices: Sequence[int] = (0, 1, 2, 3, 4),
                 frozen_stages: int = -1,
                 bn_eval: bool = True,
                 bn_frozen: bool = False,
                 ceil_mode: bool = False,
                 with_last_pool: bool = True):
        super().__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for vgg')
        assert num_stages >= 1 and num_stages <= 5
        # e.g. depth 16 -> (2, 2, 3, 3, 3)
        stage_blocks = self.arch_settings[depth]
        # Keep only the first `num_stages` stages (a tuple slice).
        self.stage_blocks = stage_blocks[:num_stages]
        assert len(dilations) == num_stages
        assert max(out_indices) <= num_stages

        self.num_classes = num_classes
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        # The first convolution is built for 3 input channels.
        self.inplanes = 3
        start_idx = 0
        vgg_layers = []
        self.range_sub_modules = []
        for i, num_blocks in enumerate(self.stage_blocks):
            # Each block is conv + ReLU (+ BN when with_bn), and the stage
            # ends with one max-pool. For depth 16 this gives per stage
            # 5, 5, 7, 7, 7 modules without BN and 7, 7, 10, 10, 10 with BN.
            num_modules = num_blocks * (2 + with_bn) + 1
            end_idx = start_idx + num_modules
            dilation = dilations[i]
            # Output channels per stage: 64, 128, 256, 512, 512.
            planes = 64 * 2**i if i < 4 else 512
            vgg_layer = make_vgg_layer(
                self.inplanes,
                planes,
                num_blocks,
                dilation=dilation,
                with_bn=with_bn,
                ceil_mode=ceil_mode)
            vgg_layers.extend(vgg_layer)
            # This stage's output channels feed the next stage.
            self.inplanes = planes
            # [start, end) positions of this stage inside `features`.
            self.range_sub_modules.append([start_idx, end_idx])
            start_idx = end_idx
        if not with_last_pool:
            # Drop the max-pool that closes the last stage and shrink that
            # stage's end index by one so the recorded range stays valid.
            vgg_layers.pop(-1)
            self.range_sub_modules[-1][1] -= 1
        self.module_name = 'features'
        # Register the whole convolutional part as a single nn.Sequential.
        self.add_module(self.module_name, nn.Sequential(*vgg_layers))
        if self.num_classes > 0:
            # Classification head: fc -> relu -> dropout -> fc -> relu ->
            # dropout -> fc. The first Linear expects 512 * 7 * 7 inputs,
            # presumably a 512-channel 7x7 feature map -- confirm against
            # the caller's input size.
            self.classifier = nn.Sequential(
                nn.Linear(512 * 7 * 7, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(4096, num_classes),
            )

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Initialise the weights of the network.

        Args:
            pretrained (str, optional): Checkpoint path or URL. Checkpoint
                loading is disabled in this copy of the code, so a string
                only triggers a warning and the weights are left as they
                are. ``None`` re-initialises every Conv2d, BatchNorm2d and
                Linear module.

        Raises:
            TypeError: If ``pretrained`` is neither a ``str`` nor ``None``.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            # The loading code below is disabled in this copy:
            #   from ..runner import load_checkpoint
            #   load_checkpoint(self, pretrained, strict=False, logger=logger)
            # Warn instead of silently ignoring the requested checkpoint.
            logger.warning(
                'pretrained=%r was given, but checkpoint loading is disabled '
                'in VGG.init_weights; the weights are left unchanged',
                pretrained)
        elif pretrained is None:
            # self.modules() iterates over this module and all of its
            # sub-modules. kaiming_init / constant_init / normal_init are
            # helpers defined outside this block.
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
                elif isinstance(m, nn.Linear):
                    normal_init(m, std=0.01)
        else:
            raise TypeError('pretrained must be a str or None')