文章前言
该文链接至
YOLO-V3-SPP
有兴趣请查看上文对YOLO-V3-SPP的详细解读
model流程
(霹雳吧啦Wz分享的源码版本)
models.py
现在暂时更新到源码注释这块,后面会对该models.py的流程做一个流程图
在霹雳吧啦Wz对源码的注释的基础上,我也对源码进行了详细的解析
(阅读请忽略ONNX模型的相关知识,我还没接触该ONNX模型,待使用到再进行详细注释)
from build_utils.layers import *
from build_utils.parse_config import *
from build_utils import torch_utils
# Module-level switch: when True, the layers defined in this file take their
# ONNX-export-specific branches (explicit upsample sizes, batch size fixed to 1,
# flattened YOLO decoding) instead of the regular training/inference paths.
ONNX_EXPORT = False
# 注意第一个参数包含了:,这是类型建议符,非强制
def create_modules(modules_defs: list, img_size):
    """
    Build the layer stack described by a parsed Darknet ``.cfg`` file.

    :param modules_defs: list of dicts, one per cfg section. The first entry (the
        ``[net]`` section) is popped from this list IN PLACE, so on return the list
        holds exactly one dict per created module.
    :param img_size: int or 2-element sequence; an int is expanded to ``[img_size] * 2``.
        It is handed to every YOLOLayer and, when ONNX_EXPORT is set, used to compute
        explicit upsample sizes.
    :return: ``(module_list, routs_binary)``. ``module_list`` is an ``nn.ModuleList`` with
        one entry per remaining cfg section. ``routs_binary`` is a list of bools of the
        same length, True for the layers flagged here: convolutions without batch-norm
        (the detection heads) and the source layers referenced by route / shortcut.
    :raises TypeError: if a convolutional section has a non-int kernel ``size``.
    """
    img_size = [img_size] * 2 if isinstance(img_size, int) else img_size

    # Drop the [net] section: it only carries training hyper-parameters.
    modules_defs.pop(0)

    # Output channel count of every layer so far; entry 0 is the 3-channel input
    # image, so layer k's output channels live at index k + 1.
    output_filters = [3]
    module_list = nn.ModuleList()
    # Indices of layers whose outputs are consumed by deeper layers.
    routs = []
    yolo_index = -1

    for i, mdef in enumerate(modules_defs):
        modules = nn.Sequential()
        # Layers that do not change the channel count (maxpool, upsample, yolo,
        # unknown types) inherit the previous layer's value. Seeding it here also
        # keeps `filters` defined when such a layer comes first in the cfg.
        filters = output_filters[-1]
        layer_type = mdef["type"]

        if layer_type == "convolutional":
            bn = mdef["batch_normalize"]  # 1 or 0 / use or not
            filters = mdef["filters"]
            k = mdef["size"]  # kernel size
            stride = mdef["stride"] if "stride" in mdef else (mdef['stride_y'], mdef["stride_x"])
            if not isinstance(k, int):
                raise TypeError("conv2d filter size must be int type.")
            modules.add_module("Conv2d", nn.Conv2d(in_channels=output_filters[-1],
                                                   out_channels=filters,
                                                   kernel_size=k,
                                                   stride=stride,
                                                   padding=k // 2 if mdef["pad"] else 0,
                                                   bias=not bn))
            if bn:
                modules.add_module("BatchNorm2d", nn.BatchNorm2d(filters))
            else:
                # A convolution without batch-norm is a detection head: its
                # output goes into the following yolo layer, so keep it.
                routs.append(i)

            if mdef["activation"] == "leaky":
                modules.add_module("activation", nn.LeakyReLU(0.1, inplace=True))

        elif layer_type == "BatchNorm2d":
            # Stand-alone batch-norm sections are accepted but build nothing.
            pass

        elif layer_type == "maxpool":
            k = mdef["size"]  # kernel size
            stride = mdef["stride"]
            modules = nn.MaxPool2d(kernel_size=k, stride=stride, padding=(k - 1) // 2)

        elif layer_type == "upsample":
            if ONNX_EXPORT:  # explicitly state size, avoid scale_factor
                g = (yolo_index + 1) * 2 / 32  # gain
                modules = nn.Upsample(size=tuple(int(x * g) for x in img_size))
            else:
                modules = nn.Upsample(scale_factor=mdef["stride"])

        elif layer_type == "route":  # e.g. [-2], [-1, -3, -5, -6], [-1, 61]
            layers = mdef["layers"]
            # Positive entries are absolute layer indices and need +1 because
            # output_filters has the input image at position 0; non-positive
            # entries are offsets relative to the current end of the list.
            filters = sum([output_filters[l + 1 if l > 0 else l] for l in layers])
            routs.extend([i + l if l < 0 else l for l in layers])
            modules = FeatureConcat(layers=layers)

        elif layer_type == "shortcut":
            layers = mdef["from"]
            # The element-wise fusion keeps the channel count of the previous layer.
            filters = output_filters[-1]
            # Remember the layer the shortcut reads from (layers[0] is used as
            # an offset relative to the current index).
            routs.append(i + layers[0])
            modules = WeightedFeatureFusion(layers=layers, weight="weights_type" in mdef)

        elif layer_type == "yolo":
            yolo_index += 1  # which yolo layer this is: 0, 1, 2
            stride = [32, 16, 8]  # downsampling factor of each prediction scale
            # The cfg lists all anchors; `mask` selects the ones for this scale.
            modules = YOLOLayer(anchors=mdef["anchors"][mdef["mask"]],  # anchor list
                                nc=mdef["classes"],  # number of classes
                                img_size=img_size,
                                stride=stride[yolo_index])

            # Initialize preceding Conv2d() bias
            # (https://arxiv.org/pdf/1708.02002.pdf section 3.3).
            # Best effort: any failure is reported and the model is still built.
            try:
                j = -1  # the layer right before this yolo layer
                # The update must run under no_grad: an in-place write through a
                # view of a leaf Parameter that requires grad is rejected by
                # autograd, which made this initialization fail every time.
                with torch.no_grad():
                    # Index 0 of the Sequential is the Conv2d; view its bias as
                    # one row per anchor.
                    bias = module_list[j][0].bias.view(modules.na, -1)
                    bias[:, 4] += -4.5  # objectness column
                    bias[:, 5:] += math.log(0.6 / (modules.nc - 0.99))  # class columns
            except Exception as e:
                print('WARNING: smart bias initialization failure.', e)

        else:
            print("Warning: Unrecognized Layer Type: " + mdef["type"])

        # Register the module and the number of channels it outputs.
        module_list.append(modules)
        output_filters.append(filters)

    # One flag per layer, True where the output has to be stored during forward.
    routs_binary = [False] * len(modules_defs)
    for i in routs:
        routs_binary[i] = True
    return module_list, routs_binary
# yolo层是接在网络三个predictor之后的层
# 在create module处调用
class YOLOLayer(nn.Module):
    """
    Post-process the raw output of one YOLO prediction head.

    In training mode the layer only reshapes the prediction; otherwise it also
    decodes box centres and sizes and applies sigmoid to objectness / class scores.
    """

    def __init__(self, anchors, nc, img_size, stride):
        """
        :param anchors: anchors for this scale (presumably a numpy array of
            (w, h) pairs - confirm against the cfg parser); converted to a tensor.
        :param nc: number of classes.
        :param img_size: indexable pair; only read when ONNX_EXPORT is set.
        :param stride: ratio between the input image and this feature map.
        """
        super().__init__()
        self.anchors = torch.Tensor(anchors)
        self.stride = stride  # layer stride
        self.na = len(anchors)  # number of anchors
        self.nc = nc  # number of classes
        # Values predicted per anchor: x, y, w, h, obj, then one per class.
        self.no = nc + 5
        # Grid dimensions; filled in by create_grids().
        self.nx, self.ny, self.ng = 0, 0, (0, 0)
        # Anchors expressed in grid-cell units instead of pixels.
        self.anchor_vec = self.anchors / self.stride
        # Shaped (1, na, 1, 1, 2) so it broadcasts against
        # (batch, na, grid_h, grid_w, 2) predictions.
        self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2)
        self.grid = None

        if ONNX_EXPORT:
            self.training = False
            # number of x, y grid points
            self.create_grids((img_size[1] // stride, img_size[0] // stride))

    def create_grids(self, ng=(13, 13), device="cpu"):
        """
        Store the grid size and, outside training mode, rebuild the cell offsets.

        :param ng: feature-map size as (nx, ny).
        :param device: device on which the grid and the anchors should live.
        :return: None
        """
        self.nx, self.ny = ng
        self.ng = torch.tensor(ng, dtype=torch.float)

        # The offsets are only needed to decode final boxes, so they are not
        # built while training.
        if not self.training:
            rows = torch.arange(self.ny, device=device)
            cols = torch.arange(self.nx, device=device)
            # Both outputs have shape (ny, nx): grid_y holds each cell's row
            # index, grid_x its column index.
            grid_y, grid_x = torch.meshgrid([rows, cols])
            # Stack to (ny, nx, 2) in (x, y) order, then add the leading
            # batch and anchor dimensions.
            cell_xy = torch.stack((grid_x, grid_y), 2)
            self.grid = cell_xy.view((1, 1, self.ny, self.nx, 2)).float()

        if self.anchor_vec.device != device:
            self.anchor_vec = self.anchor_vec.to(device)
            self.anchor_wh = self.anchor_wh.to(device)

    def forward(self, p):
        """
        :param p: raw head output of shape (batch, na * no, ny, nx).
        :return: in training mode the reshaped prediction
            (batch, na, ny, nx, no); under ONNX_EXPORT a flattened
            (na * nx * ny, no) tensor; otherwise a tuple
            ``(decoded, reshaped)`` where decoded has shape (batch, -1, no).
        """
        if ONNX_EXPORT:
            bs = 1  # batch size
        else:
            bs, _, ny, nx = p.shape
            # Rebuild the grid when it is missing or the feature-map size changed.
            if self.grid is None or (self.nx, self.ny) != (nx, ny):
                self.create_grids((nx, ny), p.device)

        # view:    (bs, na * no, ny, nx) -> (bs, na, no, ny, nx)
        # permute: (bs, na, no, ny, nx)  -> (bs, na, ny, nx, no)
        # permute leaves the tensor non-contiguous, hence contiguous().
        p = p.view(bs, self.na, self.no, self.ny, self.nx)
        p = p.permute(0, 1, 3, 4, 2).contiguous()  # prediction

        if self.training:
            return p

        if ONNX_EXPORT:
            # Avoid broadcasting for ANE operations: flatten everything to 2-D.
            m = self.na * self.nx * self.ny
            inv_ng = 1. / self.ng.repeat(m, 1)
            flat_grid = self.grid.repeat(1, self.na, 1, 1, 1).view(m, 2)
            flat_anchor_wh = self.anchor_wh.repeat(1, 1, self.nx, self.ny, 1).view(m, 2) * inv_ng

            p = p.view(m, self.no)
            p[:, :2] = (torch.sigmoid(p[:, 0:2]) + flat_grid) * inv_ng  # x, y
            p[:, 2:4] = torch.exp(p[:, 2:4]) * flat_anchor_wh  # width, height
            p[:, 4:] = torch.sigmoid(p[:, 4:])
            p[:, 5:] = p[:, 5:self.no] * p[:, 4:5]
            return p

        # Inference / validation: decode on a copy so the reshaped raw
        # prediction p can be returned untouched alongside it.
        io = p.clone()  # inference output
        # xy on the feature map: sigmoid(t_xy) + cell offset.
        io[..., :2] = torch.sigmoid(io[..., :2]) + self.grid
        # wh on the feature map: exp(t_wh) * anchor size.
        io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh
        # Scale the box back to input-image pixels.
        io[..., :4] *= self.stride
        # Objectness and class scores go through sigmoid in place.
        io[..., 4:].sigmoid_()
        # e.g. [1, 3, 13, 13, 85] is viewed as [1, 507, 85]; p keeps the
        # (bs, na, ny, nx, no) layout.
        return io.view(bs, -1, self.no), p
class Darknet(nn.Module):
    """
    YOLOv3 spp object detection model
    """

    def __init__(self, cfg, img_size=(416, 416), verbose=False):
        """
        :param cfg: network description handed to parse_model_cfg.
        :param img_size: int or pair; an int is expanded to ``[img_size] * 2``.
        :param verbose: forwarded to info() to print a detailed description.
        """
        super().__init__()
        self.input_size = [img_size] * 2 if isinstance(img_size, int) else img_size
        # Parse the cfg into per-layer definitions, then build the layers.
        self.module_defs = parse_model_cfg(cfg)
        self.module_list, self.routs = create_modules(self.module_defs, img_size)
        # Positions of the YOLOLayer modules inside module_list.
        self.yolo_layers = get_yolo_layers(self)

        # Print the model description unless exporting to ONNX.
        if not ONNX_EXPORT:
            self.info(verbose)

    def forward(self, x, verbose=False):
        """Run the network on input ``x``; see forward_once for the outputs."""
        return self.forward_once(x, verbose=verbose)

    def forward_once(self, x, verbose=False):
        """
        :param x: input batch.
        :param verbose: print the output shape of every layer while running.
        :return: in training mode the list of YOLOLayer outputs; under
            ONNX_EXPORT those outputs concatenated along dim 0; otherwise
            ``(x, p)`` with the decoded predictions concatenated along dim 1
            and the tuple of reshaped raw predictions.
        """
        # yolo_out gathers the output of each YOLOLayer; out gathers one entry
        # per layer so its indices match the layer indices.
        yolo_out, out = [], []
        shape_note = ""
        if verbose:
            print('0', x.shape)

        for i, module in enumerate(self.module_list):
            name = module.__class__.__name__
            if name in ["WeightedFeatureFusion", "FeatureConcat"]:  # sum, concat
                if verbose:
                    # module.layers holds the indices of the layers being
                    # combined; list them with their shapes.
                    layer_ids = [i - 1] + module.layers
                    shapes = [list(x.shape)] + [list(out[j].shape) for j in module.layers]
                    shape_note = ' >> ' + ' + '.join(
                        ['layer %g %s' % pair for pair in zip(layer_ids, shapes)])
                # These modules need the stored outputs in addition to x.
                x = module(x, out)
            elif name == "YOLOLayer":
                yolo_out.append(module(x))
            else:
                # Run the module directly (convolutional, upsample, maxpool, ...).
                x = module(x)

            # Keep the output only if a later layer reads it; otherwise store
            # an empty placeholder so indices stay aligned.
            out.append(x if self.routs[i] else [])
            if verbose:
                print('%g/%g %s -' % (i, len(self.module_list), name), list(x.shape), shape_note)
                shape_note = ''

        if self.training:  # train
            return yolo_out

        if ONNX_EXPORT:  # export
            # NOTE: filtering by objectness and by minimum box size was once
            # sketched here but is disabled; the raw concatenation is returned.
            return torch.cat(yolo_out, dim=0)

        # inference or test: each YOLOLayer returned (decoded, raw); regroup
        # them and join the decoded predictions of all scales along dim 1.
        decoded, raw = zip(*yolo_out)
        return torch.cat(decoded, 1), raw

    def info(self, verbose=False):
        """
        Print a description of the model.

        :param verbose: passed through to torch_utils.model_info.
        :return: None
        """
        torch_utils.model_info(self, verbose)
def get_yolo_layers(self):
    """
    Return the positions of the YOLOLayer modules in a model.

    :param self: object exposing an iterable ``module_list`` attribute.
    :return: list of indices whose module class is named 'YOLOLayer'
        (e.g. [89, 101, 113]).
    """
    yolo_indices = []
    for index, module in enumerate(self.module_list):
        # Match on the class name rather than on the class object itself.
        if module.__class__.__name__ == 'YOLOLayer':
            yolo_indices.append(index)
    return yolo_indices