Theory
Understanding the Decoupled Head in YOLOX
The decoupled head was introduced in YOLOX. The original YOLO detection head is coupled: a single branch's output channels mix objectness, class scores, and box coordinates. A decoupled head instead predicts these three tasks in separate branches rather than jointly, which further improves detection accuracy.
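A minimal sketch of the difference in output layout (illustrative only; na is the number of anchors per location and nc the number of classes, with arbitrarily chosen values):

import torch.nn as nn

na, nc, c = 3, 80, 256  # anchors per location, classes, input channels (illustrative)

# Coupled head (YOLOv3/v5 style): one 1x1 conv predicts box, objectness and class together
coupled = nn.Conv2d(c, na * (nc + 5), 1)  # 5 = 4 box coordinates + 1 objectness score

# Decoupled head (YOLOX style): a separate branch per task
cls_branch = nn.Conv2d(c, na * nc, 1)  # class scores
reg_branch = nn.Conv2d(c, na * 4, 1)   # box coordinates
obj_branch = nn.Conv2d(c, na * 1, 1)   # objectness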
In the CVPR 2020 paper "Revisiting the Sibling Head in Object Detector", the authors argue that localization and classification focus on different things: classification cares about which existing category the extracted features most resemble, while localization cares about the coordinates of the GT box in order to refine the bounding-box parameters. Using the same feature map (a coupled head) for both classification and localization therefore degrades results.
In addition, the paper "Rethinking Classification and Localization for Object Detection" makes a similar point: an fc-head is better suited to classification, while a conv-head is better suited to localization.
The Decoupled Head in YOLOv6
YOLOv6 streamlines the decoupled head: balancing operator representational capacity against hardware compute cost, it redesigns a more efficient decoupled head using a Hybrid Channels strategy, which maintains accuracy while reducing latency and mitigates the extra latency overhead of the 3×3 convolutions in the decoupled head. In ablations on the nano-size model, compared with a decoupled head using the same number of channels, accuracy improves by 0.2% AP while speed improves by 6.8%. Concretely, there are two changes (see the parameter sketch after the list):
- Replace the final two 3×3 conv layers in the decoupled head with a single one;
- Set that 3×3 conv layer's output channels equal to its input channels.
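A back-of-the-envelope comparison of the 3×3 cost per branch (a sketch with an illustrative channel width; bias and BN parameters ignored):

def conv3x3_params(c_in, c_out):
    return c_in * c_out * 3 * 3  # weight count of a plain 3x3 conv

c = 128  # illustrative branch width
two_convs = 2 * conv3x3_params(c, c)  # before: two stacked 3x3 convs per branch
one_conv = conv3x3_params(c, c)       # YOLOv6: one 3x3 conv, out channels == in channels
print(two_convs, one_conv)            # 294912 147456: the 3x3 cost per branch is halved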
Using YOLOv6's Efficient Decoupled Head in YOLOv5
In yolo.py, add the following Decoupled_Detect class (it relies on Conv from models/common.py and check_version from utils/general.py, both of which yolo.py already imports):
class Decoupled_Detect(nn.Module):
    # YOLOv6-style Efficient Decoupled Head for YOLOv5 detection models
    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.empty(0) for _ in range(self.nl)]  # init grid
        self.anchor_grid = [torch.empty(0) for _ in range(self.nl)]  # init anchor grid
        self.register_buffer('anchors', torch.tensor(anchors).float().view(self.nl, -1, 2))  # shape(nl,na,2)
        self.m_stem = nn.ModuleList(Conv(x, x, 1) for x in ch)  # stem conv
        self.m_cls = nn.ModuleList(nn.Sequential(Conv(x, x, 3), nn.Conv2d(x, self.na * self.nc, 1)) for x in ch)  # cls conv
        self.m_reg_conf = nn.ModuleList(Conv(x, x, 3) for x in ch)  # shared reg/conf stem conv
        self.m_reg = nn.ModuleList(nn.Conv2d(x, self.na * 4, 1) for x in ch)  # reg conv
        self.m_conf = nn.ModuleList(nn.Conv2d(x, self.na * 1, 1) for x in ch)  # conf conv
        self.inplace = inplace  # use inplace ops (e.g. slice assignment)

    def forward(self, x):
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m_stem[i](x[i])  # stem conv
            bs, _, ny, nx = x[i].shape
            x_cls = self.m_cls[i](x[i]).view(bs, self.na, self.nc, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x_reg_conf = self.m_reg_conf[i](x[i])
            x_reg = self.m_reg[i](x_reg_conf).view(bs, self.na, 4, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x_conf = self.m_conf[i](x_reg_conf).view(bs, self.na, 1, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x[i] = torch.cat([x_reg, x_conf, x_cls], dim=4)

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                if isinstance(self, Segment):  # (boxes + masks)
                    xy, wh, conf, mask = x[i].split((2, 2, self.nc + 1, self.no - self.nc - 5), 4)
                    xy = (xy.sigmoid() * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh.sigmoid() * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf.sigmoid(), mask), 4)
                else:  # Detect (boxes only)
                    xy, wh, conf = x[i].sigmoid().split((2, 2, self.nc + 1), 4)
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, self.na * nx * ny, self.no))

        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

    def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, '1.10.0')):
        d = self.anchors[i].device
        t = self.anchors[i].dtype
        shape = 1, self.na, ny, nx, 2  # grid shape
        y, x = torch.arange(ny, device=d, dtype=t), torch.arange(nx, device=d, dtype=t)
        yv, xv = torch.meshgrid(y, x, indexing='ij') if torch_1_10 else torch.meshgrid(y, x)  # torch>=0.7 compatibility
        grid = torch.stack((xv, yv), 2).expand(shape) - 0.5  # add grid offset, i.e. y = 2.0 * x - 0.5
        anchor_grid = (self.anchors[i] * self.stride[i]).view((1, self.na, 1, 1, 2)).expand(shape)
        return grid, anchor_grid
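As a quick sanity check of the class (a hypothetical snippet; it assumes the class has been added to models/yolo.py and is run from the repository root):

import torch
from models.yolo import Decoupled_Detect

anchors = ([10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326])
head = Decoupled_Detect(nc=80, anchors=anchors, ch=(128, 256, 512)).train()
# P3/P4/P5 feature maps for a 640x640 input (strides 8/16/32)
feats = [torch.zeros(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]
for out in head(feats):  # training mode returns the raw per-scale tensors
    print(out.shape)  # torch.Size([1, 3, ny, nx, 85]): 4 box + 1 obj + 80 cls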
In addition, add the following code to the BaseModel class and the DetectionModel class in yolo.py:
# ------------------------ add in the BaseModel class ---------------------
def _apply(self, fn):
    # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
    self = super()._apply(fn)
    m = self.model[-1]  # Detect()
    if isinstance(m, (Detect, Decoupled_Detect, Segment)):
        m.stride = fn(m.stride)
        m.grid = list(map(fn, m.grid))
        if isinstance(m.anchor_grid, list):
            m.anchor_grid = list(map(fn, m.anchor_grid))
    return self

# ------------------------ add in the DetectionModel class (__init__) ---------------------
if isinstance(m, (Detect, Decoupled_Detect, Segment)):
    s = 256  # 2x min stride
    m.inplace = self.inplace
    # ... the rest of this branch (stride computation, anchor check, bias init) is unchanged
Next, modify the _initialize_biases function in the DetectionModel class:
def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
    # https://arxiv.org/abs/1708.02002 section 3.3
    # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
    m = self.model[-1]  # Detect() module
    if isinstance(m, Detect):
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:5 + m.nc] += math.log(0.6 / (m.nc - 0.99999)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
    elif isinstance(m, Decoupled_Detect):
        for mi, s in zip(m.m_conf, m.stride):  # objectness branch
            b = mi.bias.view(m.na, -1)  # conv.bias(3) to (3,1)
            b.data += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
        for mi, s in zip(m.m_cls, m.stride):  # classification branch
            b = mi[-1].bias.view(m.na, -1)  # conv.bias(240) to (3,80)
            b.data += math.log(0.6 / (m.nc - 0.99999)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi[-1].bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
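The objectness bias follows the prior-initialization trick from the focal loss paper referenced above: assuming roughly 8 objects per 640×640 image spread over the (640/s)² cells of a stride-s level, the initial objectness logit is set to log(8 / (640/s)²), so the head starts out predicting a realistically small objectness probability. A quick check of the values being written:

import math
for s in (8, 16, 32):
    print(s, round(math.log(8 / (640 / s) ** 2), 2))  # 8 -6.68, 16 -5.3, 32 -3.91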
Finally, change Detect to Decoupled_Detect in the original yolov5s.yaml file:
# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license

# Parameters
nc: 80  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Decoupled_Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
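Note that for this yaml to build, parse_model in yolo.py must also treat Decoupled_Detect like Detect when wiring the input channel list into the head; a minimal sketch, assuming the v7.0-style parse_model (only the set membership changes):

# in parse_model(), extend the existing Detect/Segment special case:
elif m in {Detect, Decoupled_Detect, Segment}:
    args.append([ch[x] for x in f])  # feed the P3/P4/P5 channel widths into the head
    if isinstance(args[1], int):  # number of anchors
        args[1] = (len(args[1]) // 2) * args[1]
    if m is Segment:
        args[3] = make_divisible(args[3] * gw, 8)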
Experiments
The VisDrone Dataset
VisDrone dataset link
VisDrone was released in 2018 and extended in 2019 by Tianjin University and collaborators, motivated by the broad demand for UAV applications across many fields. For object detection it provides 10,209 images: 6,471 for training, 548 for validation, and 3,190 for testing. It also provides 96 video clips for object detection: 56 for training (24,201 frames in total), 7 for validation (2,819 frames), and 33 for testing (12,968 frames). Compared with other benchmarks, VisDrone features complex scenes, large scale variation, many small and densely packed objects, and heavy occlusion, which raises the bar for detection algorithms.
Results
Parameter counts and compute for the modified network:
from n params module arguments
0 -1 1 3520 models.common.Conv [3, 32, 6, 2, 2]
1 -1 1 18560 models.common.Conv [32, 64, 3, 2]
2 -1 1 18816 models.common.C3 [64, 64, 1]
3 -1 1 73984 models.common.Conv [64, 128, 3, 2]
4 -1 2 115712 models.common.C3 [128, 128, 2]
5 -1 1 295424 models.common.Conv [128, 256, 3, 2]
6 -1 3 625152 models.common.C3 [256, 256, 3]
7 -1 1 1180672 models.common.Conv [256, 512, 3, 2]
8 -1 1 1182720 models.common.C3 [512, 512, 1]
9 -1 1 656896 models.common.SPPF [512, 512, 5]
10 -1 1 131584 models.common.Conv [512, 256, 1, 1]
11 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest']
12 [-1, 6] 1 0 models.common.Concat [1]
13 -1 1 361984 models.common.C3 [512, 256, 1, False]
14 -1 1 33024 models.common.Conv [256, 128, 1, 1]
15 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, 'nearest']
16 [-1, 4] 1 0 models.common.Concat [1]
17 -1 1 90880 models.common.C3 [256, 128, 1, False]
18 -1 1 147712 models.common.Conv [128, 128, 3, 2]
19 [-1, 14] 1 0 models.common.Concat [1]
20 -1 1 296448 models.common.C3 [256, 256, 1, False]
21 -1 1 590336 models.common.Conv [256, 256, 3, 2]
22 [-1, 10] 1 0 models.common.Concat [1]
23 -1 1 1182720 models.common.C3 [512, 512, 1, False]
24 [17, 20, 23] 1 6771837 Decoupled_Detect [80, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [128, 256, 512]]
YOLOv5sEDhead summary: 254 layers, 13777981 parameters, 13777981 gradients, 28.6 GFLOPs
For comparison, the corresponding head layer and summary of the stock YOLOv5s with the coupled Detect head:
24 [17, 20, 23] 1 229245 Detect [80, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [128, 256, 512]]
YOLOv5s summary: 214 layers, 7235389 parameters, 7235389 gradients, 16.6 GFLOPs
Compared with YOLOv5s, the modified network's parameter count and compute increase substantially; this is exactly the weakness of the decoupled head.
On the VisDrone dataset, YOLOv5s achieves 32.9% mAP; with the decoupled head, mAP rises considerably to 34.7%, demonstrating the effectiveness of the decoupled head. However, the model grows from 14 MB to 26.2 MB. Such a large increase makes it hard to strike a good balance between parameter count and accuracy, and ill-suited to edge computing platforms such as Jetson.
Trained model download link, extraction code: pwx6