ConvNeXt-Yolo5

暴走辉

已于 2022-01-23 09:00:48 修改

阅读量5.4k

点赞数 14

分类专栏：深度学习深度学习项目实践文章标签： python 深度学习计算机视觉

于 2022-01-23 08:57:22 首次发布

本文链接：https://blog.csdn.net/m0_46179553/article/details/122647257

版权

深度学习同时被 2 个专栏收录

10 篇文章 1 订阅

订阅专栏

深度学习项目实践

2 篇文章 0 订阅

订阅专栏

背景简介

传说中的CV算法工程师抄作业必备手册，既然大佬们烧了那么多的电费为我们总结了这么多能work的trick,那岂有不抄的道理，具体论文的细节本文不做详述，论文地址如下：

https://arxiv.org/abs/2201.03545

以及大佬对此的详细解读

https://mp.weixin.qq.com/s/c6MRbzQE9ErFUWdWKh8PQA

本文仅做对工程应用的整合。

ConvNeXt-YoloV5

仍然以目标检测经典模型yolov5为例，对源代码做如下的修改增加

common.py

# 增加如下代码
#-------------------------------------ConvNeXt------------------------------------------------------
class Block(nn.Module):
    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.norm = LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
                                  requires_grad=True) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)

        x = input + self.drop_path(x)
        return x


class LayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x

class ConvNeXt_Block(nn.Module):  # index 0~3
    def __init__(self, index, in_chans, depths, dims, drop_path_rate=0., layer_scale_init_value=1e-6):
        super().__init__()

        self.index = index
        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.ModuleList()  # 4 feature resolution stages, each consisting of multiple residual blocks
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(
                *[Block(dim=dims[i], drop_path=dp_rates[cur + j],
                        layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])]
            )
            self.stages.append(stage)
            cur += depths[i]
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            trunc_normal_(m.weight, std=.02)
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.downsample_layers[self.index](x)
        x = self.stages[self.index](x)
        return x

yolo.py

# 修改parse_model函数
def parse_model(d, ch):  # model_dict, input_channels(3)
    LOGGER.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except:
                pass

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                 BottleneckCSP, C3, C3TR, C3SPP, C3Ghost]:
            c1, c2 = ch[f], args[0]   
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]

            if m in [BottleneckCSP, C3, C3TR, C3Ghost]:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum([ch[x] for x in f])
        elif m is Detect:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
# 添加加该部分代码
#---------------------------------------------            
        elif m is ConvNeXt_Block:
            c2 = args[0]
            args = args[1:]
#----------------------------------------------
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n_, np, t, args))  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)

yolov5_ConvNeXt.yaml


# Parameters
# 以convnext_tiny_1k为例
nc: 80  # number of classes
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  [[-1, 1, ConvNeXt_Block, [96, 0, 3, [3, 3, 9, 3], [96, 192, 384, 768]]],
   [-1, 1, ConvNeXt_Block, [192, 1, 3, [3, 3, 9, 3], [96, 192, 384, 768]]],
   [-1, 1, ConvNeXt_Block, [384, 2, 3, [3, 3, 9, 3], [96, 192, 384, 768]]],
   [-1, 1, ConvNeXt_Block, [768, 3, 3, [3, 3, 9, 3], [96, 192, 384, 768]]],
  ]

# YOLOv5 head
head:
  [[-1, 1, Conv, [768, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 2], 1, Concat, [1]],
   [-1, 3, C3, [768, False]],

   [-1, 1, Conv, [384, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 1], 1, Concat, [1]],
   [-1, 3, C3, [384, False]],

   [-1, 1, Conv, [384, 3, 2]],
   [[-1, 8], 1, Concat, [1]],
   [-1, 3, C3, [768, False]],

   [-1, 1, Conv, [768, 3, 2]],
   [[-1, 4], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [768, False]],  # 23 (P5/32-large)

   [[11, 14, 17], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]

加载骨干网络预训练权重

网络搭建完成后因为缺少预训练权重，而从零开始训练，会需要大量的时间和算力，也浪费了大佬们花了那么多电费为我们提供的骨干网络权重，因此我们需要将骨干网络的预训练权重更新到我们搭建的检测网络中，代码如下（以convnext_tiny_1k为例）：

    model_urls = {
        "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
        "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
        "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
        "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
        "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
        "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
        "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
    }

    url = model_urls['convnext_tiny_1k']
    checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
    # 提取模型需要的部分权重
    init_dict = {}
    for index in range(4):
        for k, v in list(checkpoint['model'].items()):
            if k.startswith('norm') or k.startswith('head'):
                pass
            else:
                init_dict['.'.join(['model', str(index), k])] = v

    # Create model
    model = Model(opt.cfg).to(device)
    model.train()

    # 更新Backbone部分网络权重
    model_dict = model.state_dict()
    model_dict.update(init_dict)
    model.load_state_dict(model_dict)