YOLOv5 Series (2): Model Analysis

Abstract

YOLOv5 makes creating and modifying models remarkably friendly for newcomers. The composition of the model is managed through yaml files, so we can adapt the model to all kinds of scenarios simply by editing the yaml, without having to care how the file is processed internally.

At the same time, YOLOv5 combines a wide range of clever tricks, and its results have impressed the community. In many object detection competitions a large share of contestants pick YOLOv5 as their main network, and a key reason is exactly this convenient model management: the whole model can be configured through simple edits to a yaml file.

YOLOv5's codebase is well worth studying. Reading good code raises the standard of our own, and YOLOv5's factory-style approach to model construction is a pattern worth borrowing for large projects.

Introduction

1. The Main Components of the YOLOv5 Network

Like many one-stage detectors, YOLOv5 has a backbone that extracts image features; a neck that outputs those features at several scales so that objects of different sizes can be detected better (some papers have since argued that the network's final feature map alone is informative enough to detect both large and small objects, making this FPN-style design unnecessary); and finally a prediction head that maps the features to x, y, w, h, confidence, and class.

Let's first look at how YOLOv5 assembles the whole network, as shown in the figure below.
[Figure: overall YOLOv5 network architecture]
The input is a 608x608x3 image, which passes through a series of modules, mainly Focus, CBL, CSP1, CSP2, and SPP, and finally produces three output feature maps.

To see how these modules work, look in the yolov5/models folder, where two files build them: common.py and experimental.py.

Let's start with common.py.

2. The common.py File

These are the imports common.py depends on. As you can see, they include some plotting helpers (built on matplotlib) as well as xywh conversion utilities.
The requests library may look puzzling at first, but it enables a nice trick YOLOv5 provides: pulling a video stream over HTTP straight into the model for detection.

import math
from pathlib import Path

import numpy as np
import requests
import torch
import torch.nn as nn
from PIL import Image

from utils.datasets import letterbox
from utils.general import non_max_suppression, make_divisible, scale_coords, xyxy2xywh
from utils.plots import color_list, plot_one_box
from utils.torch_utils import time_synchronized

autopad computes, from the kernel size, how much padding is needed to keep the tensor's spatial shape unchanged ('same' padding):

def autopad(k, p=None):  # kernel, padding
    # Pad to 'same'
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p
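
A quick sanity check of autopad (a minimal sketch; the values follow directly from k // 2):

assert autopad(1) == 0  # 1x1 conv: no padding needed
assert autopad(3) == 1  # 3x3 conv: pad 1 keeps H and W
assert autopad(5) == 2  # 5x5 conv: pad 2
assert autopad((3, 5)) == [1, 2]  # per-dimension kernel sizes also work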

The basic convolution building block.
Parameters:
c1: input channels
c2: output channels
k: kernel size
s: stride
act: whether to apply the activation function
Returns:
the basic convolution layer

def DWConv(c1, c2, k=1, s=1, act=True):
    # Depthwise convolution
    return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)
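
The g=math.gcd(c1, c2) argument is what makes this depthwise: when c1 == c2, the group count equals the channel count, so every channel is convolved independently; gcd merely keeps the group count valid when the channel counts differ. A tiny illustration:

import math

assert math.gcd(64, 64) == 64   # c1 == c2: one group per channel (true depthwise)
assert math.gcd(64, 128) == 64  # c1 != c2: grouped conv with the largest valid group count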

Conv passes the input through a convolution layer, then batch normalization, then the activation function;
the act argument controls whether (and which) activation is applied.


class Conv(nn.Module):
    # Standard convolution
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)

        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))
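
A quick shape check of Conv (a minimal sketch, assuming the class above is in scope): with k=3, autopad supplies padding 1, so a stride-2 Conv cleanly halves the spatial size:

import torch

conv = Conv(3, 64, k=3, s=2)               # autopad(3) = 1 -> 'same'-style downsampling
x = torch.randn(1, 3, 608, 608)
assert conv(x).shape == (1, 64, 304, 304)  # stride 2 halves H and W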

The residual bottleneck block:


class Bottleneck(nn.Module):
    # Standard bottleneck

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        # e is a hyperparameter controlling the bottleneck width: the smaller e, the narrower the bottleneck
        # 'bottleneck' means: one Conv first shrinks the channel count, then a second Conv restores it; e controls how narrow the middle gets
        # experiments show this bottleneck structure reduces the number of trainable parameters nicely
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        # if shortcut is True (and channel counts match), the input is added to the output
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
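
A minimal shape check (assuming the Conv and Bottleneck classes above are in scope): with e=0.5 the hidden width is halved, and the shortcut fires because c1 == c2:

import torch

m = Bottleneck(256, 256, shortcut=True, e=0.5)  # hidden channels: int(256 * 0.5) = 128
x = torch.randn(1, 256, 32, 32)
assert m(x).shape == (1, 256, 32, 32)  # the residual add preserves the shape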

The CSP-based residual block:

class BottleneckCSP(nn.Module):
    # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks

    # n is the number of Bottleneck residual components
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(BottleneckCSP, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        # input x -> cv1 -> n Bottleneck modules -> cv3 -> y1
        y1 = self.cv3(self.m(self.cv1(x)))

        # input x -> cv2 (a plain 1x1 conv) -> y2
        y2 = self.cv2(x)

        # concatenate y1 and y2 along the channel dim -> BN -> activation -> cv4 -> output
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))

The C3 module is similar to BottleneckCSP, just with one Conv module fewer.

class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
        # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))

The SPP module.
In short: the input x first passes through a shared Conv layer, giving a reduced-channel x; that x is then fed through len(k) max-pooling layers with different kernel sizes (stride 1, padded so the spatial shape is preserved); the original x and the pooled outputs are collected as [x, m1(x), ..., m_len(k)(x)], concatenated along the channel dimension, and passed through a final Conv layer to produce the output.

class SPP(nn.Module):
    # Spatial pyramid pooling layer used in YOLOv3-SPP
    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
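
A quick shape check (a sketch, assuming the SPP class above is in scope): since every max-pool uses stride 1 and padding k // 2, the spatial size never changes; only the channel count grows before cv2 projects it back down:

import torch

spp = SPP(512, 1024, k=(5, 9, 13))
x = torch.randn(1, 512, 20, 20)
assert spp(x).shape == (1, 1024, 20, 20)  # channels change, H and W do not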

The Focus module.
It turns an input of shape (b, c, w, h) into an output of shape (b, 4c, w/2, h/2):
height and width are halved while the channel count is quadrupled. You can picture it as sampling every other pixel to form four half-resolution images and stacking them along the channel dimension,
then passing the result through a Conv.

class Focus(nn.Module):
    # Focus wh information into c-space
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
        # self.contract = Contract(gain=2)
    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
        # return self.conv(self.contract(x))
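
The four strided slices are just a space-to-depth rearrangement; nothing is discarded. A minimal check of the slicing on its own:

import torch

x = torch.randn(1, 3, 608, 608)
y = torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
               x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)
assert y.shape == (1, 12, 304, 304)  # (b,c,h,w) -> (b,4c,h/2,w/2), information preserved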

The following modules are used to reshape the input feature tensor:

class Contract(nn.Module):
    # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert H % s == 0 and W % s == 0, 'Indivisible gain'
        s = self.gain
        x = x.view(N, C, H // s, s, W // s, s)  # x(1,64,40,2,40,2)
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return x.view(N, C * s * s, H // s, W // s)  # x(1,256,40,40)

class Expand(nn.Module):
    # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert C % s ** 2 == 0, 'Indivisible gain'
        s = self.gain
        x = x.view(N, s, s, C // s ** 2, H, W)  # x(1,2,2,16,80,80)
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return x.view(N, C // s ** 2, H * s, W * s)  # x(1,16,160,160)
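
Contract and Expand are exact inverses of each other for the same gain, as a quick round trip shows (a sketch assuming both classes above are in scope):

import torch

x = torch.randn(1, 64, 80, 80)
c = Contract(gain=2)(x)  # (1, 64, 80, 80) -> (1, 256, 40, 40)
assert c.shape == (1, 256, 40, 40)
assert torch.equal(Expand(gain=2)(c), x)  # Expand undoes Contract exactly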


class Concat(nn.Module):
    # Concatenate a list of tensors along dimension
    def __init__(self, dimension=1):
        super(Concat, self).__init__()
        self.d = dimension

    def forward(self, x):
        return torch.cat(x, self.d)

NMS suppresses redundant proposals. When x is fed into non_max_suppression,
the predictions are treated as two pieces: boxes bbox[n, 4] and scores scores[n, classes].
First, the best-scoring class is found for each box, and the box is kept only if that score exceeds conf.
Then the pairwise IoU of the kept boxes is checked: whenever two boxes overlap with IoU above iou, only the higher-scoring one survives.


class NMS(nn.Module):
    # Non-Maximum Suppression (NMS) module
    conf = 0.25  # confidence threshold
    iou = 0.45  # IoU threshold
    classes = None  # (optional list) filter by class

    def __init__(self):
        super(NMS, self).__init__()

    def forward(self, x):
        return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)
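
To make that logic concrete, here is a minimal single-class sketch built on torchvision's NMS op (the real non_max_suppression in utils/general.py does considerably more: per-class offsets, batching, and xywh-to-xyxy conversion):

import torch
import torchvision

def simple_nms(boxes, scores, conf_thres=0.25, iou_thres=0.45):
    # boxes: (n, 4) in xyxy format; scores: (n,) best-class score per box
    keep = scores > conf_thres                           # 1) confidence filter
    boxes, scores = boxes[keep], scores[keep]
    idx = torchvision.ops.nms(boxes, scores, iou_thres)  # 2) IoU suppression, keeps higher scores
    return boxes[idx], scores[idx]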

The autoShape module is not invoked during training.
Once the model is trained, this wrapper reshapes input images so the model can be run for prediction conveniently.

class autoShape(nn.Module):
    # input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    classes = None  # (optional list) filter by class
    def __init__(self, model):
        super(autoShape, self).__init__()
        self.model = model.eval()

    def autoshape(self):
        print('autoShape already enabled, skipping... ')  # model already converted to model.autoshape()
        return self
    # imgs can be read in from several different sources (the official comment
    # inside forward lists them all); size is the inference size of your image,
    # e.g. the 608x608x3 input in the figure at the top
    def forward(self, imgs, size=640, augment=False, profile=False):
        # Inference from various sources. For height=720, width=1280, RGB images example inputs are:
        #   filename:   imgs = 'data/samples/zidane.jpg'
        #   URI:             = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(720,1280,3)
        #   PIL:             = Image.open('image.jpg')  # HWC x(720,1280,3)
        #   numpy:           = np.zeros((720,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,720,1280)  # BCHW
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        t = [time_synchronized()]
        p = next(self.model.parameters())  # for device and type
        # convert the images into the form the model expects
        if isinstance(imgs, torch.Tensor):  # torch
            return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference

        # Pre-process
      
        # the input may not be a single image: its form is [batch_size, image], so grab the number of images n and keep each image in the list imgs
        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
        # iterate over the images
        for i, im in enumerate(imgs):
            
            # this branch is rarely taken; it handles the special case where
            # the input is a URI. That can be quite effective: e.g. if you
            # trained an NSFW-image detector, you could crawl image URIs from
            # an address pool, store them, and feed them to the model one by
            # one, which is fairly efficient
            if isinstance(im, str):  # filename or uri
                im, f = Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im), im  # open
                im.filename = f  # for uri
            # this records the image's filename
            files.append(Path(im.filename).with_suffix('.jpg').name if isinstance(im, Image.Image) else f'image{i}.jpg')
            
            # What the following steps do:
            #   convert the image to a numpy array
            #   if it arrived as [C,H,W] (e.g. from a dataloader), transpose it back to [H,W,C]
            #   force 3 channels, and save the original [H,W] into the shape0 list
            #   gain g = input size (640 by default above) / max(H, W)
            #   the original [H,W] times g gives the model input shape, saved into the shape1 list
            #   finally, update imgs[i]
            im = np.array(im)  # to numpy
            if im.shape[0] < 5:  # image in CHW
                im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
            im = im[:, :, :3] if im.ndim == 3 else np.tile(im[:, :, None], 3)  # enforce 3ch input
            s = im.shape[:2]  # HWC
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
            imgs[i] = im  # update
  
        shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
        x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
        x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32
        t.append(time_synchronized())
        # Inference
        with torch.no_grad():
            y = self.model(x, augment, profile)[0]  # forward
        t.append(time_synchronized())
        
   
        # y then goes through NMS, finally yielding the preds [batch_size, n, 4] and scores [batch_size, n, 1]

        # Post-process
        y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
        for i in range(n):
            scale_coords(shape1, y[i][:, :4], shape0[i])
        t.append(time_synchronized())
        # hand the processed results to the Detections class for the final steps
       
        return Detections(imgs, y, files, t, self.names, x.shape)
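
A hedged usage sketch: once a trained Model is wrapped (via model.autoshape(), shown later in yolo.py), any of the input types listed in the comment above can be passed straight in:

model = model.autoshape()                   # wrap the trained model in autoShape
results = model('data/samples/zidane.jpg')  # returns a Detections object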

The sections above covered how YOLOv5's main modules are built. We can flexibly add our own modules to implement new functionality, then wire them into the yaml file configured below.

3. The Model yaml Configuration

YOLOv5 ships in four sizes: yolov5s.yaml, yolov5m.yaml, yolov5l.yaml, and yolov5x.yaml.
In the yaml configuration, depth_multiple and width_multiple scale the network's depth and width, which is how the different model sizes are realized.

# parameters

#nc is the number of classes in the dataset; you must change this when training on your own data
nc: 80  # number of classes

#depth_multiple and width_multiple control the depth and the channel counts
#a module may be repeated n times; the effective count comes from n * depth_multiple
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple

#each output feature level gets 3 anchor boxes
#anchors are obtained by clustering the dataset's bounding boxes
#the values below come from clustering COCO
# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32


#the model backbone; these modules are invoked in order, top to bottom
#example: [-1, 1, Focus, [64, 3]]
#first field, from: which module the input x comes from; -1 means the previous layer. Note that x may come from more than one module, in which case this field is a list
#second field, number: simply how many times this module is repeated
#third field, module: the module's name, as covered in common.py above
#fourth field, args: the module's constructor arguments

# YOLOv5 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 9, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 1, SPP, [1024, [5, 9, 13]]],
   [-1, 3, C3, [1024, False]],  # 9
  ]

# YOLOv5 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
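
To see what depth_multiple and width_multiple actually do to these numbers, here is the arithmetic parse_model (covered below) applies, sketched for the yolov5s values gd=0.33 and gw=0.50:

import math

def make_divisible(x, divisor):  # same behaviour as utils.general.make_divisible
    return math.ceil(x / divisor) * divisor

gd, gw = 0.33, 0.50  # yolov5s depth/width multiples

# depth: a backbone entry with number 9, e.g. [-1, 9, C3, [256]], becomes 3 repeats
assert max(round(9 * gd), 1) == 3

# width: C3's nominal 512 output channels become 256 actual channels
assert make_divisible(512 * gw, 8) == 256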

If you don't care how YOLOv5's factory-style code actually constructs the model, you can more or less stop reading here.
Next, we'll look at how yolo.py parses the yaml.

4. Walking Through yolo.py

This is the model's final module, Detect. It converts YOLOv5's feature maps into the final predicted boxes (pred box), classes (pred cls), and confidences (pred conf).

class Detect(nn.Module):
    stride = None  # strides computed during build
    export = False  # onnx export
	
    def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
    # During inference (i.e. when not training), the raw predictions are
    # processed further before output, making them directly usable later;
    # they contain three pieces of information: pred_box [x, y, w, h],
    # pred_conf [confidence] and pred_cls [cls0, cls1, cls2, ..., clsn]
    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
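
A worked example of the inference-time decode: with raw logits of 0 (sigmoid outputs of 0.5), an anchor of (10, 13) on the stride-8 level, and grid cell (5, 7), the box center lands half a cell past the grid point and the width/height equal the anchor. The xy term can shift roughly ±0.5 cells around the cell, and the wh term is bounded by 4x the anchor:

import torch

stride = 8.0
anchor = torch.tensor([10.0, 13.0])  # one anchor of the stride-8 level
grid_xy = torch.tensor([5.0, 7.0])   # the cell's grid coordinates

y = torch.zeros(4).sigmoid()                 # raw logits 0 -> sigmoid 0.5
xy = (y[0:2] * 2. - 0.5 + grid_xy) * stride  # (0.5*2 - 0.5 + grid) * 8
wh = (y[2:4] * 2) ** 2 * anchor              # (0.5*2)^2 * anchor

assert torch.allclose(xy, torch.tensor([44.0, 60.0]))  # pixels: (5.5, 7.5) * 8
assert torch.allclose(wh, anchor)                      # wh ranges from 0 to 4x anchor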

The model factory: you give it the model's yaml file plus a set of construction parameters.

class Model(nn.Module):
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super(Model, self).__init__()
        
        # check the format of the cfg argument
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.load(f, Loader=yaml.SafeLoader)  # model dict
        # get the input channel count
        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        
        # check whether the nc passed in matches the nc in the yaml
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        
        # parse the yaml file to get the complete model
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        # default class names
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        
        # the commented-out lines below can simulate a forward pass
        #print([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])
        #input()
        

        # Build strides, anchors
        # grab the model's last module, the Detect output module discussed above
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):  # sanity check
            s = 256  # 2x min stride
            # save each feature level's stride and rescale the anchors
            # into feature-map units; the strides are 8, 16, 32
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            # anchors shape: (3, 3, 2)
            m.anchors /= m.stride.view(-1, 1, 1)
            # check that the anchor order matches the stride order
            check_anchor_order(m)
            self.stride = m.stride
            # initialize biases: the conv layers that output cls and confidence get specially initialized biases (a known trick)
            self._initialize_biases()  # only run once
            # print('Strides: %s' % m.stride.tolist())
        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        if augment:
            # not called during training: this test-time augmentation trick
            # scales and flips the image and runs each variant through the model
            img_size = x.shape[-2:]  # height, width
            s = [1, 0.83, 0.67]  # scales
            f = [None, 3, None]  # flips (2-ud, 3-lr)
            y = []  # outputs
            for si, fi in zip(s, f):
                xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
                yi = self.forward_once(xi)[0]  # forward
                # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
                yi[..., :4] /= si  # de-scale
                if fi == 2:
                    yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
                elif fi == 3:
                    yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
                y.append(yi)
            return torch.cat(y, 1), None  # augmented inference, train
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

    def forward_once(self, x, profile=False):
        y, dt = [], []  # outputs
        # iterate over the model's modules
        for m in self.model:
            # check whether this module's input comes from the previous layer;
            # if not, fetch it from the saved outputs y (as a list when there are several sources)
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers

            if profile:  # print FLOPS and timing info
                o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPS
                t = time_synchronized()
                for _ in range(10):
                    _ = m(x)
                dt.append((time_synchronized() - t) * 100)
                print('%10.1f%10.0f%10.1fms %-40s' % (o, m.np, dt[-1], m.type))
                
            # x is the input; when the module takes multiple inputs, x is a list
            # the Detect module ultimately outputs x as a list of length 3
            x = m(x)  # run
            # check whether this output is used by modules other than the
            # immediately following one; if so, save it into y
            y.append(x if m.i in self.save else None)  # save output

        if profile:
            print('%.1fms total' % sum(dt))
        return x
        
    # the bias-initialization trick
    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
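
    # Plugging the defaults into the formulas above makes the trick concrete
    # (a sketch of the arithmetic, not extra behaviour):
    #   stride-8 objectness bias: math.log(8 / (640 / 8) ** 2) ≈ -6.68,
    #   i.e. a prior of roughly 8 objects per 640x640 image
    #   class bias with nc=80 and no class-frequency statistics:
    #   math.log(0.6 / (80 - 0.99)) ≈ -4.88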

    def _print_biases(self):
        m = self.model[-1]  # Detect() module
        for mi in m.m:  # from
            b = mi.bias.detach().view(m.na, -1).T  # conv.bias(255) to (3,85)
            print(('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))

    # def _print_weights(self):
    #     for m in self.model.modules():
    #         if type(m) is Bottleneck:
    #             print('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
        print('Fusing layers... ')
        for m in self.model.modules():
            if type(m) is Conv and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
                delattr(m, 'bn')  # remove batchnorm
                m.forward = m.fuseforward  # update forward
        self.info()
        return self

    def nms(self, mode=True):  # add or remove NMS module
        present = type(self.model[-1]) is NMS  # last layer is NMS
        if mode and not present:
            print('Adding NMS... ')
            m = NMS()  # module
            m.f = -1  # from
            m.i = self.model[-1].i + 1  # index
            self.model.add_module(name='%s' % m.i, module=m)  # add
            self.eval()
        elif not mode and present:
            print('Removing NMS... ')
            self.model = self.model[:-1]  # remove
        return self

    def autoshape(self):  # add autoShape module
        print('Adding autoShape... ')
        m = autoShape(self)  # wrap model
        copy_attr(m, self, include=('yaml', 'nc', 'hyp', 'names', 'stride'), exclude=())  # copy attributes
        return m

    def info(self, verbose=False, img_size=640):  # print model information
        model_info(self, verbose, img_size)

This function parses the model yaml: it reads the configuration, looks up the corresponding modules in common.py, and assembles them into a complete model.

def parse_model(d, ch):  # model_dict, input_channels(3)
    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']  # read anchors, depth and width multiples from the config
    
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    # layers holds the constructed modules
    # save holds the indices of modules whose outputs are reused later by other modules
    # c2 is the output channel count
    
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        # [from, number, module, args]
        # e.g. [-1, 1, Focus, [64, 3]]
        
        # resolve the module if it was given as a string
        m = eval(m) if isinstance(m, str) else m  # eval strings
        
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except:
                pass
        
        # n is how many times the module is repeated, scaled by gd
        n = max(round(n * gd), 1) if n > 1 else n  # depth gain
        
        # these are the modules defined in common.py;
        # if you add a new module to common.py, register it here
        if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
                 C3]:
            # ch starts as [3]
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in [BottleneckCSP, C3]:
                args.insert(2, n)  # number of repeats
                n = 1
        # is it a normalization module?
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        # is it a tensor concatenation module?
        elif m is Concat:
            c2 = sum([ch[x] for x in f])
        # is it the Detect module?
        elif m is Detect:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]
            
        # chain n copies of the module into m_
        m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
        # t is the module's name
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        # number of parameters in this module
        np = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
        # if a module's input does not come (only) from the previous layer, record the source indices
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
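
Putting it all together, a hedged end-to-end sketch (assumed to run from the yolov5 repo root so models/yolo.py and the yaml path resolve):

import torch
from models.yolo import Model

model = Model('models/yolov5s.yaml', ch=3, nc=80)
model.eval()  # in eval mode, Detect also returns the decoded predictions
with torch.no_grad():
    pred, feats = model(torch.zeros(1, 3, 640, 640))
print(pred.shape)  # (1, 25200, 85): 3 anchors * (80^2 + 40^2 + 20^2) cells, 85 = 4 box + 1 obj + 80 cls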
