YOLOv5 code study notes

Model parameter count calculation

torch_utils.py

def model_info(model, verbose=False, imgsz=640):
    """
    Prints model summary including layers, parameters, gradients, and FLOPs; imgsz may be int or list.

    Example: imgsz=640 or imgsz=[640, 320]
    """
    n_p = sum(x.numel() for x in model.parameters())  # number parameters
    n_g = sum(x.numel() for x in model.parameters() if x.requires_grad)  # number gradients
    if verbose:
        print(f"{'layer':>5} {'name':>40} {'gradient':>9} {'parameters':>12} {'shape':>20} {'mu':>10} {'sigma':>10}")
        for i, (name, p) in enumerate(model.named_parameters()):
            name = name.replace("module_list.", "")
            print(
                "%5g %40s %9s %12g %20s %10.3g %10.3g"
                % (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())
            )

    try:  # FLOPs
        p = next(model.parameters())
        stride = max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32  # max stride
        im = torch.empty((1, p.shape[1], stride, stride), device=p.device)  # input image in BCHW format
        flops = thop.profile(deepcopy(model), inputs=(im,), verbose=False)[0] / 1e9 * 2  # stride GFLOPs
        imgsz = imgsz if isinstance(imgsz, list) else [imgsz, imgsz]  # expand if int/float
        fs = f",imgsz {imgsz} {flops * imgsz[0] / stride * imgsz[1] / stride:.1f} GFLOPs"  # 640x640 GFLOPs
    except Exception:
        fs = ""
        import traceback
        traceback.print_exc()

    name = Path(model.yaml_file).stem.replace("yolov5", "YOLOv5") if hasattr(model, "yaml_file") else "Model"
    LOGGER.info(f"{name} summary: {len(list(model.modules()))} layers, {n_p} parameters, {n_g} gradients{fs}")

Hyperparameters: hyp.scratch-low.yaml

# Ultralytics YOLOv5 🚀, AGPL-3.0 license
# Hyperparameters for low-augmentation COCO training from scratch
# python train.py --batch 64 --cfg yolov5n6.yaml --weights '' --data coco.yaml --img 640 --epochs 300 --linear
# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials

lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3)
lrf: 0.01 # final OneCycleLR learning rate (lr0 * lrf)
momentum: 0.937 # SGD momentum/Adam beta1
weight_decay: 0.0005 # optimizer weight decay 5e-4
warmup_epochs: 3.0 # warmup epochs (fractions ok)
warmup_momentum: 0.8 # warmup initial momentum
warmup_bias_lr: 0.1 # warmup initial bias lr
box: 0.05 # box loss gain
cls: 0.5 # cls loss gain
cls_pw: 1.0 # cls BCELoss positive_weight
obj: 1.0 # obj loss gain (scale with pixels)
obj_pw: 1.0 # obj BCELoss positive_weight
iou_t: 0.20 # IoU training threshold
anchor_t: 4.0 # anchor-multiple threshold
# anchors: 3  # anchors per output layer (0 to ignore)
fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
hsv_v: 0.4 # image HSV-Value augmentation (fraction)
degrees: 0.0 # image rotation (+/- deg)
translate: 0.1 # image translation (+/- fraction)
scale: 0.5 # image scale (+/- gain)
shear: 0.0 # image shear (+/- deg)
perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
flipud: 0.0 # image flip up-down (probability)
fliplr: 0.5 # image flip left-right (probability)
mosaic: 1.0 # image mosaic (probability)
mixup: 0.0 # image mixup (probability)
copy_paste: 0.0 # segment copy-paste (probability)
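
train.py consumes this file by parsing the YAML into a plain dict and reading entries such as hyp['lr0'] when building the optimizer and scheduler. A simplified sketch (the path assumes the standard repo layout):

import yaml

with open("data/hyps/hyp.scratch-low.yaml", errors="ignore") as f:
    hyp = yaml.safe_load(f)
print(hyp["lr0"], hyp["lrf"], hyp["mosaic"])  # 0.01 0.01 1.0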

Model architecture configuration

train.py

def train(hyp, opt, device, callbacks):
    # ... (omitted)
    pretrained = weights.endswith(".pt")
    if pretrained:
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location="cpu")  # load checkpoint to CPU to avoid CUDA memory leak
        model = Model(cfg or ckpt["model"].yaml, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create

yolo.py 

The parse_model function parses the YAML config and builds the model structure.

def parse_model(d, ch):
    """Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture."""
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10}  {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, act, ch_mul = (
        d["anchors"],
        d["nc"],
        d["depth_multiple"],
        d["width_multiple"],
        d.get("activation"),
        d.get("channel_multiple"),
    )
    if act:
        Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
        LOGGER.info(f"{colorstr('activation:')} {act}")  # print
    if not ch_mul:
        ch_mul = 8
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            with contextlib.suppress(NameError):
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings

        n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in {
            Conv,
            GhostConv,
            Bottleneck,
            GhostBottleneck,
            SPP,
            SPPF,
            DWConv,
            MixConv2d,
            Focus,
            CrossConv,
            BottleneckCSP,
            C3,
            C3TR,
            C3SPP,
            C3Ghost,
            nn.ConvTranspose2d,
            DWConvTranspose2d,
            C3x,
        }:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, ch_mul)

            args = [c1, c2, *args[1:]]
            if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[x] for x in f)
        # TODO: channel, gw, gd
        elif m in {Detect, Segment}:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, ch_mul)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace("__main__.", "")  # module type
        np = sum(x.numel() for x in m_.parameters())  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info(f"{i:>3}{str(f):>18}{n_:>3}{np:10.0f}  {t:<40}{str(args):<30}")  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)
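
One detail worth unpacking is the save list: save.extend(x % i ...) records every layer index that some later Concat or Detect layer reads from (x % i resolves relative indices against the current layer i). A standalone sketch (not repo code) reproducing the yolov5s result from the 'from' fields alone:

# 'from' fields of yolov5s.yaml, layers 0-24 (backbone + head)
froms = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1,        # 0-9 backbone
         -1, -1, [-1, 6], -1, -1, -1, [-1, 4], -1, -1,  # 10-18 head
         [-1, 14], -1, -1, [-1, 10], -1, [17, 20, 23]]  # 19-24 head
save = []
for i, f in enumerate(froms):
    save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)
print(sorted(save))  # [4, 6, 10, 14, 17, 20, 23]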

yolov5s.yaml 

# Ultralytics YOLOv5 🚀, AGPL-3.0 license

# Parameters
nc: 80 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
anchors:
  - [10, 13, 16, 30, 33, 23] # P3/8
  - [30, 61, 62, 45, 59, 119] # P4/16
  - [116, 90, 156, 198, 373, 326] # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [
    [-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
    [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
    [-1, 3, C3, [128]],
    [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
    [-1, 6, C3, [256]],
    [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
    [-1, 9, C3, [512]],
    [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
    [-1, 3, C3, [1024]],
    [-1, 1, SPPF, [1024, 5]], # 9
  ]

# YOLOv5 v6.0 head
head: [
    [-1, 1, Conv, [512, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, "nearest"]],
    [[-1, 6], 1, Concat, [1]], # cat backbone P4
    [-1, 3, C3, [512, False]], # 13

    [-1, 1, Conv, [256, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, "nearest"]],
    [[-1, 4], 1, Concat, [1]], # cat backbone P3
    [-1, 3, C3, [256, False]], # 17 (P3/8-small)

    [-1, 1, Conv, [256, 3, 2]],
    [[-1, 14], 1, Concat, [1]], # cat head P4
    [-1, 3, C3, [512, False]], # 20 (P4/16-medium)

    [-1, 1, Conv, [512, 3, 2]],
    [[-1, 10], 1, Concat, [1]], # cat head P5
    [-1, 3, C3, [1024, False]], # 23 (P5/32-large)

    [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
  ]

Analysis of yolov5s.yaml parameters

depth_multiple scales each layer's repeat count: the number field in [from, number, module, args] is multiplied by it (rounded, with a minimum of 1), e.g. max(round(0.33 * 3), 1) = 1.

width_multiple scales the output channels. For args [64, 6, 2, 2] this means out_channels = 64 * width_multiple (rounded to a multiple of 8), kernel_size = (6, 6), stride = (2, 2), padding = (2, 2). (A sketch of both scaling rules follows the argument list below.)

The parsed configuration printed by parse_model is shown below.

from = -1 means the input comes from the previous layer; [-1, 6] means inputs come from both the previous layer and layer 6.

n = 2 means the layer is repeated twice.

arguments:

Conv: [in_channels, out_channels, kernel_size, stride, padding]

C3: [in_channels, out_channels, bottleneck_repeats]
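
A tiny sketch of the two scaling rules for yolov5s (make_divisible mirrors the helper in utils/general.py):

import math

def make_divisible(x, divisor):
    """Round x up to the nearest multiple of divisor."""
    return math.ceil(x / divisor) * divisor

gd, gw = 0.33, 0.50  # depth_multiple, width_multiple for yolov5s
n = max(round(3 * gd), 1)        # a C3 block with number=3 repeats only once
c2 = make_divisible(64 * gw, 8)  # Conv args [64, 6, 2, 2] -> 32 output channels
print(n, c2)  # 1 32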

      from  n    params  module                                  arguments                     
  0                -1  1      3520  models.common.Conv                      [3, 32, 6, 2, 2]              
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]                
  2                -1  1     18816  models.common.C3                        [64, 64, 1]                   
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]               
  4                -1  2    115712  models.common.C3                        [128, 128, 2]                 
  5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]              
  6                -1  3    625152  models.common.C3                        [256, 256, 3]                 
  7                -1  1   1180672  models.common.Conv                      [256, 512, 3, 2]              
  8                -1  1   1182720  models.common.C3                        [512, 512, 1]                 
  9                -1  1    656896  models.common.SPPF                      [512, 512, 5]                 
 10                -1  1    131584  models.common.Conv                      [512, 256, 1, 1]              
 11                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']          
 12           [-1, 6]  1         0  models.common.Concat                    [1]                           
 13                -1  1    361984  models.common.C3                        [512, 256, 1, False]          
 14                -1  1     33024  models.common.Conv                      [256, 128, 1, 1]              
 15                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']          
 16           [-1, 4]  1         0  models.common.Concat                    [1]                           
 17                -1  1     90880  models.common.C3                        [256, 128, 1, False]          
 18                -1  1    147712  models.common.Conv                      [128, 128, 3, 2]              
 19          [-1, 14]  1         0  models.common.Concat                    [1]                           
 20                -1  1    296448  models.common.C3                        [256, 256, 1, False]          
 21                -1  1    590336  models.common.Conv                      [256, 256, 3, 2]              
 22          [-1, 10]  1         0  models.common.Concat                    [1]                           
 23                -1  1   1182720  models.common.C3                        [512, 512, 1, False]          
 24      [17, 20, 23]  1    229245  models.yolo.Detect                      [80, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [128, 256, 512]]

Per-layer input/output analysis

yolo.py

    def _forward_once(self, x, profile=False, visualize=False):
        """Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
        # show_batch_imgs(x)
        y, dt = [], []  # outputs
        layer_index, y_shape = 0, []  # my code (0-based so it matches the layer dumps below)
        # model_info(model=self.model, verbose=True, imgsz=640)
        for m in self.model:
            # my print
            print('layer {}\nmodel {}\ninput shape {}'.
                  format(layer_index, m, x.shape if isinstance(x, torch.Tensor) else [z.shape for z in x]))
            # model_info(model=m,verbose=True,imgsz=640)
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
                print('m.f {}\nfrom earlier layers shape {}'.format(m.f, x.shape if isinstance(x, torch.Tensor) else [
                    z.shape for z in x]))
            if profile:
                self._profile_one_layer(m, x, dt)
            x = m(x)  # run
            # print('output shape {}'.format(x.shape if isinstance(x, torch.Tensor) else [z.shape for z in x]))
            y.append(x if m.i in self.save else None)  # save output
            if m.i in self.save:
                y_shape.append(x.shape if isinstance(x, torch.Tensor) else [z.shape for z in x])
            else:
                y_shape.append(None)
            print('save list {} y_shape {}'.format(self.save, y_shape))
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
            layer_index += 1
        return x

layer 0

model Conv(
  (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2), bias=False)
  (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 3, 256, 256])
output shape torch.Size([1, 32, 128, 128])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None]

layer 1

model Conv(
  (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 32, 128, 128])
output shape torch.Size([1, 64, 64, 64])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None]


layer 2


model C3(
  (cv1): Conv(
    (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 64, 64, 64])
output shape torch.Size([1, 64, 64, 64])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None]


layer 3


model Conv(
  (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 64, 64, 64])
output shape torch.Size([1, 128, 32, 32])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None]

layer 4


model C3(
  (cv1): Conv(
    (conv): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
    (1): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 128, 32, 32])
output shape torch.Size([1, 128, 32, 32])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32])]


layer 5


model Conv(
  (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 128, 32, 32])
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None]


layer 6


model C3(
  (cv1): Conv(
    (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
    (1): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
    (2): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 256, 16, 16])
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16])]


layer 7


model Conv(
  (conv): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 256, 16, 16])
output shape torch.Size([1, 512, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None]


layer 8


model C3(
  (cv1): Conv(
    (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 512, 8, 8])
output shape torch.Size([1, 512, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None]


layer 9


model SPPF(
  (cv1): Conv(
    (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)
)
input shape torch.Size([1, 512, 8, 8])
output shape torch.Size([1, 512, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None]


layer 10


model Conv(
  (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 512, 8, 8])
output shape torch.Size([1, 256, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8])]


layer 11


model Upsample(scale_factor=2.0, mode='nearest')
input shape torch.Size([1, 256, 8, 8])
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None]


layer 12


model Concat()
input shape torch.Size([1, 256, 16, 16])
m.f [-1, 6]
from earlier layers shape [torch.Size([1, 256, 16, 16]), torch.Size([1, 256, 16, 16])]
output shape torch.Size([1, 512, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None]


layer 13


model C3(
  (cv1): Conv(
    (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 512, 16, 16])
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None]


layer 14


model Conv(
  (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 256, 16, 16])
output shape torch.Size([1, 128, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16])]


layer 15


model Upsample(scale_factor=2.0, mode='nearest')
input shape torch.Size([1, 128, 16, 16])
output shape torch.Size([1, 128, 32, 32])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None]


layer 16


model Concat()
input shape torch.Size([1, 128, 32, 32])
m.f [-1, 4]
from earlier layers shape [torch.Size([1, 128, 32, 32]), torch.Size([1, 128, 32, 32])]
output shape torch.Size([1, 256, 32, 32])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None]


layer 17


model C3(
  (cv1): Conv(
    (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 256, 32, 32])
output shape torch.Size([1, 128, 32, 32])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32])]


layer 18


model Conv(
  (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 128, 32, 32])
output shape torch.Size([1, 128, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None]


layer 19


model Concat()
input shape torch.Size([1, 128, 16, 16])
m.f [-1, 14]
from earlier layers shape [torch.Size([1, 128, 16, 16]), torch.Size([1, 128, 16, 16])]
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None]


layer 20


model C3(
  (cv1): Conv(
    (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 256, 16, 16])
output shape torch.Size([1, 256, 16, 16])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None, torch.Size([1, 256, 16, 16])]


layer 21


model Conv(
  (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act): SiLU()
)
input shape torch.Size([1, 256, 16, 16])
output shape torch.Size([1, 256, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None, torch.Size([1, 256, 16, 16]), None]


layer 22


model Concat()
input shape torch.Size([1, 256, 8, 8])
m.f [-1, 10]
from earlier layers shape [torch.Size([1, 256, 8, 8]), torch.Size([1, 256, 8, 8])]
output shape torch.Size([1, 512, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None, torch.Size([1, 256, 16, 16]), None, None]


layer 23


model C3(
  (cv1): Conv(
    (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv2): Conv(
    (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (cv3): Conv(
    (conv): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): SiLU()
  )
  (m): Sequential(
    (0): Bottleneck(
      (cv1): Conv(
        (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
      (cv2): Conv(
        (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): SiLU()
      )
    )
  )
)
input shape torch.Size([1, 512, 8, 8])
output shape torch.Size([1, 512, 8, 8])
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None, torch.Size([1, 256, 16, 16]), None, None, torch.Size([1, 512, 8, 8])]


layer 24 Detect


model Detect(
  (m): ModuleList(
    (0): Conv2d(128, 255, kernel_size=(1, 1), stride=(1, 1))
    (1): Conv2d(256, 255, kernel_size=(1, 1), stride=(1, 1))
    (2): Conv2d(512, 255, kernel_size=(1, 1), stride=(1, 1))
  )
)
input shape torch.Size([1, 512, 8, 8])
m.f [17, 20, 23]
from earlier layers shape [torch.Size([1, 128, 32, 32]), torch.Size([1, 256, 16, 16]), torch.Size([1, 512, 8, 8])]
output shape [torch.Size([1, 3, 32, 32, 85]), torch.Size([1, 3, 16, 16, 85]), torch.Size([1, 3, 8, 8, 85])]
save list [4, 6, 10, 14, 17, 20, 23] y_shape [None, None, None, None, torch.Size([1, 128, 32, 32]), None, torch.Size([1, 256, 16, 16]), None, None, None, torch.Size([1, 256, 8, 8]), None, None, None, torch.Size([1, 128, 16, 16]), None, None, torch.Size([1, 128, 32, 32]), None, None, torch.Size([1, 256, 16, 16]), None, None, torch.Size([1, 512, 8, 8]), None]

YOLOv5 network layer classes

Conv 

class Conv(nn.Module):
    # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Initializes a standard convolution layer with optional batch normalization and activation."""
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        """Applies a convolution followed by batch normalization and an activation function to the input tensor `x`."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Applies a fused convolution and activation function to the input tensor `x`."""
        return self.act(self.conv(x))

Bottleneck

class Bottleneck(nn.Module):
    # Standard bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
        """Initializes a standard bottleneck layer with optional shortcut and group convolution, supporting channel
        expansion.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Processes input through two convolutions, optionally adds shortcut if channel dimensions match; input is a
        tensor.
        """
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))

C3

class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initializes C3 module with options for channel count, bottleneck repetition, shortcut usage, group
        convolutions, and expansion.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # optional act=FReLU(c2)
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Performs forward propagation using concatenated outputs from two convolutions and a Bottleneck sequence."""
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
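
As a quick check of the channel flow (assuming the YOLOv5 repo is on PYTHONPATH), layer 2 of yolov5s is C3(64, 64, n=1), so the hidden width is c_ = int(64 * 0.5) = 32, matching the layer 2 printout earlier:

import torch
from models.common import C3  # requires the YOLOv5 repo on PYTHONPATH

m = C3(64, 64, 1)               # cv1/cv2: 64->32, cv3: 2*32->64
x = torch.randn(1, 64, 64, 64)
print(m(x).shape)               # torch.Size([1, 64, 64, 64])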

BottleneckCSP

class BottleneckCSP(nn.Module):
    # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initializes CSP bottleneck with optional shortcuts; args: ch_in, ch_out, number of repeats, shortcut bool,
        groups, expansion.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.SiLU()
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Performs forward pass by applying layers, activation, and concatenation on input x, returning feature-
        enhanced output.
        """
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))

yolov5 NMS

NMS (Non-Maximum Suppression) in YOLOv5 is a common post-processing technique in object detection. It removes redundant bounding boxes and keeps only those most likely to contain an object, improving both detection accuracy and efficiency. The following is a detailed breakdown of NMS in YOLOv5.

1. Basic principle of NMS

The idea is this: for a single object, the detector typically produces several bounding boxes that overlap to some degree. NMS computes the IoU (Intersection over Union) between these boxes to decide whether they cover the same object, keeps the box with the highest confidence, and suppresses (deletes) the other heavily overlapping boxes.
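
For reference, the IoU of two boxes in (x1, y1, x2, y2) form is the intersection area divided by the union area. A minimal helper (illustrative, not repo code):

def iou_xyxy(a, b):
    """IoU of two (x1, y1, x2, y2) boxes."""
    iw = max(0, min(a[2], b[2]) - max(a[0], b[0]))  # intersection width
    ih = max(0, min(a[3], b[3]) - max(a[1], b[1]))  # intersection height
    inter = iw * ih
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

print(iou_xyxy([0, 0, 10, 10], [5, 5, 15, 15]))  # 25 / 175 ≈ 0.143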

2. NMS implementation steps in YOLOv5

In YOLOv5, NMS typically follows these steps:

  1. Initial filtering: keep only the boxes whose confidence exceeds a threshold (e.g. 0.4). This cuts down the amount of work in the later steps.

  2. Confidence computation: for the remaining boxes, compute the final confidence, usually the product of the objectness score and the class probability.

  3. Box conversion: convert box coordinates from (center x, center y, width, height) to (x1, y1, x2, y2), i.e. the top-left and bottom-right corners.

  4. Sorting and suppression:

    • Sort all boxes by confidence in descending order.
    • Take the highest-confidence box as the reference and compute the IoU between it and each remaining box.
    • Delete (suppress) any box whose IoU with the reference exceeds a threshold (e.g. 0.5).
    • Pick the highest-confidence box among the survivors as the new reference and repeat until every box has been processed.
  5. Output: the boxes that survive NMS are the final detections returned by the network.

3. The role of NMS in YOLOv5

NMS is crucial in YOLOv5: by removing redundant boxes it reduces duplicate and spurious detections, making results more accurate and reliable, and because fewer boxes remain to process, it also speeds up detection.

4. Summary

NMS in YOLOv5 is an effective post-processing step: it computes the IoU between bounding boxes and suppresses the heavily overlapping ones. Thanks to NMS, YOLOv5 achieves high detection precision while remaining fast.

NMS code

general.py

def non_max_suppression()

Loss functions

The BCE (Binary Cross Entropy) loss in the YOLO (You Only Look Once) family is mainly used for classification, and in particular for computing the classification loss in object detection. A detailed explanation follows.

1. Overview of the BCE loss

BCE is the standard loss for binary classification, computed as:

$$L = -\frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log(p_i) + (1 - y_i) \log(1 - p_i) \right]$$

where $N$ is the number of samples, $y_i$ is the ground-truth label of sample $i$ (0 or 1), and $p_i$ is the predicted probability that sample $i$ is positive (usually obtained by passing the model output through a sigmoid).
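
A tiny numeric check of this formula against PyTorch's built-in (not YOLOv5 code; the numbers are made up):

import torch
import torch.nn as nn

logits = torch.tensor([2.0, -1.0, 0.5])  # raw model outputs
targets = torch.tensor([1.0, 0.0, 1.0])  # ground-truth labels y_i
p = torch.sigmoid(logits)                # p_i
manual = -(targets * p.log() + (1 - targets) * (1 - p).log()).mean()
builtin = nn.BCEWithLogitsLoss()(logits, targets)
print(manual.item(), builtin.item())     # both ≈ 0.3048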

2. BCE loss in YOLO

In the YOLO family, BCE is mainly used for the classification loss. For each predicted box the model outputs one or more class probabilities (from YOLOv3 onward, via a sigmoid), and BCE measures the gap between these probabilities and the ground-truth labels.

3. BCE loss in YOLOv5 and YOLOv8

  1. YOLOv5
    • In YOLOv5, both the classification loss and the objectness loss use BCE. The classification loss measures the gap between each predicted box's class scores and the true class; the objectness loss measures the gap between the predicted object-presence confidence and the ground truth.
    • For box regression, YOLOv5 uses the CIoU (Complete IoU) loss, which combines IoU, center-distance (DIoU) and an aspect-ratio penalty term to better capture the localization error between predicted and ground-truth boxes.
  2. YOLOv8
    • YOLOv8 keeps BCE for the classification loss, with some refinements to the model structure and loss design.
    • Notably, YOLOv8 drops YOLOv5's objectness loss (the "is there an object in this region" loss) and instead folds "is there an object of this class in this region" directly into the classification loss via one-hot targets. This improves weight utilization and, by strongly coupling presence with class, lets the labels guide the model's class discrimination more directly.

4. Strengths and limitations of the BCE loss

Strengths:

  • Suited to binary and multi-label classification.
  • Simple to compute and easy to implement.
  • Class imbalance can be mitigated by reweighting (e.g. a positive-class weight).

Limitations:

  • For multi-class problems with many classes, applying BCE per class can be computationally heavy.
  • BCE can be sensitive to noisy labels.

In summary, BCE plays a key role in the YOLO family, especially for the classification loss, and keeps being refined as the models evolve.

stride=32

In YOLOv5, stride is an important concept: it is the size of the input-image region that each feature-map cell corresponds to. stride=32 means that at some layer, each feature-map cell maps to a 32x32-pixel region of the original input. This mapping matters for projecting anchor boxes back onto the original image and for handling objects at different scales.

The YOLOv5 architecture downsamples progressively (e.g. with stride-2 convolutions), shrinking the feature maps while enlarging the receptive field (the input region each feature point can see). It predicts on feature maps at several scales, each with its own stride.

  • Small strides (e.g. 8 or 16) are used to detect smaller objects, since a smaller stride means a higher-resolution feature map that captures finer detail.
  • Large strides (e.g. 32, or 64 in the P6 variants) are used for larger objects: resolution drops, but the receptive field grows, letting the network "see" a larger region, which helps with big targets.

In the standard YOLOv5 models, stride=32 corresponds to the coarsest of the three feature maps (P5): strides 8, 16 and 32 are responsible for small, medium and large objects respectively.

Through its multi-scale prediction mechanism, YOLOv5 combines feature maps with different stride values to detect objects of different sizes accurately. This design makes it perform well on complex scenes with objects at many scales.

Note that implementation details can differ between versions and configurations, but the basic stride concept and the multi-scale prediction mechanism stay the same.

Walking through inference with real data

--imgsz [399]: with stride=32 this is rounded up to [416, 416].
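
This is check_img_size at work: it rounds the requested size up to a multiple of the model stride, roughly

import math

print(math.ceil(399 / 32) * 32)  # -> 416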

Image resize (letterbox)

Input image shape = [640, 427] (h, w); new_shape = [416, 416]

r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) #0.65

new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) # (278, 416)
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding dh=0 dw=138
dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding dw=138%32=10 dh=0
dw /= 2  # divide padding into 2 sides dw=5
dh /= 2  # dh=0
if shape[::-1] != new_unpad:  # resize
    im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
After the resize & add-border steps above, im.shape = (416, 288, 3).

Feature extraction

With im.shape = (416, 288, 3), by the time the forward pass reaches the Detect layer the three feature maps have dimensions (b, c, h, w):

(1,128,52,36) (1,256,26,18) (1,512,13,9)

ModuleList(
  (0): Conv2d(128, 255, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(256, 255, kernel_size=(1, 1), stride=(1, 1))
  (2): Conv2d(512, 255, kernel_size=(1, 1), stride=(1, 1))
)

Passing these through the Detect 1x1 convolutions gives

(1,255,52,36) (1,255,26,18) (1,255,13,9)

x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

255 = 3 * 85 (3 anchors per cell, each predicting 4 box coords + 1 objectness + 80 class scores)

(1,3,52,36,85) (1,3,26,18,85) (1,3,13,9,85)

Merging the three middle dimensions (anchors x height x width) gives

(1,5616,85) (1,1404,85) (1,351,85) 

Concatenating once more yields (1, 7371, 85), the final output of the forward pass at inference time.
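
A numeric check of the reshape/flatten above for the (416, 288) input (illustrative, mirroring the view/permute line quoted earlier):

import torch

na, no = 3, 85
outs = []
for ny, nx in [(52, 36), (26, 18), (13, 9)]:
    x = torch.randn(1, na * no, ny, nx)                   # Detect conv output
    x = x.view(1, na, no, ny, nx).permute(0, 1, 3, 4, 2)  # (1, 3, ny, nx, 85)
    outs.append(x.contiguous().view(1, -1, no))           # (1, 3*ny*nx, 85)
print([o.shape[1] for o in outs], torch.cat(outs, 1).shape)
# [5616, 1404, 351] torch.Size([1, 7371, 85])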

NMS

The NMS post-processing roughly involves these steps (a sketch of the greedy loop follows the list):

1. From the (1, 7371, 85) output above, use the 5th entry of the last dimension (the objectness confidence) to keep only candidates with confidence > conf_thres.

2. The 85 values of the last dimension are x, y, w, h, conf, followed by the probabilities of the 80 classes.

3. Take the first 4 of the 85 values and convert x y w h -> x1 y1 x2 y2.

4. Sort the boxes by confidence. With boxes, scores (the confidences) and iou_thres (the overlap threshold used to drop duplicate boxes), run the NMS algorithm to obtain the final detections.
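
A minimal sketch of the greedy loop in step 4 (one plausible shape of the author's nms_1 below; torchvision.ops.nms computes the same thing natively):

import torch

def nms_sketch(boxes, scores, iou_thres):
    """Greedy NMS: boxes (N, 4) in xyxy, scores (N,) -> indices of kept boxes."""
    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        i = order[0]
        keep.append(int(i))
        if order.numel() == 1:
            break
        top, rest = boxes[i], boxes[order[1:]]
        lt = torch.max(top[:2], rest[:, :2])   # intersection top-left
        rb = torch.min(top[2:], rest[:, 2:])   # intersection bottom-right
        wh = (rb - lt).clamp(min=0)
        inter = wh[:, 0] * wh[:, 1]
        area = (top[2] - top[0]) * (top[3] - top[1])
        areas = (rest[:, 2] - rest[:, 0]) * (rest[:, 3] - rest[:, 1])
        iou = inter / (area + areas - inter)
        order = order[1:][iou <= iou_thres]    # drop boxes overlapping the kept one
    return torch.tensor(keep)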

general.py  def non_max_suppression()

def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """
    Non-Maximum Suppression (NMS) on inference results to reject overlapping detections.

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    # Checks
    assert 0 <= conf_thres <= 1, f"Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0"
    assert 0 <= iou_thres <= 1, f"Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0"
    if isinstance(prediction, (list, tuple)):  # YOLOv5 model in validation mode, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = "mps" in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    # the last dimension of prediction (85) means [x, y, w, h, conf, probabilities of the 80 classes]
    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    conf_1 = prediction[..., 4]  # (debug) objectness scores
    xc = prediction[..., 4] > conf_thres  # candidates: bool mask of conf > threshold, shape (1, 7371)

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    mi = 5 + nc  # mask start index
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        # xi is the image index: with a single image, prediction is (1, 7371, 85),
        # xi=0, xc has shape (1, 7371), so xc[xi] has shape (7371,)
        a1 = xc[xi]
        x = x[a1]  # keep only candidates with confidence > conf_thres (0.25)

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        a3 = x[:, 4:5]  # obj_conf, shape (55, 1)
        a4 = x[:, 5:]  # cls_prob, shape (55, 80)
        x[:, 5:] *= x[:, 4:5]  # cls_conf = obj_conf * cls_prob: every class probability shrinks by the objectness factor

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2), shape (55, 4)
        mask = x[:, mi:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            # among the remaining candidates, find the best class confidence and its class index
            conf, j = x[:, 5:mi].max(1, keepdim=True)
            # concatenate box, conf, class index, then keep rows with conf > conf_thres
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS
        # class index * max_wh (7680) offsets boxes of different classes far apart
        # (e.g. 22 * 7680 = 168960), so one class-agnostic NMS call never suppresses
        # boxes across different classes
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        # boxes are shifted by the per-class offset c; scores are the box confidences
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        # i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = nms_1(boxes, scores, iou_thres)  # author's own NMS (see the greedy sketch earlier)
        i = i[:max_det]  # limit detections
        if merge and (1 < n < 3e3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)
        if (time.time() - t) > time_limit:
            LOGGER.warning(f"WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded")
            break  # time limit exceeded

    return output

Walking through training with real data

loss

Dimensions of preds:

(3, 3, 80, 80, 85): batch=3; the second 3 and the 85 come from splitting the 255 output channels of the Detect convolutions; 80 = 640/8

(3, 3, 40, 40, 85): 40 = 640/16

(3, 3, 20, 20, 85): 20 = 640/32

20, 40 and 80 are the three feature-map sizes obtained by dividing the 640 input by the strides; with a maximum stride of 32, exactly these three scales are produced.

Dimensions of the targets matrix:

targets holds the ground-truth labels, shape (53, 6): 53 labels, whose 6 values are

(image index within the batch, class index, x, y, w, h)

Note: there are more labels here than in the raw annotations. YOLOv5 uses mosaic augmentation when loading data, so each training image is stitched together from 4 images and its labels are the union of those 4 images' labels.

The three loss terms (a sketch of how they are built follows the code):

lcls = torch.zeros(1, device=self.device)  # class loss
lbox = torch.zeros(1, device=self.device)  # box loss
lobj = torch.zeros(1, device=self.device)  # object loss
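
These connect back to hyp.scratch-low.yaml roughly as in utils/loss.py's ComputeLoss (an illustration, not a verbatim excerpt): cls_pw and obj_pw become BCE positive weights, and the box loss comes from CIoU:

import torch
import torch.nn as nn

hyp = {"cls_pw": 1.0, "obj_pw": 1.0, "fl_gamma": 0.0}  # from hyp.scratch-low.yaml
BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([hyp["cls_pw"]]))  # -> lcls
BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([hyp["obj_pw"]]))  # -> lobj
# fl_gamma > 0 would wrap both in a FocalLoss; lbox is (1 - CIoU) over matched boxes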

Reference: yolov5目标检测神经网络——损失函数计算原理 (CSDN blog)
