YOLOv5 Source Code Walkthrough

Notes:

  • This walkthrough is based on YOLOv5 v7.0, so some of the code may differ from yours; you can switch versions in GitLens
  • Reference video: https://www.bilibili.com/video/BV1Dt4y1x7Fz?p=3&vd_source=6bba72c45d736e5ed123a053dfa46fcb (that author covers v6.0)
  • Corrections are welcome if you spot any mistakes

1. Inference: detect.py

  • How to run

Enter on the command line:

python detect.py --source data/images/bus.jpg  (the path of the image to run inference on)
  • Import the standard libraries

      import argparse
      import csv
      import os
      import platform
      import sys
      from pathlib import Path
      import torch
    
  • Import project modules via relative paths

      from ultralytics.utils.plotting import Annotator, colors, save_one_box
      from models.common import DetectMultiBackend
      from utils.dataloaders import IMG_FORMATS, VID_FORMATS, LoadImages, LoadScreenshots, LoadStreams
      from utils.general import (LOGGER, Profile, check_file, check_img_size, check_imshow, check_requirements, colorstr, cv2,
                              increment_path, non_max_suppression, print_args, scale_boxes, strip_optimizer, xyxy2xywh)
      from utils.torch_utils import select_device, smart_inference_mode
    
  • Define the paths

      FILE = Path(__file__).resolve()
      #__file__ is the path of this detect.py file
      #resolve() yields the absolute path of detect.py
    
      ROOT = FILE.parents[0]  
      #the parent directory of detect.py, i.e. one level up
    
      if str(ROOT) not in sys.path:
      #sys.path is the list of module search paths
    
          sys.path.append(str(ROOT))  
          # add ROOT to PATH
          
      ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative: convert the absolute path into one relative to the current working directory
    
  • Once the module-level code has run, execution jumps to the bottom of the file:

      if __name__ == '__main__':
          opt = parse_opt()
          #parse the arguments passed on the command line, i.e. the path after --source shown earlier

          main(opt)  # run the main function defined in this file
    

First, the argument-parsing function parse_opt():

def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path or triton URL')
    parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)')
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='(optional) dataset.yaml path')
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-csv', action='store_true', help='save results in CSV format')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    parser.add_argument('--vid-stride', type=int, default=1, help='video frame-rate stride')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
    print_args(vars(opt))
    return opt

It first defines the arguments that can be passed on the command line:
for example --source,

--weights, which specifies the weights file,

and --imgsz, the image size the model predicts at.

  • Take one line as an example

      parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)')
    

default is the value used when the argument is not passed on the command line.

  • opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1 # expand

For example, when only the default [640] is present, it is expanded to [640, 640], giving imgsz a length of 2; see the sketch below.
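
A minimal illustration of this list-expansion trick (plain Python, independent of the repo):

      imgsz = [640]
      imgsz *= 2 if len(imgsz) == 1 else 1  # list repetition, not arithmetic: [640] -> [640, 640]
      print(imgsz)  # [640, 640]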

  • Print the arguments

      print_args(vars(opt))
    
  • Next the main function runs

      def main(opt):
          check_requirements(ROOT / 'requirements.txt', exclude=('tensorboard', 'thop'))
          #check that the Python dependencies listed in requirements.txt are installed

          run(**vars(opt))
          #image loading, inference, result saving and the rest of the pipeline all run inside this function
    
  • Now the run function, which we walk through in six parts

      def run(
      weights=ROOT / 'yolov5s.pt',  # model path or triton URL
      source=ROOT / 'data/images',  # file/dir/URL/glob/screen/0(webcam)
      data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
      imgsz=(640, 640),  # inference size (height, width)
      conf_thres=0.25,  # confidence threshold
      iou_thres=0.45,  # NMS IOU threshold
      max_det=1000,  # maximum detections per image
      device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
      view_img=False,  # show results
      save_txt=False,  # save results to *.txt
      save_csv=False,  # save results in CSV format
      save_conf=False,  # save confidences in --save-txt labels
      save_crop=False,  # save cropped prediction boxes
      nosave=False,  # do not save images/videos
      classes=None,  # filter by class: --class 0, or --class 0 2 3
      agnostic_nms=False,  # class-agnostic NMS
      augment=False,  # augmented inference
      visualize=False,  # visualize features
      update=False,  # update all models
      project=ROOT / 'runs/detect',  # save results to project/name
      name='exp',  # save results to project/name
      exist_ok=False,  # existing project/name ok, do not increment
      line_thickness=3,  # bounding box thickness (pixels)
      hide_labels=False,  # hide labels
      hide_conf=False,  # hide confidences
      half=False,  # use FP16 half-precision inference
      dnn=False,  # use OpenCV DNN for ONNX inference
      vid_stride=1,  # video frame-rate stride
      ):
    
  • Part 1: extra checks on source

          source = str(source)
          #force-cast source to a string
          

          save_img = not nosave and not source.endswith('.txt') 
          # save inference images

          is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
          # check whether source points to a media file
          # suffix is the file extension
          # IMG_FORMATS lists the supported image extensions
          # VID_FORMATS lists the supported video extensions

          is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
          # check whether source is a network stream URL
          # lower() converts the string to lowercase first
          

          webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
          # isnumeric() checks for a numeric string, e.g. '0' means open the local webcam

          screenshot = source.lower().startswith('screen')
          # check whether source is a screen capture
          if is_url and is_file:
              source = check_file(source)  # download
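
As a quick illustration of how these flags evaluate, here is a sketch with hypothetical inputs, mirroring the checks above (IMG_FORMATS/VID_FORMATS are trimmed for the example):

      from pathlib import Path

      IMG_FORMATS, VID_FORMATS = ('jpg', 'png'), ('mp4',)
      for source in ('data/images/bus.jpg', '0', 'rtsp://host/stream', 'screen'):
          is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
          is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
          webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
          print(source, is_file, is_url, webcam)
      # bus.jpg -> file; '0' -> webcam; the rtsp URL -> stream (webcam path); 'screen' -> screenshot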
    
  • Part 2: create a folder to save the results

      # Directories
          save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  
          # increment_path creates incrementing run folders, e.g. runs/detect/exp, exp2, exp3, ... as runs accumulate

          (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) 
          # make dir
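
The behaviour of increment_path can be sketched roughly like this (a simplified stand-in, not the repo's exact implementation):

      from pathlib import Path

      def increment_path_sketch(path, exist_ok=False):
          path = Path(path)
          if exist_ok or not path.exists():
              return path
          n = 2
          while Path(f'{path}{n}').exists():  # exp -> exp2 -> exp3 -> ...
              n += 1
          return Path(f'{path}{n}')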
    
  • Part 3: load the model

      # Load model
          device = select_device(device)
          #choose the device the model runs on, GPU or CPU

          model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
          # the model's backend framework, PyTorch, TorchScript, etc.; each framework loads models differently
          # weights: the model weights, e.g. yolov5s.pt
          # dnn: defaults to False
          # data: the 'data/coco128.yaml' dataset file
          # half: half-precision (FP16) inference, defaults to False

          stride, names, pt = model.stride, model.names, model.pt
          #the stride, the class names, and whether this is a native PyTorch model

          imgsz = check_img_size(imgsz, s=stride)  
          # check image size
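
check_img_size rounds the requested size up to a multiple of the model stride. A minimal sketch of that logic (assuming stride 32; this mirrors the make_divisible helper in utils.general):

      import math

      def check_img_size_sketch(imgsz, s=32):
          return [math.ceil(x / s) * s for x in imgsz]

      print(check_img_size_sketch([640, 640]))  # [640, 640]  (already a multiple of 32)
      print(check_img_size_sketch([650, 650]))  # [672, 672]  (rounded up)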
    
  • Part 4: set up the dataloader that feeds in the images to predict

      # Dataloader
      bs = 1  # batch_size: one image at a time
      if webcam:
          view_img = check_imshow(warn=True)
          dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
          bs = len(dataset)

      elif screenshot:
          dataset = LoadScreenshots(source, img_size=imgsz, stride=stride, auto=pt)
          
      else:
          dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
          #  LoadImages is the image-loading class


      vid_path, vid_writer = [None] * bs, [None] * bs
    

Now let's look at the LoadImages class

    class LoadImages:
        # YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`
        
        def __init__(self, path, img_size=640, stride=32, auto=True, transforms=None, vid_stride=1):
        # path is the source path passed in
        # img_size actually arrives as a list, i.e. img_size=[640, 640], as explained earlier for --imgsz
        # stride is the model's stride

            if isinstance(path, str) and Path(path).suffix == '.txt':  # *.txt file with img/vid/dir on each line
                path = Path(path).read_text().rsplit()
            files = []
            for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
                p = str(Path(p).resolve())  # convert a relative path into an absolute path

                if '*' in p:  # does the path contain a * wildcard?
                    files.extend(sorted(glob.glob(p, recursive=True)))  # glob


                elif os.path.isdir(p):  # is the path a directory?
                    files.extend(sorted(glob.glob(os.path.join(p, '*.*'))))  # dir


                elif os.path.isfile(p):  # is the path a single file?
                    files.append(p)  # p was a plain string; files is a list


                else:
                    raise FileNotFoundError(f'{p} does not exist')

            images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
            videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
            # iterate over files; x.split('.') splits on '.', so the last piece is the extension (e.g. jpg),
            # which is then checked against the supported formats
            
            ni, nv = len(images), len(videos)  # the lengths of the two lists

            self.img_size = img_size
            self.stride = stride
            self.files = images + videos  # image paths and video paths combined
            self.nf = ni + nv  # number of files 
            self.video_flag = [False] * ni + [True] * nv  # per-file video flag: False = image, True = video
            self.mode = 'image'
            self.auto = auto
            self.transforms = transforms  # optional
            self.vid_stride = vid_stride  # video frame-rate stride


            if any(videos):  # is there at least one video?
                self._new_video(videos[0])  # new video
            else:
                self.cap = None
            assert self.nf > 0, f'No images or videos found in {p}. ' \
                                f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}'
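
For instance (hypothetical folder contents), if source is a directory holding bus.jpg, zidane.jpg and clip.mp4, the attributes end up as:

      # files      -> ['bus.jpg', 'zidane.jpg', 'clip.mp4']   (globbed and sorted)
      # ni, nv     -> 2, 1
      # video_flag -> [False, False, True]    (only the last file is a video)
      # nf         -> 3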
  • Part 5: run model inference

      # Run inference
      model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  
      # warmup: push one blank image through the model to initialize it


      seen, windows, dt = 0, [], (Profile(), Profile(), Profile())


      for path, im, im0s, vid_cap, s in dataset: 
      ## iterate over dataset; the iteration logic lives in LoadImages.__iter__ and __next__
      ## path: the file path
      ## im: the resized image
      ## im0s: the original image
      ## vid_cap: None for images
      ## s: the string to print for this image
    
    
          # Preprocessing
          with dt[0]:
              im = torch.from_numpy(im).to(model.device) 
              ## torch.Size([3, 640, 480]): the RGB channels, image height, image width
              ## convert the numpy array into the tensor format PyTorch expects

              im = im.half() if model.fp16 else im.float()  
              # uint8 to fp16/32, depending on whether half precision is enabled

              im /= 255  
              # 0 - 255 to 0.0 - 1.0: normalize by dividing every pixel by 255

              if len(im.shape) == 3:  # does the image tensor have only 3 dimensions?
                  im = im[None]  # expand for batch dim -> torch.Size([1, 3, 640, 480])
    
    
    
          # Inference
          with dt[1]:
              visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
              ## visualize defaults to False; if True, intermediate feature maps are saved as well

              pred = model(im, augment=augment, visualize=visualize)  # torch.Size([1, 18900, 85])
              # augment: whether to use test-time augmentation; it can help accuracy but slows inference
              # pred holds all candidate boxes, far too many (18900) at this point; they still need filtering
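
              # Where do the 18900 boxes come from? A quick check (for a 640x480 input):
              # strides 8/16/32 give grids of 80x60, 40x30 and 20x15 = 4800 + 1200 + 300 = 6300 cells,
              # and 3 anchors per cell -> 6300 * 3 = 18900 boxes, each carrying 85 values
              # (4 box coordinates + 1 objectness score + 80 class scores)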
    
    
    
          # NMS: non-maximum suppression
          with dt[2]:
              pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)  # e.g. a list with 1 tensor of shape (5, 6): 5 surviving boxes, each xyxy + conf + class
              # conf_thres: the confidence threshold, default 0.25
              # max_det: the maximum number of detections per image, default 1000; any boxes beyond that are dropped
    
          # Second-stage classifier (optional)
          # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
    
          # Define the path for the CSV file
          csv_path = save_dir / 'predictions.csv'
    
          # Create or append to the CSV file
          def write_to_csv(image_name, prediction, confidence):
              data = {'Image Name': image_name, 'Prediction': prediction, 'Confidence': confidence}
              with open(csv_path, mode='a', newline='') as f:
                  writer = csv.DictWriter(f, fieldnames=data.keys())
                  if not csv_path.is_file():
                      writer.writeheader()
                  writer.writerow(data)
    
    
    
    
          # Process predictions: draw all detection boxes onto the original image and save the results
          for i, det in enumerate(pred):  # iterate over pred; det holds the predicted boxes of one image
              seen += 1   # incremented once per processed image
              if webcam:  # batch_size >= 1
                  p, im0, frame = path[i], im0s[i].copy(), dataset.count
                  s += f'{i}: '
              else:
                  p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)
    
    
              p = Path(p)  # to Path
              save_path = str(save_dir / p.name)  # the save directory joined with the image name gives the output path
              txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt


              s += '%gx%g ' % im.shape[2:]  # print string: append the image size

              gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # [w, h, w, h] of the original image, used to normalize boxes

              imc = im0.copy() if save_crop else im0  # copy the image if detected regions are to be cropped out and saved separately

              annotator = Annotator(im0, line_width=line_thickness, example=str(names))  # a dedicated drawing helper; example=str(names) passes the label names
    
              if len(det):  # det holds the predicted boxes of this image
                  # Rescale boxes from img_size to im0 size
                  det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()  # map the predicted coordinates back onto the original image so the boxes can be drawn there

                  # Print results
                  for c in det[:, 5].unique():
                      n = (det[:, 5] == c).sum()  # detections per class
                      s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string
                      # count the boxes of every class and append the counts to s for printing later
    
                  # Write results: save the predictions if requested
                  for *xyxy, conf, cls in reversed(det):
                      c = int(cls)  # integer class
                      label = names[c] if hide_conf else f'{names[c]}'
                      confidence = float(conf)
                      confidence_str = f'{confidence:.2f}'

                      if save_csv:
                          write_to_csv(p.name, label, confidence_str)

                      # save in txt format
                      if save_txt:  # Write to file
                          xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                          line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                          with open(f'{txt_path}.txt', 'a') as f:
                              f.write(('%g ' * len(line)).rstrip() % line + '\n')

                      # draw the box on the original image and save it
                      if save_img or save_crop or view_img:  # Add bbox to image
                          c = int(cls)  # integer class
                          label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                          # hide_labels: whether to hide the labels, default False
                          # if hide_conf is True, the drawn box omits the trailing confidence value

                          annotator.box_label(xyxy, label, color=colors(c, True))
                          ## call Annotator.box_label to draw the box


                      if save_crop:  # crop the detected region and save it as a separate image, default False
                          save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)
    
              # Stream results
              im0 = annotator.result()  ## the image with the boxes drawn on


              if view_img:  # default False
                  if platform.system() == 'Linux' and p not in windows:
                      windows.append(p)
                      cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)  # allow window resize (Linux)
                      cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0])
                  cv2.imshow(str(p), im0)
                  cv2.waitKey(1)  # 1 millisecond
    
              # Save results (image with detections)
              if save_img:
                  if dataset.mode == 'image':
                      cv2.imwrite(save_path, im0)  ## save the image with OpenCV
    
                  else:  # 'video' or 'stream'
                      if vid_path[i] != save_path:  # new video
                          vid_path[i] = save_path
                          if isinstance(vid_writer[i], cv2.VideoWriter):
                              vid_writer[i].release()  # release previous video writer
                          if vid_cap:  # video
                              fps = vid_cap.get(cv2.CAP_PROP_FPS)
                              w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                              h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                          else:  # stream
                              fps, w, h = 30, im0.shape[1], im0.shape[0]
                          save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                          vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                      vid_writer[i].write(im0)
    
          # Print time (inference-only)
          LOGGER.info(f"{s}{'' if len(det) else '(no detections), '}{dt[1].dt * 1E3:.1f}ms")
    
  • Part 6: print the summary output

      # Print results
          t = tuple(x.t / seen * 1E3 for x in dt)  # the average time per image: seen counts the processed images, dt holds the total time of each stage
    
          LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    
          if save_txt or save_img:
              s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
              LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
          if update:
              strip_optimizer(weights[0])  # update model (to fix SourceChangeWarning)
    

2. Network structure: yolo.py

  • Overview

Whether detect.py is running inference or train.py is training, both must create and load a model. That model structure is defined in the Model class in /yolov5/models/yolo.py.

  • First, the if __name__ == '__main__': block. Its presence means yolo.py can be run on its own: entering python yolo.py on the command line executes it directly.

We break this block into three parts.

  • Part 1: define the arguments

      parser = argparse.ArgumentParser()
      parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')  # the model configuration file
      parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs')
      parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
      parser.add_argument('--profile', action='store_true', help='profile model speed')
      parser.add_argument('--line-profile', action='store_true', help='profile model speed layer by layer')
      parser.add_argument('--test', action='store_true', help='test all yolo*.yaml')
      opt = parser.parse_args()
      opt.cfg = check_yaml(opt.cfg)  # check YAML
      print_args(vars(opt))
      device = select_device(opt.device)
    
  • Part 2: create the YOLOv5 model

      im = torch.rand(opt.batch_size, 3, 640, 640).to(device)
      #first generate a random image

      model = Model(opt.cfg).to(device)
      #initialize through the Model class, explained below
    

Both the model definition and the network construction use the Model class:

    class BaseModel(nn.Module):

    class DetectionModel(BaseModel):

    Model = DetectionModel  # retain YOLOv5 'Model' class for backwards compatibility

———————————————————————————————————————————————————————————————————————

    class BaseModel(nn.Module):
        # YOLOv5 base model
        def forward(self, x, profile=False, visualize=False):
            
        def _forward_once(self, x, profile=False, visualize=False):
            
        def _profile_one_layer(self, m, x, dt):
            
        def fuse(self):  
        
        def info(self, verbose=False, img_size=640):  
            
        def _apply(self, fn):

———————————————————————————————————————————————————————————————————————

    class DetectionModel(BaseModel):
        # YOLOv5 detection model
        def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  ## builds the network structure; explained below

        def forward(self, x, augment=False, profile=False, visualize=False):  # runs prediction on the input image

        def _forward_augment(self, x):
            
        def _descale_pred(self, p, flips, scale, img_size):
        
        def _clip_augmented(self, y):

        def _initialize_biases(self, cf=None):

The network-building process

    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  
    # cfg is the config file; yolov5s.yaml is explained below. There are 5 such configs in total, with yolov5n the least accurate but the fastest
    ## ch is the number of input channels
    ## nc and anchors passed in here override the values in yolov5s.yaml

A walkthrough of the yolov5s.yaml file

    # YOLOv5 🚀 by Ultralytics, AGPL-3.0 license

    # Parameters
    nc: 80  # number of classes: how many object categories yolov5s can predict

    depth_multiple: 0.33  
    # model depth multiple

    width_multiple: 0.50  
    # layer channel multiple

    anchors:  # predefined anchor boxes
    - [10,13, 16,30, 33,23]  # P3/8
    ## three anchors per level, here 10x13, 16x30 and 33x23; from the low level through the middle to the high level the anchors grow larger

    - [30,61, 62,45, 59,119]  # P4/16
    - [116,90, 156,198, 373,326]  # P5/32


    # YOLOv5 v6.0 backbone
    backbone:
    # [from, number, module, args]
    ## the layer-by-layer structure of yolov5
    [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2, layer 0
    ## take layer 0 as an example to explain the four fields [from, number, module, args]:
    ## from: which layer feeds this one; -1 means the previous layer
    ## number: how many modules this layer stacks; every module below with number != 1 is a C3 block, and e.g. layer 2 really has 3 * depth_multiple C3 modules
    ## module: the layer type; Conv is a convolution block, Upsample an upsampling layer, Concat a concatenation; all of them are defined in yolov5/models/common.py
    ## args: the arguments passed when the layer is built

    [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4, layer 1
    [-1, 3, C3, [128]],
    [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8, layer 3
    [-1, 6, C3, [256]],
    [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
    [-1, 9, C3, [512]],
    [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
    [-1, 3, C3, [1024]],
    [-1, 1, SPPF, [1024, 5]],  # 9
    ]

    # YOLOv5 v6.0 head
    head:
    [[-1, 1, Conv, [512, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    [[-1, 6], 1, Concat, [1]],  # cat backbone P4
    [-1, 3, C3, [512, False]],  # 13

    [-1, 1, Conv, [256, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, 'nearest']],
    [[-1, 4], 1, Concat, [1]],  # cat backbone P3
    [-1, 3, C3, [256, False]],  # 17 (P3/8-small)  detects small objects

    [-1, 1, Conv, [256, 3, 2]],
    [[-1, 14], 1, Concat, [1]],  # cat head P4
    [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)  detects medium objects

    [-1, 1, Conv, [512, 3, 2]],
    [[-1, 10], 1, Concat, [1]],  # cat head P5
    [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)  detects large objects

    [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
    ]
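
A quick worked example of how depth_multiple and width_multiple scale the listed values (a minimal sketch; make_divisible mirrors the helper in utils.general):

    import math

    gd, gw = 0.33, 0.50  # depth_multiple and width_multiple from yolov5s.yaml

    def make_divisible(x, divisor=8):  # round up to a multiple of 8, which is GPU-friendly
        return math.ceil(x / divisor) * divisor

    n = max(round(3 * gd), 1)         # a C3 layer listed with number=3 -> 1 actual repeat in yolov5s
    c2 = make_divisible(128 * gw, 8)  # a channel argument of 128 -> 64 output channels
    print(n, c2)                      # 1 64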

[Figure: the entire network structure (external image failed to load)]

We break the __init__ function into four parts.

  • __init__ part 1: load the config file passed in

              super().__init__()
              if isinstance(cfg, dict):
                  self.yaml = cfg  # model dict: cfg was already passed in as a dictionary
              else:  # is *.yaml
                  import yaml      # for torch hub; this library is made for handling .yaml files
                  self.yaml_file = Path(cfg).name  ## the file name
                  with open(cfg, encoding='ascii', errors='ignore') as f:  ## actually open the file
                      self.yaml = yaml.safe_load(f)  # model dict: stored as a Python dictionary
    
  • __init__ part 2: build the layers one by one from the loaded config

              # Define model
              ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
              ## look up the 'ch' key in the dict; if it is absent, fall back to the ch argument passed in earlier, i.e. ch=3

              if nc and nc != self.yaml['nc']:  ## was nc passed in, and does it differ from the yaml value?
                  LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
                  self.yaml['nc'] = nc  # override yaml value
              if anchors:
                  LOGGER.info(f'Overriding model.yaml anchors with anchors={anchors}')
                  self.yaml['anchors'] = round(anchors)  # override yaml value

              self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist: build every layer from the yaml values to obtain the yolov5 model; the parse_model function is dissected below

              self.names = [str(i) for i in range(self.yaml['nc'])]  # default names: placeholder class names '0', '1', '2', ...; they are replaced later, e.g. 0 for detecting people, 1 for detecting cars

              self.inplace = self.yaml.get('inplace', True)  #  read the inplace key from the yaml, returning True if it is missing
    

Dissecting the parse_model function

    def parse_model(d, ch):  # model_dict, input_channels(3): d is the yolov5s.yaml dict, ch a single-element list [3]

        # Parse a YOLOv5 model.yaml dictionary

        LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10}  {'module':<40}{'arguments':<30}")
        anchors, nc, gd, gw, act = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple'], d.get('activation')
        #pull anchors, nc, depth_multiple and width_multiple out of the yaml dict

        if act:
            Conv.default_act = eval(act)  # redefine default activation, i.e. Conv.default_act = nn.SiLU()
            LOGGER.info(f"{colorstr('activation:')} {act}")  # print

        na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors: first check whether anchors is a list; anchors[0] is simply its first row

        no = na * (nc + 5)  # number of outputs = anchors * (classes + 5): the final output channel count, 3 * (80 + 5) = 255

        layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
        ## layers stores every layer built below
        ## save records which layers' feature maps must be kept
        ## c2 is the output channel count and c1 below the input channel count; every layer keeps a c1 and a c2


        for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  
        # from, number, module, args: the four fields already explained in the yaml walkthrough

            m = eval(m) if isinstance(m, str) else m 
            # eval strings
            ## module arrives as a plain string: 'Conv' is just text that knows nothing about the real structure behind it; eval resolves the string to the actual class defined in common.py


            for j, a in enumerate(args):
                with contextlib.suppress(NameError):
                    args[j] = eval(a) if isinstance(a, str) else a  # eval strings: the same resolution trick applied to the args


            n = n_ = max(round(n * gd), 1) if n > 1 else n  # depth gain
            ## compute the real repeat count: the 3 in front of a C3 block does not literally mean 3 layers,
            ## it is multiplied by the depth multiple gd, as mentioned in the yaml walkthrough

            if m in {
                    Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
                    BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x}:  ## dispatch on the module type and handle it accordingly

                c1, c2 = ch[f], args[0]  ## input and output channel counts

                if c2 != no:  # if not output
                    c2 = make_divisible(c2 * gw, 8)  ## apply the width multiple; multiples of 8 are very GPU-friendly, so any other value is rounded up to one

                args = [c1, c2, *args[1:]]  ## put c1, c2 and the remaining args together

                if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x}:
                    args.insert(2, n)  # number of repeats
                    n = 1
            elif m is nn.BatchNorm2d:
                args = [ch[f]]
            elif m is Concat:
                c2 = sum(ch[x] for x in f)
            # TODO: channel, gw, gd
            elif m in {Detect, Segment}:
                args.append([ch[x] for x in f])
                if isinstance(args[1], int):  # number of anchors
                    args[1] = [list(range(args[1] * 2))] * len(f)
                if m is Segment:
                    args[3] = make_divisible(args[3] * gw, 8)
            elif m is Contract:
                c2 = ch[f] * args[0] ** 2
            elif m is Expand:
                c2 = ch[f] // args[0] ** 2
            else:
                c2 = ch[f]

            m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module

            t = str(m)[8:-2].replace('__main__.', '')  # module type: the module name, with any '__main__.' prefix replaced by nothing

            np = sum(x.numel() for x in m_.parameters())  # number params: the parameter count of this layer

            m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params

            LOGGER.info(f'{i:>3}{str(f):>18}{n_:>3}{np:10.0f}  {t:<40}{str(args):<30}')  # print

            save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
            layers.append(m_)
            if i == 0:
                ch = []
            ch.append(c2)  ## the next layer uses this layer's output channels as its input channels

        return nn.Sequential(*layers), sorted(save)
        ## the save list [6, 4, 14, 10, 17, 20, 23] is returned sorted; these are the layers whose features must be kept
  • __init__ part 3: extra handling of the strides and anchors

              # Build strides, anchors
              m = self.model[-1]  # Detect()
              if isinstance(m, (Detect, Segment)):
                  s = 256  # 2x min stride
                  m.inplace = self.inplace
                  forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
                  m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
                  # create a blank 256x256 image with ch=3 and run one forward pass; predictions come out at
                  # the low, middle and high levels, which reveals the strides [8, 16, 32]

                  check_anchor_order(m)  ## verify the anchor order is correct

                  m.anchors /= m.stride.view(-1, 1, 1)  ## the anchors are defined in pixel coordinates, but they are used on the feature maps, so divide by the stride

                  self.stride = m.stride
                  self._initialize_biases()  # only run once
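
A small sketch of what that dummy forward pass yields (assuming the standard P3/P4/P5 detection heads):

      s = 256
      feature_heights = [32, 16, 8]                    # heights of the three output feature maps
      strides = [s / h for h in feature_heights]       # -> [8.0, 16.0, 32.0]
      p3_anchor = (10 / strides[0], 13 / strides[0])   # the first P3 anchor, now in grid units
      print(strides, p3_anchor)                        # [8.0, 16.0, 32.0] (1.25, 1.625)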
    
  • __init__ part 4: initialize the network parameters and print a summary

              # Init weights, biases
              initialize_weights(self)
              self.info()
              LOGGER.info('')
    
  • Part 3: profile, test, or fuse the model depending on the arguments

          if opt.line_profile:  # profile layer by layer
              model(im, profile=True)
    
          elif opt.profile:  # profile forward-backward
              results = profile(input=im, ops=[model], n=3)
    
          elif opt.test:  # test all models
              for cfg in Path(ROOT / 'models').rglob('yolo*.yaml'):
                  try:
                      _ = Model(cfg)
                  except Exception as e:
                      print(f'Error in {cfg}: {e}')
    
          else:  # report fused model summary
              model.fuse()
    

3. Training: train.py

  • At the top of the file the authors show the two ways to use it:

      Usage - Single-GPU training:
          $ python train.py --data coco128.yaml --weights yolov5s.pt --img 640  # from pretrained (recommended)
          ## trains from the pretrained yolov5s.pt weights; --data selects the coco128.yaml dataset file and --img sets the input image size

          $ python train.py --data coco128.yaml --weights '' --cfg yolov5s.yaml --img 640  # from scratch
          ## --cfg passes in the network structure to use; a model is built from that file from zero and trained from scratch
    
  • The variables defined at the start are mainly for distributed training. Beginners usually train on a single machine with one GPU card, so distributed training is not involved and these keep the default values shown:

      LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
      RANK = int(os.getenv('RANK', -1))
      WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
      GIT_INFO = check_git_info()
    
  • At the very bottom of the file sits the if __name__ == '__main__': block

      if __name__ == '__main__':
          opt = parse_opt()  ## parses the arguments used during training; we walked through a very similar function earlier, so it is not repeated here
          main(opt)   
    
  • Now the main function, which we look at in four parts

      def main(opt, callbacks=Callbacks()):
    
  1. main part 1: initial checks

         # Checks
         if RANK in {-1, 0}:
             print_args(vars(opt))  ## print the arguments this run uses
             check_git_status()    ## check whether the YOLOv5 GitHub repo has newer code and print a hint if it does
             check_requirements(ROOT / 'requirements.txt')  ## check the dependencies in requirements.txt are installed and print hints if not
    
  2. main part 2: behave differently depending on whether --resume was passed

         # Resume (from specified or most recent last.pt)
         if opt.resume and not check_comet_resume(opt) and not opt.evolve:
             last = Path(check_file(opt.resume) if isinstance(opt.resume, str) else get_latest_run())
             opt_yaml = last.parent.parent / 'opt.yaml'  # train options yaml
             opt_data = opt.data  # original dataset
             if opt_yaml.is_file():
                 with open(opt_yaml, errors='ignore') as f:
                     d = yaml.safe_load(f)
             
         ##### Resume means recovering from an interruption. Suppose you want to train 300 epochs on some
         ##### dataset but the machine crashes at epoch 200: you are left with a model trained for only 200
         ##### epochs while you need the 300-epoch result. --resume restores the previous training environment
         ##### and finishes the remaining 100 epochs. Since we use the pretrained yolov5s.pt here, this
         ##### parameter is not needed.
    
             else:
                 d = torch.load(last, map_location='cpu')['opt']
             opt = argparse.Namespace(**d)  # replace
             opt.cfg, opt.weights, opt.resume = '', str(last), True  # reinstate
             if is_url(opt_data):
                 opt.data = check_file(opt_data)  # avoid HUB resume auth timeout
         else:
             opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
                 check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) 
                 # checks: verify the paths of these configuration files

             assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
             ## cfg and weights must not both be empty, otherwise this raises: either describe how to build the network, or hand over a weights file to load

             if opt.evolve:
                 if opt.project == str(ROOT / 'runs/train'):  # if default project name, rename to runs/evolve
                     opt.project = str(ROOT / 'runs/evolve')
                     ## with --evolve, later results are saved under runs/evolve instead of the default runs/train

                 opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
             if opt.name == 'cfg':
                 opt.name = Path(opt.cfg).stem  # use model.yaml as name
             opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))
             ## the incrementing run path covered earlier
    
  3. main part 3: decide whether to use DDP training

         # DDP mode
         device = select_device(opt.device, batch_size=opt.batch_size)
         # choose between CPU and GPU

         ## the block below only runs for distributed training and is not covered here
         if LOCAL_RANK != -1:
             msg = 'is not compatible with YOLOv5 Multi-GPU DDP training'
             assert not opt.image_weights, f'--image-weights {msg}'
             assert not opt.evolve, f'--evolve {msg}'
             assert opt.batch_size != -1, f'AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size'
             assert opt.batch_size % WORLD_SIZE == 0, f'--batch-size {opt.batch_size} must be multiple of WORLD_SIZE'
             assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
             torch.cuda.set_device(LOCAL_RANK)
             device = torch.device('cuda', LOCAL_RANK)
             dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo',
                                     timeout=timedelta(seconds=10800))
    
  4. main part 4: start training

         # Train
         if not opt.evolve:
             train(opt.hyp, opt, device, callbacks)
             ## call the train function to start training

         # Evolve hyperparameters (optional)
         ## since --evolve was not passed, this branch is only summarized here
         ## --evolve automatically runs experiments under different hyperparameters and keeps the best set, avoiding manual tuning; it is extremely slow and resource-hungry, so it is rarely used
         else:
             # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
             meta = {
                 'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                 'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                 'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                 'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                 'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                 'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                 'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                 'box': (1, 0.02, 0.2),  # box loss gain
                 'cls': (1, 0.2, 4.0),  # cls loss gain
                 'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                 'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                 'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                 'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                 'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                 'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                 'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                 'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                 'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                 'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                 'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                 'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                 'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                 'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                 'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                 'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                 'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                 'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                 'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                 'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)
    
             with open(opt.hyp, errors='ignore') as f:
                 hyp = yaml.safe_load(f)  # load hyps dict
                 if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                     hyp['anchors'] = 3
             if opt.noautoanchor:
                 del hyp['anchors'], meta['anchors']
             opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
             # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
             evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
             if opt.bucket:
                 # download evolve.csv if exists
                 subprocess.run([
                     'gsutil',
                     'cp',
                     f'gs://{opt.bucket}/evolve.csv',
                     str(evolve_csv), ])
    
             for _ in range(opt.evolve):  # generations to evolve
                 if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                     # Select parent(s)
                     parent = 'single'  # parent selection method: 'single' or 'weighted'
                     x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                     n = min(5, len(x))  # number of previous results to consider
                     x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                     w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                     if parent == 'single' or len(x) == 1:
                         # x = x[random.randint(0, n - 1)]  # random selection
                         x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                     elif parent == 'weighted':
                         x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination
    
                     # Mutate
                     mp, s = 0.8, 0.2  # mutation probability, sigma
                     npr = np.random
                     npr.seed(int(time.time()))
                     g = np.array([meta[k][0] for k in hyp.keys()])  # gains 0-1
                     ng = len(meta)
                     v = np.ones(ng)
                     while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                         v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                     for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                         hyp[k] = float(x[i + 7] * v[i])  # mutate
    
                 # Constrain to limits
                 for k, v in meta.items():
                     hyp[k] = max(hyp[k], v[1])  # lower limit
                     hyp[k] = min(hyp[k], v[2])  # upper limit
                     hyp[k] = round(hyp[k], 5)  # significant digits
    
                 # Train mutation
                 results = train(hyp.copy(), opt, device, callbacks)
                 callbacks = Callbacks()
                 # Write mutation results
                 keys = ('metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss',
                         'val/obj_loss', 'val/cls_loss')
                 print_mutation(keys, results, hyp.copy(), save_dir, opt.bucket)
    
             # Plot results
             plot_evolve(evolve_csv)
             LOGGER.info(f'Hyperparameter evolution finished {opt.evolve} generations\n'
                         f"Results saved to {colorstr('bold', save_dir)}\n"
                         f'Usage example: $ python train.py --hyp {evolve_yaml}')
    

That completes the main function. Next we look at the train function used in part 4 above.

    def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
        save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \
            Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
            opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze
            ## copy these opt fields into local variables for later use

        callbacks.run('on_pretrain_routine_start')
        ## this call appears many times below; it drives the recording of the training logs


        # Directories
        w = save_dir / 'weights'  # weights dir
        # save_dir is the path where results are saved

        (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
        # create the weights folder if it does not already exist

        last, best = w / 'last.pt', w / 'best.pt'
        ## two weight files: the final epoch and the best-performing epoch

        # Hyperparameters
        if isinstance(hyp, str):
            with open(hyp, errors='ignore') as f:
                hyp = yaml.safe_load(f)  # load hyps dict
        LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))
        opt.hyp = hyp.copy()  # for saving hyps to checkpoints
        ## load the hyperparameters used during training, found in yolov5/data/hyps/hyp.scratch-low.yaml

        # Save run settings: record the run environment
        if not evolve:
            yaml_save(save_dir / 'hyp.yaml', hyp)  ## save the hyperparameters
            yaml_save(save_dir / 'opt.yaml', vars(opt))  ## save the command-line arguments this run used

        # Loggers
        data_dict = None
        if RANK in {-1, 0}:
            loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
            ##  logging utilities, mainly built on the wandb and TensorBoard libraries; look them up if interested, they are not covered here

            # Register actions
            for k in methods(loggers):
                callbacks.register_action(k, callback=getattr(loggers, k))

            # Process custom dataset artifact link
            data_dict = loggers.remote_dataset
            if resume:  # If resuming runs from remote artifact
                weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

        # Config
        plots = not evolve and not opt.noplots  
        # create plots: whether to draw charts of the training process and results

        cuda = device.type != 'cpu' 
        ## whether the machine supports CUDA

        init_seeds(opt.seed + 1 + RANK, deterministic=True) 
        ## initialize the random seeds so that training runs are reproducible

        # related to distributed training
        with torch_distributed_zero_first(LOCAL_RANK):
            data_dict = data_dict or check_dataset(data)  
            # check if None: verify the dataset exists
            ## path: ../datasets/coco128  # dataset root dir
            ## train: images/train2017  # train images (relative to 'path') 128 images
            ## val: images/train2017  # val images (relative to 'path') 128 images
            ## test:  # test images (optional)

        train_path, val_path = data_dict['train'], data_dict['val']
        # pull out the training-set and validation-set paths

        nc = 1 if single_cls else int(data_dict['nc'])  
        # number of classes, here the 80 COCO classes

        names = {0: 'item'} if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
        ## the number of class names must match nc, otherwise an error is raised

        is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt')  
        # COCO dataset: is this the full COCO dataset? False here

        # Model
        check_suffix(weights, '.pt')  
        # check weights
        # verify the weights argument ends with the .pt suffix

        pretrained = weights.endswith('.pt')
        if pretrained:
            with torch_distributed_zero_first(LOCAL_RANK):
                weights = attempt_download(weights)  
                # download if not found locally
                # if the file is missing, download yolov5s.pt from the official YOLOv5 repo

            ckpt = torch.load(weights, map_location='cpu') 
            # load checkpoint to CPU to avoid CUDA memory leak
            # store the loaded checkpoint

            model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  
            # create
            # create a fresh model, because the nc passed in may differ from the checkpoint's

            exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
            csd = ckpt['model'].float().state_dict()  
            # checkpoint state_dict as FP32
            # the pretrained weights as a state dict

            csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  
            # intersect
            # keep only the parameters shared by the two models

            model.load_state_dict(csd, strict=False)  
            # load
            LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
        else:
            model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        amp = check_amp(model)  # check AMP

        # Freeze: defaults to [0], i.e. nothing is frozen; [10] would freeze the first 10 backbone layers
        freeze = [f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
        for k, v in model.named_parameters():
            v.requires_grad = True  # train all layers
            # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
            if any(x in k for x in freeze):
                LOGGER.info(f'freezing {k}')
                v.requires_grad = False

        # Image size
        gs = max(int(model.stride.max()), 32) 
        # grid size (max stride)
        # how many times the top-level feature map is downscaled relative to the original input

        imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  
        # verify imgsz is gs-multiple
        # check that the input size is a multiple of 32; if not, it is automatically rounded up to one

        # Batch size: if set to -1, a suitable batch size is estimated automatically; the default is 16
        if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
            batch_size = check_train_batch_size(model, imgsz, amp)
            loggers.on_params_update({'batch_size': batch_size})

        # Optimizer
        nbs = 64  
        # nominal batch size
        ## the default batch size is 16: each step feeds 16 samples to the model, runs a forward pass,
        ## then backpropagates to compute gradients and update the weights
        ## if the hardware only allows batch size 16 but you want the training effect of batch size 64,
        ## the nominal batch size helps: with nbs = 64, nbs / batch_size = 4, so gradients from 4 mini-batches
        ## are accumulated before a single weight update, matching the behaviour of batch size 64

        accumulate = max(round(nbs / batch_size), 1)  
        # accumulate loss before optimizing
        ## the variable holding the accumulation count
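
        # Worked example (assumed defaults): nbs = 64, batch_size = 16 -> accumulate = max(round(64 / 16), 1) = 4,
        # so optimizer.step() fires once every 4 mini-batches, emulating an effective batch size of 64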

        hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
        ## rescale the weight-decay hyperparameter to match the effective batch size
        ## weight decay helps prevent overfitting during training

        optimizer = smart_optimizer(model, opt.optimizer, hyp['lr0'], hyp['momentum'], hyp['weight_decay'])
        

        # Scheduler
        if opt.cos_lr:
            lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
            ## the one_cycle (cosine) schedule

        else:
            lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf']  
            # linear schedule: lf is the learning-rate factor as a function of the epoch x;
            # it simplifies to ((hyp['lrf'] - 1.0) / epochs) * x + 1, a linear function of the form kx + b
            ## the factor is the coefficient the base learning rate gets multiplied by

        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
        # learning-rate decay
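
        # Worked example (assuming hyp['lrf'] = 0.01 and epochs = 300):
        # lf(0) = 1.0, lf(150) = 0.505, lf(300) = 0.01,
        # so the learning rate falls linearly from lr0 down to lr0 * lrf over the run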

        # EMA
        ema = ModelEMA(model) if RANK in {-1, 0} else None
        # exponential moving average: each parameter update also takes the parameter history into account, which helps training
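
        # Conceptually, after every update: ema_param = d * ema_param + (1 - d) * param,
        # where the decay d is close to 1 (ModelEMA ramps d up over the first updates)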

        # Resume: reload some state from the weights checkpoint
        best_fitness, start_epoch = 0.0, 0
        if pretrained:
            if resume:
                best_fitness, start_epoch, epochs = smart_resume(ckpt, optimizer, ema, weights, epochs, resume)
            del ckpt, csd

        # DP mode: if multiple GPUs are used (without DDP), apply data parallelism
        if cuda and RANK == -1 and torch.cuda.device_count() > 1:
            LOGGER.warning(
                'WARNING ⚠️ DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n'
                'See Multi-GPU Tutorial at https://docs.ultralytics.com/yolov5/tutorials/multi_gpu_training to get started.'
            )
            model = torch.nn.DataParallel(model)

        # SyncBatchNorm: related to distributed training
        if opt.sync_bn and cuda and RANK != -1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
            LOGGER.info('Using SyncBatchNorm()')


        # Trainloader
        # build the dataloader for the training set
        train_loader, dataset = create_dataloader(train_path,
                                                imgsz,
                                                batch_size // WORLD_SIZE,
                                                gs,
                                                single_cls,
                                                hyp=hyp,
                                                augment=True,
                                                cache=None if opt.cache == 'val' else opt.cache,
                                                rect=opt.rect,
                                                rank=LOCAL_RANK,
                                                workers=workers,
                                                image_weights=opt.image_weights,
                                                quad=opt.quad,
                                                prefix=colorstr('train: '),
                                                shuffle=True,
                                                seed=opt.seed)
        labels = np.concatenate(dataset.labels, 0)
        mlc = int(labels[:, 0].max())  
        # max label class: the largest class index appearing in the labels
        assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

        # Process 0
        if RANK in {-1, 0}:  ## build the dataloader for the validation set
            val_loader = create_dataloader(val_path,
                                        imgsz,
                                        batch_size // WORLD_SIZE * 2,
                                        gs,
                                        single_cls,
                                        hyp=hyp,
                                        cache=None if noval else opt.cache,
                                        rect=True,
                                        rank=-1,
                                        workers=workers * 2,
                                        pad=0.5,
                                        prefix=colorstr('val: '))[0]

            if not resume:
                if not opt.noautoanchor:
                    check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)  
                    # run AutoAnchor
                    # check whether the anchors suit this dataset

                model.half().float()  # pre-reduce anchor precision

            callbacks.run('on_pretrain_routine_end', labels, names)

        # DDP mode: multi-GPU training
        if cuda and RANK != -1:
            model = smart_DDP(model)

        # Model attributes
        nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
        # the number of detection layers, taken from the model
        ## used to scale the hyperparameters below

        hyp['box'] *= 3 / nl  # scale to layers
        hyp['cls'] *= nc / 80 * 3 / nl  # scale to classes and layers
        hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
        ## these hyperparameters are the coefficients in front of the loss terms: the total loss has three
        ## parts (box regression, class, objectness), and each is multiplied by a factor to balance them

        hyp['label_smoothing'] = opt.label_smoothing
        ## label smoothing

        model.nc = nc  # attach number of classes to model
        model.hyp = hyp  # attach hyperparameters to model
        model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
        model.names = names
        ## attach the class count, hyperparameters, class weights and label names to the model

        # Start training
        t0 = time.time()  ## timestamp for measuring the total training time
        nb = len(train_loader)  # number of batches
        nw = max(round(hyp['warmup_epochs'] * nb), 100)  # number of warmup iterations, max(3 epochs, 100 iterations)
        ## the number of warmup iterations

        # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
        last_opt_step = -1
        # the iteration counter value at the last parameter update

        maps = np.zeros(nc)  # mAP per class, 80 entries

        results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)

        scheduler.last_epoch = start_epoch - 1  # do not move
        scaler = torch.cuda.amp.GradScaler(enabled=amp)
        ## train with automatic mixed precision

        stopper, stop = EarlyStopping(patience=opt.patience), False
        ## if several consecutive epochs bring no improvement, training is stopped early automatically

        compute_loss = ComputeLoss(model)  # init loss class: the loss function
        callbacks.run('on_train_start')

        LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                    f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n'
                    f"Logging results to {colorstr('bold', save_dir)}\n"
                    f'Starting training for {epochs} epochs...')


        # loop over all the training epochs           
        for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
            callbacks.run('on_train_epoch_start')
            model.train()  ## switch the model into training mode

            # Update image weights (optional, single-GPU only)
            if opt.image_weights:
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
                ## class_weights reflects how many instances each class has: more instances, larger weight
                ## maps holds the per-class accuracy of the 80 classes, so 1 - maps is the inaccuracy
                ## a class recognized poorly gets a larger weight, raising its chance of being sampled

                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
                ## convert the class weights into per-image sampling weights: the more poorly recognized objects an image contains, the larger its weight
        
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
                ## weighted resampling: the dataset is no longer the original one but contains more hard samples, so the batches fed to the model below also contain more hard samples

            # Update mosaic border (optional)
            # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
            # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

            mloss = torch.zeros(3, device=device)  # mean losses
            ## 存放损失值

            ##显示进度条来展示训练进度
            if RANK != -1:
                train_loader.sampler.set_epoch(epoch)
            pbar = enumerate(train_loader)
            LOGGER.info(('\n' + '%11s' * 7) % ('Epoch', 'GPU_mem', 'box_loss', 'obj_loss', 'cls_loss', 'Instances', 'Size'))
            if RANK in {-1, 0}:
                pbar = tqdm(pbar, total=nb, bar_format=TQDM_BAR_FORMAT)  # progress bar

            ## zero the gradients
            optimizer.zero_grad()

            ## loop over the batches
            for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
                callbacks.run('on_train_batch_start')

                ni = i + nb * epoch  # number integrated batches (since train start)
                ## ni counts the number of batches processed since the start of training
                
                imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0

                # Warmup: for the first few batches of training, start from a small learning rate and gradually raise it to the configured initial learning rate
                if ni <= nw:
                    xi = [0, nw]  # x interp
                    # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                    accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                    for j, x in enumerate(optimizer.param_groups):
                    ## iterate over the optimizer's parameter groups

                        # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0

                        x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 0 else 0.0, x['initial_lr'] * lf(epoch)])
                        if 'momentum' in x:
                            x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])
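                    ## np.interp does linear interpolation between the two endpoints; with
                    ## hypothetical numbers nw = 100 and ni = 50 (halfway through warmup):
                    ##   bias lr:  np.interp(50, [0, 100], [0.1, 0.01])  -> 0.055  (falls toward lr0)
                    ##   momentum: np.interp(50, [0, 100], [0.8, 0.937]) -> 0.8685 (rises to momentum)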

                # Multi-scale: during training, randomly draw a scale factor and resize the input images with it, which gives a multi-scale training effect
                if opt.multi_scale:
                    sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs  # size
                    sf = sz / max(imgs.shape[2:])  # scale factor
                    if sf != 1:
                        ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                        imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
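                ## Worked example (hypothetical imgsz = 640, stride gs = 32): sz is a random
                ## multiple of 32 in [320, 960]; if sz = 512, then sf = 512 / 640 = 0.8 and a
                ## 640x640 batch is interpolated to 512x512 (shapes are rounded up to multiples of gs).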

                # Forward,前向传播
                with torch.cuda.amp.autocast(amp):
                    pred = model(imgs)  # forward
                    ## feed the images to the model to get the predictions

                    loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                    ## compute the loss from the predictions and the ground-truth boxes

                    if RANK != -1:
                        loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                    if opt.quad:
                        loss *= 4.

                # Backward: backpropagation
                scaler.scale(loss).backward()

                # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
                if ni - last_opt_step >= accumulate:
                    ## last_opt_step is the batch counter at the previous parameter update; once the difference reaches accumulate, enough gradients have been collected and the parameters are updated
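                    ## Worked example (hypothetical nbs = 64, batch_size = 16): accumulate ramps
                    ## up to 64 / 16 = 4, so the gradients of 4 consecutive batches are summed by
                    ## backward() before a single optimizer step, emulating a batch size of 64.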

                    scaler.unscale_(optimizer)  # unscale gradients
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip gradients
                    scaler.step(optimizer)  # optimizer.step
                    scaler.update()
                    optimizer.zero_grad()
                    if ema:
                        ema.update(model)
                    last_opt_step = ni

                # Log the training progress
                if RANK in {-1, 0}:
                    mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
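                    ## Incremental mean: before this line mloss averages the first i batches, so
                    ## (mloss * i + loss_items) / (i + 1) folds in the new batch; e.g. losses of
                    ## 2.0 then 4.0 give running means 2.0 and (2.0 * 1 + 4.0) / 2 = 3.0.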
                    mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                    pbar.set_description(('%11s' * 2 + '%11.4g' * 5) %
                                        (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                    callbacks.run('on_train_batch_end', model, ni, imgs, targets, paths, list(mloss))
                    if callbacks.stop_training:
                        return
                # end batch ------------------------------------------------------------------------------------------------

            # Scheduler: step the learning-rate schedule once per epoch
            lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
            scheduler.step()

            if RANK in {-1, 0}:
                # mAP
                ## evaluate on the validation set
                callbacks.run('on_train_epoch_end', epoch=epoch)
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
                ## copy these attributes from the model onto the EMA copy

                final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
                ## check whether the current epoch is the final one

                ## validate the model trained so far after every epoch; if --noval is set, validate only on the final epoch
                if not noval or final_epoch:  # Calculate mAP
                    results, maps, _ = validate.run(data_dict,
                                                    batch_size=batch_size // WORLD_SIZE * 2,
                                                    imgsz=imgsz,
                                                    half=amp,
                                                    model=ema.ema,
                                                    single_cls=single_cls,
                                                    dataloader=val_loader,
                                                    save_dir=save_dir,
                                                    plots=False,
                                                    callbacks=callbacks,
                                                    compute_loss=compute_loss)

                # Update best mAP
                fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
                ## fitness score
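                ## fitness is a weighted sum of [P, R, mAP@.5, mAP@.5:.95]; in YOLOv5's
                ## utils/metrics.py the weights are [0.0, 0.0, 0.1, 0.9], i.e.
                ## fitness ≈ 0.1 * mAP@.5 + 0.9 * mAP@.5:.95, favouring the stricter metric.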

                stop = stopper(epoch=epoch, fitness=fi)  # early stop check
                if fi > best_fitness:
                    ## if the current fitness is the best seen so far, record it
                    best_fitness = fi

                log_vals = list(mloss) + list(results) + lr
                callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)

                # Save model
                ## decide whether to save the model
                if (not nosave) or (final_epoch and not evolve):  # if save
                    ckpt = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'opt': vars(opt),
                        'git': GIT_INFO,  # {remote, branch, commit} if a git repo
                        'date': datetime.now().isoformat()}

                    # Save last, best and delete
                    ## save last.pt every epoch, best.pt when this epoch has the best fitness, and optional periodic checkpoints
                    torch.save(ckpt, last)
                    if best_fitness == fi:
                        torch.save(ckpt, best)
                    if opt.save_period > 0 and epoch % opt.save_period == 0:
                        torch.save(ckpt, w / f'epoch{epoch}.pt')
                    del ckpt
                    callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)

            # EarlyStopping
            if RANK != -1:  # if DDP training
                broadcast_list = [stop if RANK == 0 else None]
                dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
                if RANK != 0:
                    stop = broadcast_list[0]
            if stop:
                break  # must break all DDP ranks

            # end epoch ----------------------------------------------------------------------------------------------------

        # end training -----------------------------------------------------------------------------------------------------
        ## take the best-performing weights, run them on the validation set one last time, and print the final validation results

        if RANK in {-1, 0}:
            LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
            for f in last, best:
                if f.exists():
                    strip_optimizer(f)  # strip optimizers

                    ## run best.pt on the validation set
                    if f is best:
                        LOGGER.info(f'\nValidating {f}...')
                        results, _, _ = validate.run(
                            data_dict,
                            batch_size=batch_size // WORLD_SIZE * 2,
                            imgsz=imgsz,
                            model=attempt_load(f, device).half(),
                            iou_thres=0.65 if is_coco else 0.60,  # best pycocotools at iou 0.65
                            single_cls=single_cls,
                            dataloader=val_loader,
                            save_dir=save_dir,
                            save_json=is_coco,
                            verbose=True,
                            plots=plots,
                            callbacks=callbacks,
                            compute_loss=compute_loss)  # val best model with plots
                        if is_coco:
                            callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi)

            callbacks.run('on_train_end', last, best, epoch, results)

        torch.cuda.empty_cache()
        ## release cached GPU memory
        return results