1.错误尝试
在训练YOLOv8时,因为同时开了太多其他程序,训练到第100多轮(epoch)时程序崩溃了。上网查询如何从中断处接着训练,得知在YOLOv5中把resume改成True就可以。
在yolov8中也这样尝试,将ultralytics/yolo/cfg/default.yaml中的resume改成True发现并没有作用,感觉yolov8代码还是有很多bug
2.可行的方法
2.1 ultralytics/yolo/engine/trainer.py
找到check_resume和resume_training方法
在check_resume方法里,把resume赋值为中断的那次训练所保存的last.pt的路径
在resume_training方法里,把ckpt改为从中断的那次训练所保存的last.pt中加载
def check_resume(self, overrides):
    """Resolve the checkpoint to resume from and rebuild the training arguments from it.

    Temporary manual-resume hack: the value of ``self.args.resume`` is ignored and a hard-coded ``last.pt``
    path is used instead. Restore the commented-out upstream line once the interrupted run has finished.

    Args:
        overrides: Mapping of user-supplied argument overrides; only ``imgsz`` and ``batch`` are re-applied
            on top of the arguments stored in the checkpoint.

    Raises:
        FileNotFoundError: If any step of locating or loading the checkpoint fails.
    """
    # resume = self.args.resume  # upstream behaviour
    resume = 'runs/detect/train9/weights/last.pt'  # 🚀 manual resume: remember to change this back afterwards

    # Nothing to resume: just record the (falsy) flag.
    if not resume:
        self.resume = resume
        return

    try:
        points_to_file = isinstance(resume, (str, Path)) and Path(resume).exists()
        # Fall back to the most recent run when `resume` is not an existing path.
        last_ckpt = Path(check_file(resume) if points_to_file else get_latest_run())

        # If the dataset YAML recorded in the checkpoint is gone, use the currently configured one instead.
        saved_args = attempt_load_weights(last_ckpt).args
        if not Path(saved_args["data"]).exists():
            saved_args["data"] = self.args.data

        self.args = get_cfg(saved_args)
        self.args.model = str(last_ckpt)  # reinstate model

        # Allow imgsz/batch to be changed on resume, e.g. to reduce memory after a CUDA OOM crash.
        for key in ("imgsz", "batch"):
            if key in overrides:
                setattr(self.args, key, overrides[key])
    except Exception as e:
        raise FileNotFoundError(
            "Resume checkpoint not found. Please pass a valid checkpoint to resume from, "
            "i.e. 'yolo train resume model=path/to/last.pt'"
        ) from e

    self.resume = True
def resume_training(self, ckpt):
    """Resume YOLO training from a checkpoint's epoch, optimizer state and best fitness.

    Temporary manual-resume hack: the ``ckpt`` argument is ignored and the checkpoint is re-read from the
    hard-coded ``last.pt`` path below. Restore the upstream implementation once training has finished.

    Args:
        ckpt: Checkpoint supplied by the caller; replaced by the manually loaded checkpoint.
    """
    # 🚀 manual resume: remember to change this back afterwards
    manual_ckpt_path = 'runs/detect/train9/weights/last.pt'
    # Deserialize onto the CPU so the checkpoint's tensors are not restored to the GPU they were saved
    # from; the load_state_dict() calls below then populate the live optimizer / EMA objects.
    # NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which may reject checkpoints that
    # contain pickled model objects - confirm against the installed version.
    ckpt = torch.load(manual_ckpt_path, map_location="cpu")
    if ckpt is None:
        return

    best_fitness = 0.0
    start_epoch = ckpt["epoch"] + 1  # continue with the epoch after the last saved one

    if ckpt["optimizer"] is not None:
        self.optimizer.load_state_dict(ckpt["optimizer"])  # optimizer
        best_fitness = ckpt["best_fitness"]

    if self.ema and ckpt.get("ema"):
        self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict())  # EMA
        self.ema.updates = ckpt["updates"]

    if self.resume:
        # Fails when start_epoch is not positive, i.e. there is nothing left to resume.
        assert start_epoch > 0, (
            f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
            f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
        )
        LOGGER.info(
            f"Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs"
        )

    if self.epochs < start_epoch:
        LOGGER.info(
            f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."
        )
        self.epochs += ckpt["epoch"]  # finetune additional epochs

    self.best_fitness = best_fitness
    self.start_epoch = start_epoch

    # Already inside the final `close_mosaic` epochs: turn mosaic off in the dataloader right away.
    if start_epoch > (self.epochs - self.args.close_mosaic):
        self._close_dataloader_mosaic()
3.运行代码
恢复后的训练结果没有保存在被中断的train9目录下,但训练可以从中断处继续进行。
启动训练的方式有多种,这里使用自定义的Python脚本进行训练:
from ultralytics import YOLO
def main():
    """Build the custom YOLOv8 model from its YAML definition and start training."""
    # Alternative: create the model from pretrained weights.
    # model = YOLO('yolov8s.pt')
    # model.train(**{'cfg':'ultralytics/cfg/exp.yaml','data':'dataset/limit.yaml'})

    # 🥭 Original note: `pip setup.py install` (presumably: reinstall the package after editing it - TODO confirm)

    # Create the model from a YAML file; optionally load pretrained weights into it.
    model = YOLO('ultralytics/cfg/models/v8/magickv8.yaml')
    # model.load('yolov8s.pt')
    model.train(
        cfg='ultralytics/cfg/exp.yaml',
        data='datasets/limit.yaml',  # ultralytics\cfg\datasets\limit.yaml
    )

    # Validation example:
    # model = YOLO('runs/detect/train/weights/best.pt')
    # model.val(**{'data':'dataset/data.yaml','split':'test'})

    # Inference / detection example:
    # model = YOLO('runs/detect/train34/weights/best.pt')  # runs/detect/train/weights/best.pt
    # model.predict(source='datasets/limit/images/val', **{'save':True})

    # Export example:
    # model = YOLO('runs/detect/train34/weights/best.pt')
    # model.export(format='onnx', opset=12)
    # Console equivalent:
    # yolo export model=runs/detect/train4/weights/best.pt format=onnx imgsz=768 simplify=True


if __name__ == '__main__':
    main()
结果显示:
重要提示
训练完成后请把所有代码复原!!!
训练完成后请把所有代码复原!!!
训练完成后请把所有代码复原!!!
yolov8版本自2023.1开始,现在使用的为2024.5.1发布代码,依然存在各种bug
有文章说还需改动如下,但新代码不需要做改动
ultralytics/yolo/engine/model.py
打开ultralytics/yolo/engine/model.py代码,找到train方法,如下
将self.trainer.model = self.model注释掉
overrides = yaml_load(checks.check_yaml(kwargs["cfg"])) if kwargs.get("cfg") else self.overrides
custom = {"data": DEFAULT_CFG_DICT["data"] or TASK2DATA[self.task]} # method defaults
args = {**overrides, **custom, **kwargs, "mode": "train"} # highest priority args on the right
if args.get("resume"):
args["resume"] = self.ckpt_path
self.trainer = (trainer or self._smart_load("trainer"))(overrides=args, _callbacks=self.callbacks)
if not args.get("resume"): # manually set model only if not resuming
self.trainer.model = self.trainer.get_model(weights=self.model if self.ckpt else None, cfg=self.model.yaml)
self.model = self.trainer.model #🚀🚀🚀🚀🚀此处
if SETTINGS["hub"] is True and not self.session:
# Create a model in HUB
try:
self.session = self._get_hub_session(self.model_name)
if self.session:
self.session.create_model(args)
# Check model was created
if not getattr(self.session.model, "id", None):
self.session = None
except (PermissionError, ModuleNotFoundError):
# Ignore PermissionError and ModuleNotFoundError which indicates hub-sdk not installed
pass