Setup on AutoDL: PyTorch 2.1, Ubuntu 22.04, CUDA 12.1, Python 3.10
I rented a single RTX 4090D.
1. Download the code
Code: https://github.com/swz30/MPRNet
Paper: https://openaccess.thecvf.com/content/CVPR2021/papers/Zamir_Multi-Stage_Progressive_Image_Restoration_CVPR_2021_paper.pdf
2. What I changed
After downloading from the official repo, there is a README under Deblurring/Datasets; download the datasets following the official paths there, and put the authors' pretrained weights in the pretrained_models folder.
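For orientation, this is the directory layout that the paths in training.yml (shown later) assume; the input/target pairing is how the data_RGB loaders read each split. Double-check against the README in case your download differs:

Datasets/
└── GoPro/
    ├── train/
    │   ├── input/    # blurry inputs
    │   └── target/   # sharp ground truth
    └── test/
        ├── input/
        └── target/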
Next come my changes to train.py.
Changes:
- Set the warmup to 10 epochs (warmup_epochs = 10).
- Replaced torch.sum with Python's built-in sum:
  loss_char = sum([criterion_char(restored[j], target) for j in range(len(restored))])
  loss_edge = sum([criterion_edge(restored[j], target) for j in range(len(restored))])
- Set num_workers = 0 for both train_loader and val_loader; on Windows it only ran after I changed it to 0, and on Ubuntu I never changed it back to the authors' 8.
The full modified train.py:
import os
from config import Config
opt = Config('training.yml')
gpus = ','.join([str(i) for i in opt.GPU])
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpus
import torch
torch.backends.cudnn.benchmark = True
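# cudnn.benchmark auto-tunes cuDNN convolution algorithms; a good fit here since training patches have a fixed size (TRAIN_PS)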
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import random
import time
import numpy as np
import utils
from data_RGB import get_training_data, get_validation_data
from MPRNet import MPRNet
import losses
from warmup_scheduler import GradualWarmupScheduler
from tqdm import tqdm
######### Set Seeds ###########
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed_all(1234)
start_epoch = 1
mode = opt.MODEL.MODE
session = opt.MODEL.SESSION
result_dir = os.path.join(opt.TRAINING.SAVE_DIR, mode, 'results', session)
model_dir = os.path.join(opt.TRAINING.SAVE_DIR, mode, 'models', session)
utils.mkdir(result_dir)
utils.mkdir(model_dir)
train_dir = opt.TRAINING.TRAIN_DIR
val_dir = opt.TRAINING.VAL_DIR
######### Model ###########
model_restoration = MPRNet()
model_restoration.cuda()
device_ids = [i for i in range(torch.cuda.device_count())]
if torch.cuda.device_count() > 1:
    print("\n\nLet's use", torch.cuda.device_count(), "GPUs!\n\n")
new_lr = opt.OPTIM.LR_INITIAL
optimizer = optim.Adam(model_restoration.parameters(), lr=new_lr, betas=(0.9, 0.999), eps=1e-8)
######### Scheduler ###########
warmup_epochs = 10
scheduler_cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, opt.OPTIM.NUM_EPOCHS-warmup_epochs, eta_min=opt.OPTIM.LR_MIN)
scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=warmup_epochs, after_scheduler=scheduler_cosine)
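# With multiplier=1, GradualWarmupScheduler ramps the LR linearly from 0 to LR_INITIAL over the first
# warmup_epochs epochs, then hands off to scheduler_cosine, which decays it toward LR_MIN.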
# scheduler.step()
######### Resume ###########
if opt.TRAINING.RESUME:
    path_chk_rest = utils.get_last_path(model_dir, '_latest.pth')
    utils.load_checkpoint(model_restoration, path_chk_rest)
    start_epoch = utils.load_start_epoch(path_chk_rest) + 1
    utils.load_optim(optimizer, path_chk_rest)

    for i in range(1, start_epoch):
        scheduler.step()
    new_lr = scheduler.get_last_lr()[0]
    print('------------------------------------------------------------------------------')
    print("==> Resuming Training with learning rate:", new_lr)
    print('------------------------------------------------------------------------------')
if len(device_ids) > 1:
    model_restoration = nn.DataParallel(model_restoration, device_ids=device_ids)
######### Loss ###########
criterion_char = losses.CharbonnierLoss()
criterion_edge = losses.EdgeLoss()
######### DataLoaders ###########
print(f'Loading training data from {train_dir}')
train_dataset = get_training_data(train_dir, {'patch_size': opt.TRAINING.TRAIN_PS})
train_loader = DataLoader(dataset=train_dataset, batch_size=opt.OPTIM.BATCH_SIZE, shuffle=True, num_workers=0, drop_last=False, pin_memory=True)
print(f'Loading validation data from {val_dir}')
val_dataset = get_validation_data(val_dir, {'patch_size': opt.TRAINING.VAL_PS})
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False, num_workers=0, drop_last=False, pin_memory=True)
print('===> Start Epoch {} End Epoch {}'.format(start_epoch, opt.OPTIM.NUM_EPOCHS + 1))
print('===> Loading datasets')
best_psnr = 0
best_epoch = 0
for epoch in range(start_epoch, opt.OPTIM.NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    epoch_loss = 0
    model_restoration.train()
    for i, data in enumerate(tqdm(train_loader), 0):
        # zero_grad: setting grads to None skips the memset, equivalent to zero_grad(set_to_none=True)
        for param in model_restoration.parameters():
            param.grad = None

        target = data[0].cuda()
        input_ = data[1].cuda()

        restored = model_restoration(input_)

        # Compute loss at each stage; 0.05 is the edge-loss weight from the paper
        loss_char = sum([criterion_char(restored[j], target) for j in range(len(restored))])
        loss_edge = sum([criterion_edge(restored[j], target) for j in range(len(restored))])
        loss = loss_char + 0.05 * loss_edge

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    #### Evaluation ####
    if epoch % opt.TRAINING.VAL_AFTER_EVERY == 0:
        model_restoration.eval()
        psnr_val_rgb = []
        print('Starting validation...')
        print(f'Validation dataset size: {len(val_loader)}')
        for ii, data_val in enumerate(val_loader, 0):
            if len(data_val) == 0:
                print(f'Batch {ii} is empty!')
                continue
            target = data_val[0].cuda()
            input_ = data_val[1].cuda()
            print(f'Batch {ii} - input shape: {input_.shape}, target shape: {target.shape}')
            with torch.no_grad():
                restored = model_restoration(input_)
            if isinstance(restored, list) and len(restored) > 0:
                restored = restored[0]
            else:
                print(f'Restored output for batch {ii} is empty or not a list!')
                continue
            for res, tar in zip(restored, target):
                psnr = utils.torchPSNR(res, tar)
                print(f'PSNR for batch {ii}: {psnr}')
                psnr_val_rgb.append(psnr)
        if len(psnr_val_rgb) == 0:
            raise RuntimeError(
                "Validation resulted in an empty PSNR list. Please check the validation dataset and PSNR calculation.")
        psnr_val_rgb = torch.stack(psnr_val_rgb).mean().item()
        if psnr_val_rgb > best_psnr:
            best_psnr = psnr_val_rgb
            best_epoch = epoch
            torch.save({'epoch': epoch,
                        'state_dict': model_restoration.state_dict(),
                        'optimizer': optimizer.state_dict()
                        }, os.path.join(model_dir, "model_best.pth"))
        print("[epoch %d PSNR: %.4f --- best_epoch %d Best_PSNR %.4f]" % (epoch, psnr_val_rgb, best_epoch, best_psnr))
        torch.save({'epoch': epoch,
                    'state_dict': model_restoration.state_dict(),
                    'optimizer': optimizer.state_dict()
                    }, os.path.join(model_dir, f"model_epoch_{epoch}.pth"))

    scheduler.step()
    print("------------------------------------------------------------------")
    print("Epoch: {}\tTime: {:.4f}\tLoss: {:.4f}\tLearningRate {:.6f}".format(epoch, time.time() - epoch_start_time,
                                                                              epoch_loss, scheduler.get_last_lr()[0]))
    print("------------------------------------------------------------------")
    torch.save({'epoch': epoch,
                'state_dict': model_restoration.state_dict(),
                'optimizer': optimizer.state_dict()
                }, os.path.join(model_dir, "model_latest.pth"))
3. Changes to training.yml
I only rented one GPU; with a larger batch_size, my 4090D runs out of VRAM.
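If you want a larger effective batch without more VRAM, gradient accumulation is a common workaround. It is not part of my script above; this is only a sketch of how the inner training loop could be adapted (accum_steps is a made-up value):

accum_steps = 4  # hypothetical: effective batch = BATCH_SIZE * accum_steps
for i, data in enumerate(tqdm(train_loader), 0):
    target = data[0].cuda()
    input_ = data[1].cuda()
    restored = model_restoration(input_)
    loss_char = sum([criterion_char(restored[j], target) for j in range(len(restored))])
    loss_edge = sum([criterion_edge(restored[j], target) for j in range(len(restored))])
    # scale so the accumulated gradients average instead of summing
    loss = (loss_char + 0.05 * loss_edge) / accum_steps
    loss.backward()  # gradients accumulate in .grad across iterations
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        for param in model_restoration.parameters():
            param.grad = None  # reset only after stepping
    epoch_loss += loss.item() * accum_steps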
The number of training epochs is 100; for me it was roughly 7 min/epoch.
I set the learning rate to 1e-4. With the authors' 2e-4 and warmup_epochs = 3 I hit exploding gradients (the loss would suddenly become huge), so I lowered the learning rate and increased the warmup epochs. (A gradient-clipping sketch follows the config below.)
GPU: [0]
VERBOSE: True
MODEL:
  MODE: 'Deblurring'
  SESSION: 'MPRNet'

# Optimization arguments.
OPTIM:
  BATCH_SIZE: 1
  NUM_EPOCHS: 100
  # NEPOCH_DECAY: [10]
  LR_INITIAL: 1e-4
  LR_MIN: 1e-6
  # BETA1: 0.9

TRAINING:
  VAL_AFTER_EVERY: 20
  RESUME: False
  TRAIN_PS: 256
  VAL_PS: 256
  TRAIN_DIR: './Datasets/GoPro/train'  # path to training data
  VAL_DIR: './Datasets/GoPro/test'     # path to validation data
  SAVE_DIR: './checkpoints'            # path to save models and images
  # SAVE_IMAGES: False
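I didn't end up adding it, but gradient clipping is another common guard against the exploding-loss problem above. A minimal sketch of where it would slot into the training loop in train.py (the max_norm value here is just an assumption you would need to tune):

loss.backward()
# hypothetical addition: cap the global gradient norm before the optimizer step
torch.nn.utils.clip_grad_norm_(model_restoration.parameters(), max_norm=0.01)
optimizer.step()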
4. Deploying on the server
Pick a region and GPU you want, click in, and create the instance following the creation page. You will then get an instance.
Before the real training run, you can use no-GPU boot (无卡开机) for transferring files, which saves money.
For file transfer, look at other people's guides; I used Xftp 7, and it was painfully slow.
Then open Jupyter, inspect the files, and install the required packages:
pip install matplotlib scikit-image opencv-python yacs joblib natsort h5py tqdm
Once the files on the server all look right, close Jupyter and shut down the instance (remember, it is currently a no-GPU boot).
Boot it up again (this time with the GPU) and open Jupyter.
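Before kicking off training, it is worth a quick sanity check in Jupyter that the GPU is actually visible (a generic check, nothing MPRNet-specific):

import torch
print(torch.__version__, torch.version.cuda)  # expect 2.1.x / 12.1 on this image
print(torch.cuda.is_available())              # must be True on a GPU boot
print(torch.cuda.get_device_name(0))          # should report the 4090D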
Start training.
I trained for about 40 epochs before accidentally hitting Ctrl+C and killing the run. The PSNR at that point was around 28, and the images demo.py produced with my weights fell short of the authors' results, but not by much.
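If you kill a run like I did, the RESUME branch in train.py above can continue it: it loads the newest checkpoint via utils.get_last_path(model_dir, '_latest.pth') and fast-forwards the scheduler. With the config above, that checkpoint sits at ./checkpoints/Deblurring/models/MPRNet/model_latest.pth, so resuming should only need one change in training.yml:

TRAINING:
  RESUME: True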
5. References:
1. "MPRNet 训练自己的数据集" (training MPRNet on your own dataset), CSDN blog
2. "【论文笔记】图像修复 MPRNet: Multi-Stage Progressive Image Restoration 含代码解析" (paper notes with code walkthrough), CSDN blog