In PyTorch, DDP (DistributedDataParallel) training can use multiple GPUs on a single machine, and it keeps GPU memory consumption equal across all GPUs during training (provided each GPU is assigned the same amount of data). Under the hood, multi-GPU training launches one process per GPU, and the processes share gradients through network communication.
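Before the full example program below, here is a minimal sketch of that mechanism, assuming the process group has already been initialized and each process has been pinned to its own GPU; the toy model and tensor shapes are illustrative only. Wrapping the model in DDP is what makes backward() all-reduce gradients across processes, so every process applies the same synchronized update.

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def ddp_train_step(rank: int):
    # Assumes dist.init_process_group("nccl", rank=rank, world_size=...) and
    # torch.cuda.set_device(rank) have already been called in this process.
    model = nn.Linear(10, 1).cuda(rank)        # toy model, illustration only
    ddp_model = DDP(model, device_ids=[rank])  # registers gradient all-reduce hooks
    optimizer = torch.optim.SGD(ddp_model.parameters(), lr=1e-3)

    x = torch.randn(32, 10, device=rank)       # each process sees its own shard of data
    loss = ddp_model(x).sum()
    loss.backward()    # gradients are averaged across all processes here
    optimizer.step()   # every process applies the same synchronized update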
Example program
import os
import pathlib
import numpy as np
import tqdm
import argparse
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from utils import load_configs
from data import data_pipeline, get_ood_dataloader, RESIZE, CROP
from model import get_model
from ood_matrics import ACC, FPR, AUROC
from test import test
def setup(rank, world_size):
    # rank: process ID, in the range 0 to (world_size - 1)
    # world_size: total number of processes
    print("multi gpu setup", rank, world_size)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12345'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    return
def cleanup():
    dist.destroy_process_group()
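For context, setup and cleanup are typically driven by torch.multiprocessing.spawn, which starts one process per GPU and passes the process index as the first argument, i.e. the rank. The worker below is hypothetical; the actual entry point of this example is not shown in this excerpt.

def main_worker(rank, world_size):
    # hypothetical worker: the real one would build the dataloaders and model,
    # call train(...), and finally tear the process group down
    setup(rank, world_size)
    ...
    cleanup()

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True)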
def train(model: torch.nn.Module, ID_dataloader_train, ID_dataloader_val, OOD_dataloader,
          epoch, lr, checkpoint_path, fpr_threshold=0.95, log_path='./logs/baseline',
          device='cuda', current_rank=None, trainlog_writer=None):
    # Parameter groups: the OOD head trains with the given learning rate, while the
    # backbone's fc layer falls back to the optimizer's default learning rate.
    params = [
        {'params': model.od_head.parameters(), 'lr': lr},
        {'params': model.backbone.fc.parameters()},
        # {'params': model.backbone.layer4.parameters(), 'lr': 0.1 * lr}
    ]
    if