config添加
# `--local_rank` is injected automatically by `torch.distributed.launch`
# (one process per GPU); the default -1 means non-distributed mode.
parser.add_argument('--local_rank', type=int, default=-1)
train中添加
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
在有写操作(保存 checkpoint、写日志、打印等)时,注意判断 local_rank,通常只在 local_rank == 0 的进程中执行,避免多进程重复写入
初始化
# Initialize the default process group; NCCL is the recommended backend
# for multi-GPU (CUDA) training.
dist.init_process_group(backend='nccl')
# Pin this process to its own GPU before creating any CUDA tensors.
torch.cuda.set_device(self.opt.local_rank)
torch.autograd.set_detect_anomaly(True) # debug aid: traps anomalous autograd ops; comment out for real training runs (it is slow)
# Each process drives exactly one device, selected by its local_rank.
self.device = torch.device("cuda", self.opt.local_rank) if torch.cuda.is_available() else torch.device("cpu")
模型操作(用到batchnorm需要额外添加一项,每个模型注意添加GPU idx)
# Convert all BatchNorm layers to SyncBatchNorm so running statistics are
# synchronized across processes (plain BN would use per-GPU stats under DDP).
self.netD = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.netD)
# Wrap the model for distributed training; each process owns one GPU.
# FIX: the original wrote `device_ids[self.opt.local_rank]` (an indexing
# expression, which is a SyntaxError in an argument list) — `device_ids`
# must be passed as a keyword argument holding a one-element list.
self.netD = torch.nn.parallel.DistributedDataParallel(
    self.netD,
    device_ids=[self.opt.local_rank],
    output_device=self.opt.local_rank,
    find_unused_parameters=True,  # tolerate params unused in forward; adds overhead
)
dataloader操作(shuffle不能设置为True,因为sampler自带shuffle,testset可以不管)
# Build the training dataset.
# FIX: the original assigned to `rain_dataset` (typo) but referenced
# `train_dataset` below, which would raise NameError at runtime.
train_dataset = self.dataset(
    self.opt.data_path, train_filenames, self.opt.data_height, self.opt.data_width,
    self.opt.data_frame_ids, 4, is_train=True, img_ext=img_ext)
# DistributedSampler shards the dataset across processes and shuffles
# internally each epoch, so DataLoader's `shuffle` must stay False/unset.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
self.train_loader = torch.utils.data.DataLoader(
    train_dataset, self.opt.batch_size,  # shuffle must NOT be True when a sampler is given
    num_workers=self.opt.data_workers, pin_memory=True, drop_last=True,
    sampler=train_sampler)
训练
# Expose the GPUs to use for this run.
export CUDA_VISIBLE_DEVICES=0,1
# Spawn one training process per GPU; --nproc_per_node = GPU/process count.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch —
# `torchrun --nproc_per_node=2 train_ablation_multi.py` is the modern form; confirm against your PyTorch version.
python -m torch.distributed.launch --nproc_per_node=2 train_ablation_multi.py
nproc_per_node 指定本节点上启动的进程数,即使用的 GPU 数量(每个 GPU 对应一个进程)
参考:
https://www.cnblogs.com/JunzhaoLiang/archive/2004/01/13/13535952.html (链接中的日期段疑有误;规范形式应为 https://www.cnblogs.com/JunzhaoLiang/p/13535952.html,请核实)