Using DistributedDataParallel for single-machine multi-GPU training; the official tutorial already covers it in detail. The script has to be started through the launcher, e.g.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 train.py
where --nproc_per_node is set to the number of visible GPUs, so one process is spawned per GPU.
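As a side note on how each spawned copy of train.py knows which GPU it owns: newer launchers (torchrun, or torch.distributed.launch with --use_env) expose the local rank through the LOCAL_RANK environment variable, while older versions of torch.distributed.launch pass a --local_rank command-line argument instead. Below is a minimal sketch of the environment-variable route; it is an aside, not part of the original script. The full training code that follows instead derives the rank from dist.get_rank(), which is equivalent on a single machine.

import os
import torch

# each of the nproc_per_node processes sees a different LOCAL_RANK (0, 1, ...)
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
print(f"this process drives cuda:{local_rank}")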
import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler

dist.init_process_group(backend="nccl")
# on a single machine the global rank equals the local rank
local_rank = dist.get_rank()
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

def train():
    net = Net.Network()
    # checkpoints saved during multi-GPU training carry an extra "module." prefix;
    # strip it when loading
    state_dict = torch.load("checkpoint.pth", map_location="cpu")  # placeholder path
    net.load_state_dict({k.replace("module.", ""): v for k, v in state_dict.items()})
    # batch_size is the number of samples per GPU, not the global batch size;
    # shuffling is left to DistributedSampler, so shuffle stays False
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=2, shuffle=False, num_workers=2, pin_memory=True,
        sampler=DistributedSampler(trainset))
    net.to(device)
    net = torch.nn.parallel.DistributedDataParallel(
        net, device_ids=[local_rank], output_device=local_rank)
    net.train()
    # ... training loop (elided); epoch below is the current epoch index ...
    if local_rank == 0:
        # net is DDP-wrapped, so the saved keys carry the "module." prefix stripped above
        torch.save(net.state_dict(), "checkpoint.pth")  # placeholder path
    if epoch % 5 == 0:
        # evaluate on rank 0 only; the other processes wait at the barrier
        if local_rank == 0:
            test()
        dist.barrier()  # synchronize processes
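Two details the snippet leaves implicit: DistributedSampler only reshuffles between epochs if sampler.set_epoch(epoch) is called at the start of each epoch, and saving net.module.state_dict() on rank 0 produces a checkpoint without the "module." prefix in the first place. A sketch of the epoch loop under those assumptions (optimizer, criterion and num_epochs are hypothetical placeholders, not from the original code):

for epoch in range(num_epochs):
    # reshuffle the per-GPU shards; without this every epoch sees the same order
    trainloader.sampler.set_epoch(epoch)
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()  # DDP all-reduces gradients across processes during backward
        optimizer.step()
    if local_rank == 0:
        # net.module is the unwrapped model, so the keys carry no "module." prefix
        torch.save(net.module.state_dict(), "checkpoint.pth")  # placeholder path
    dist.barrier()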
References
https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
https://pytorch.org/docs/stable/nn.html#distributeddataparallel
https://fyubang.com/2019/07/23/distributed-training3/