This article is adapted from the official PyTorch tutorial.
DDP (DistributedDataParallel): multi-process; supports both single-machine multi-GPU training and multi-machine (cross-node) training.
DP (DataParallel): single-process, multi-threaded; limited to single-machine multi-GPU training, and with multiple GPUs the memory load falls disproportionately on the first GPU.
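To make the difference concrete, here is a minimal sketch of how the two wrappers are applied (the model is a placeholder chosen for illustration; the DDP line assumes a process group has already been initialized, as shown further below):

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

model = nn.Linear(10, 5).cuda()

# DP: one process, threads fan out over all visible GPUs; outputs and
# gradients are gathered back on GPU 0, which is why GPU 0 carries most
# of the memory load
dp_model = nn.DataParallel(model)

# DDP: one process per GPU; this line runs once inside each process,
# after dist.init_process_group(), with `rank` naming that process
# ddp_model = DDP(model.to(rank), device_ids=[rank])

The full DDP-plus-model-parallel demo from the tutorial follows.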
import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# Create the process group (called once per process)
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # initialize the process group; the tutorial uses "gloo",
    # though "nccl" is the recommended backend for GPU training
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

# Destroy the process group
def cleanup():
    dist.destroy_process_group()
# A toy model-parallel network: net1 lives on dev0, net2 on dev1
class ToyMpModel(nn.Module):
    def __init__(self, dev0, dev1):
        super(ToyMpModel, self).__init__()
        self.dev0 = dev0
        self.dev1 = dev1
        self.net1 = torch.nn.Linear(10, 10).to(dev0)
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to(dev1)

    def forward(self, x):
        x = x.to(self.dev0)
        x = self.relu(self.net1(x))
        x = x.to(self.dev1)  # move activations across devices
        return self.net2(x)
def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)
def demo_model_parallel(rank, world_size):
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    # setup mp_model and devices for this process; with world_size = n_gpus,
    # the modulo maps each rank onto a pair of devices, so pairs of ranks
    # share the same two GPUs
    dev0 = (rank * 2) % world_size
    dev1 = (rank * 2 + 1) % world_size
    mp_model = ToyMpModel(dev0, dev1)
    ddp_mp_model = DDP(mp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

    # outputs will be on dev1
    for i in range(2000):
        optimizer.zero_grad()  # must run inside the loop, or gradients accumulate
        outputs = ddp_mp_model(torch.randn(20, 10))
        labels = torch.randn(20, 5).to(dev1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        print("[loss] ", loss)
        optimizer.step()

    cleanup()
if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus
    run_demo(demo_model_parallel, world_size)
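For contrast, when the whole model fits on one GPU, each process pins its replica to a single device and passes device_ids to DDP, in the style of the basic demo from the same tutorial. A minimal sketch reusing the helpers above (placing both halves of ToyMpModel on the same device):

# a minimal single-device DDP demo, reusing setup/cleanup/run_demo above
def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # each process drives exactly one GPU, identified by its rank
    model = ToyMpModel(rank, rank)  # both halves on the same device
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()  # DDP all-reduces gradients here
    optimizer.step()

    cleanup()

Launching it is the same: run_demo(demo_basic, n_gpus).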
Set the visible GPUs from the command line, then run the script:
CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py
The output will look like this (the loss tensor lives on each process's dev1):
Running DDP with model parallel example on rank 2.
Running DDP with model parallel example on rank 1.
Running DDP with model parallel example on rank 3.
Running DDP with model parallel example on rank 0.
[loss] tensor(1.0375, device='cuda:3', grad_fn=<MseLossBackward>)
[loss] tensor(0.9430, device='cuda:3', grad_fn=<MseLossBackward>)
Try out a small demo like this first; afterwards it will be clear how to run DDP on your own network. You can also change some of the settings of the simple network above to test your own hypotheses.
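For instance, one hypothesis worth checking is that DDP keeps every replica's weights identical after each optimizer step. A minimal sketch of such a check (verify_params_synced is a hypothetical helper invented here for illustration, to be called at the end of demo_model_parallel before cleanup()):

# hypothetical helper for illustration: broadcast rank 0's copy of each
# parameter and compare; all ranks should hold identical weights
def verify_params_synced(ddp_model, rank):
    for name, p in ddp_model.module.named_parameters():
        ref = p.detach().cpu().clone()
        dist.broadcast(ref, src=0)  # everyone receives rank 0's value
        assert torch.allclose(p.detach().cpu(), ref), f"rank {rank}: {name} diverged"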