Note: adapted from the official PyTorch DDP tutorial. A very nice tutorial!!! A must-read if you want to learn DDP.
Test environment
Linux
torch==2.0.0
Using torch.multiprocessing
Advantage: the code is simple to launch.
# Test directly
$ python test2.py
# Run on GPUs 0 and 1 only
$ CUDA_VISIBLE_DEVICES=0,1 python test2.py
# run time 14.37288
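A note on CUDA_VISIBLE_DEVICES: inside the process, CUDA renumbers the visible devices from 0, so the code can always address cuda:0, cuda:1, ... no matter which physical GPUs you selected. A quick sanity check (check_gpus.py is just a placeholder name; the expected outputs in the comments assume a machine with at least four GPUs):

# Launch with: CUDA_VISIBLE_DEVICES=2,3 python check_gpus.py
import torch

# Only the GPUs listed in CUDA_VISIBLE_DEVICES are visible, renumbered from 0.
print(torch.cuda.device_count())      # 2
print(torch.cuda.get_device_name(0))  # name of physical GPU 2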
File: test2.py
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
import argparse
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # On Windows, torch.distributed only supports the Gloo backend with
    # FileStore or TcpStore. For FileStore, set the init_method parameter
    # of init_process_group to a local file, for example:
    #   dist.init_process_group(
    #       "gloo",
    #       rank=rank,
    #       init_method="file:///f:/libtmp/some_file",
    #       world_size=world_size)
    # For TcpStore, use the same call as on Linux.
    # Initialize the process group with the NCCL backend.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()
class ToyModel(nn.Module):
    def __init__(self, h, layers):
        super(ToyModel, self).__init__()
        # A plain MLP: 10 -> h, then `layers` h -> h layers, then h -> 5.
        self.m = nn.Sequential(
            nn.Linear(10, h),
            *[nn.Linear(h, h) for _ in range(layers)],
            nn.Linear(h, 5)
        )

    def forward(self, x):
        return self.m(x)
def demo_basic(rank, args):
    setup(rank, args.world_size)
    print(f"Start running basic DDP example on rank {rank}.")
    # Create the model and move it to the GPU with id rank.
    device_id = rank % torch.cuda.device_count()
    model = ToyModel(args.h, args.layers).to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    for epoch in range(args.epochs):
        optimizer.zero_grad()
        outputs = ddp_model(torch.randn(20, 10))  # DDP moves CPU inputs to device_id
        labels = torch.randn(20, 5).to(device_id)
        loss_fn(outputs, labels).backward()
        optimizer.step()
        if rank == 0:
            print(epoch)
    cleanup()
def run_demo(demo_fn, args):
    # mp.spawn prepends the process index (the rank) to args when it
    # calls demo_fn, hence demo_basic's (rank, args) signature.
    mp.spawn(demo_fn,
             args=(args,),
             nprocs=args.world_size,
             join=True)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--h', type=int, default=1024, help='hidden layer width')
    parser.add_argument('--layers', type=int, default=10, help='number of MLP layers')
    parser.add_argument('--epochs', type=int, default=1000, help='epochs')
    parser.add_argument('--world_size', type=int, default=2, help='n_gpus')
    args = parser.parse_args()
    print(vars(args))
    # To use every visible GPU automatically:
    # n_gpus = torch.cuda.device_count()
    # assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    # args.world_size = n_gpus
    import time
    start_time = time.time()
    run_demo(demo_basic, args)
    print(f'run time {time.time()-start_time:.5f}')
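The one non-obvious part of test2.py: run_demo passes args=(args,), yet demo_basic is defined as demo_basic(rank, args). This works because mp.spawn prepends the process index as the first positional argument of the spawned function. A minimal standalone sketch (worker and the "hello" message are made up for illustration):

import torch.multiprocessing as mp

def worker(rank, msg):
    # rank is injected by mp.spawn; msg comes from args=(...).
    print(f"rank {rank}: {msg}")

if __name__ == "__main__":
    # Spawns 2 processes, calling worker(0, "hello") and worker(1, "hello").
    mp.spawn(worker, args=("hello",), nprocs=2, join=True)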
Without torch.multiprocessing (torchrun)
Launching is slightly more involved, but it runs faster.
# nnodes: number of machines (usually one machine with several GPUs)
# nproc_per_node: number of GPUs per machine
$ torchrun --nnodes=1 --nproc_per_node=2 test.py --h 4096 --layers 5 --epochs 1000
# Run on specific GPUs; nproc_per_node must match the number of GPUs listed
$ CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes=1 --nproc_per_node=2 test.py --h 4096 --layers 5 --epochs 1000
# run time 10.70822
# run time 11.10419
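The same script also scales past one machine: raise --nnodes and point every node at a shared rendezvous endpoint. A hedged sketch, not tested here (NODE0_IP is a placeholder for the first node's address, and 29400 is torchrun's default rendezvous port):

# Run this on every node; only the GPU count per node may differ.
$ torchrun --nnodes=2 --nproc_per_node=2 \
    --rdzv_backend=c10d --rdzv_endpoint=NODE0_IP:29400 \
    test.py --h 4096 --layers 5 --epochs 1000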
File: test.py
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import argparse
class ToyModel(nn.Module):
    def __init__(self, h, layers):
        super(ToyModel, self).__init__()
        # Same MLP as in test2.py: 10 -> h, `layers` h -> h layers, h -> 5.
        self.m = nn.Sequential(
            nn.Linear(10, h),
            *[nn.Linear(h, h) for _ in range(layers)],
            nn.Linear(h, 5)
        )

    def forward(self, x):
        return self.m(x)
def demo_basic(args):
    # torchrun supplies rank and world size via environment variables,
    # so init_process_group needs only the backend here.
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    print(f"Start running basic DDP example on rank {rank}.")
    # Create the model and move it to the GPU with id rank.
    device_id = rank % torch.cuda.device_count()
    model = ToyModel(args.h, args.layers).to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    for epoch in range(args.epochs):
        optimizer.zero_grad()
        outputs = ddp_model(torch.randn(20, 10))  # DDP moves CPU inputs to device_id
        labels = torch.randn(20, 5).to(device_id)
        loss_fn(outputs, labels).backward()
        optimizer.step()
        if rank == 0:
            print(epoch)
    dist.destroy_process_group()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--h', type=int, default=1024, help='hidden layer width')
    parser.add_argument('--layers', type=int, default=10, help='number of MLP layers')
    parser.add_argument('--epochs', type=int, default=1000, help='epochs')
    args = parser.parse_args()
    print(vars(args))
    import time
    start_time = time.time()
    demo_basic(args)
    print(f'run time {time.time()-start_time:.5f}')
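Note that with torchrun there is no mp.spawn at all: torchrun itself launches nproc_per_node copies of test.py and hands each worker its identity through environment variables, which the default env:// init method of init_process_group reads. If you ever need them directly:

import os

# Set by torchrun for every worker process:
local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this machine
rank = int(os.environ["RANK"])              # global rank across all workers
world_size = int(os.environ["WORLD_SIZE"])  # total number of workers
print(f"worker {rank}/{world_size}, local rank {local_rank}")

On a single node, the rank % torch.cuda.device_count() in test.py coincides with LOCAL_RANK.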