[PyTorch multi-GPU parallel training examples and benchmarks] A toy model is used; just swap in your own model.

Key point: adapted from the official PyTorch DDP tutorial, which is excellent and well worth reading if you want to learn this topic.

Test environment

Linux

torch==2.0.0
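
Before running, it may help to sanity-check the environment, e.g. the PyTorch version and the number of visible GPUs (a small sketch, not part of the scripts below):

import torch

print(torch.__version__)           # expected 2.0.0 here
print(torch.cuda.is_available())   # the NCCL backend used below requires CUDA
print(torch.cuda.device_count())   # visible GPUs; respects CUDA_VISIBLE_DEVICES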

Using torch.multiprocessing

Advantage: the launch command is simple.

# Run directly
$ python test2.py

# Run on GPUs 0 and 1 only
$ CUDA_VISIBLE_DEVICES=0,1 python test2.py

# run time 14.37288

File: test2.py

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import os
import argparse

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    
    # On Windows platform, the torch.distributed package only
    # supports Gloo backend, FileStore and TcpStore.
    # For FileStore, set init_method parameter in init_process_group
    # to a local file. Example as follows:
    # init_method="file:///f:/libtmp/some_file"
    # dist.init_process_group(
    #    "gloo",
    #    rank=rank,
    #    init_method=init_method,
    #    world_size=world_size)
    # For TcpStore, same way as on Linux.

    # initialize the process group  'nccl'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()
    
class ToyModel(nn.Module):
    def __init__(self, h, layers):
        super(ToyModel, self).__init__()
        self.m = nn.Sequential(
            nn.Linear(10, h),
            *[nn.Linear(h, h) for _ in range(layers)],
            nn.Linear(h, 5)
        )

    def forward(self, x):
        return self.m(x)


def demo_basic(rank, args):
    setup(rank, args.world_size)
    print(f"Start running basic DDP example on rank {rank}.")

    # create model and move it to GPU with id rank
    device_id = rank % torch.cuda.device_count()
    model = ToyModel(args.h, args.layers).to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])
    
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    
    for epoch in range(args.epochs):
        optimizer.zero_grad()
        # DDP moves this CPU batch to device_ids[0] before the forward pass
        outputs = ddp_model(torch.randn(20, 10))
        labels = torch.randn(20, 5).to(device_id)
        loss_fn(outputs, labels).backward()
        optimizer.step()

        if rank == 0:
            print(epoch)  # log the current iteration on rank 0 only
    
    cleanup()
    
def run_demo(demo_fn, args):
    mp.spawn(demo_fn,
             args=(args,),
             nprocs=args.world_size,
             join=True)
    
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--h', type=int, default=1024, help='hidden layer width')
    parser.add_argument('--layers', type=int, default=10, help='number of hidden MLP layers')
    parser.add_argument('--epochs', type=int, default=1000, help='epochs')
    parser.add_argument('--world_size', type=int, default=2, help='n_gpus')
    args = parser.parse_args()
    print(vars(args))
    
    # Use all visible GPUs automatically
    # n_gpus = torch.cuda.device_count()
    # assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    # args.world_size = n_gpus
    
    import time
    start_time = time.time()
    run_demo(demo_basic, args)
    print(f'run time {time.time()-start_time:.5f}')
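
The ToyModel above trains on random tensors. When you swap in your own model and a real dataset, each rank should read a different shard of the data; the usual pattern is a DistributedSampler. A minimal sketch, assuming a placeholder TensorDataset shaped like ToyModel's inputs and targets (create it after init_process_group has run):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

def make_loader(batch_size=32):
    # placeholder data: 1000 random samples matching ToyModel's input/output sizes
    dataset = TensorDataset(torch.randn(1000, 10), torch.randn(1000, 5))
    sampler = DistributedSampler(dataset)  # splits indices across ranks of the default group
    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    return loader, sampler

# per epoch inside the training loop:
#   sampler.set_epoch(epoch)   # reshuffle with a different seed each epoch
#   for x, y in loader:
#       outputs = ddp_model(x.to(device_id))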

Without torch.multiprocessing (using torchrun)

The launch command is a bit more involved, but it runs faster.

# nnodes: number of machines (typically one machine with multiple GPUs)
# nproc_per_node: number of GPUs per machine
$ torchrun --nnodes=1 --nproc_per_node=2 test.py --h 4096 --layers 5 --epochs 1000

# Run on specific GPUs; nproc_per_node must match the number of GPUs listed
$ CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes=1 --nproc_per_node=2 test.py --h 4096 --layers 5 --epochs 1000

# run time 10.70822
# run time 11.10419
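
Unlike the mp.spawn version, there is no manual setup() here: torchrun starts one process per GPU and passes the rendezvous information through environment variables, which dist.init_process_group("nccl") reads automatically (the default init_method is env://). A quick way to inspect them inside a worker (a sketch, not part of test.py):

import os

# set by torchrun for every worker process
for key in ("RANK", "LOCAL_RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"):
    print(key, os.environ.get(key))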

File: test.py

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim

from torch.nn.parallel import DistributedDataParallel as DDP

import argparse

class ToyModel(nn.Module):
    def __init__(self, h, layers):
        super(ToyModel, self).__init__()
        self.m = nn.Sequential(
            nn.Linear(10, h),
            *[nn.Linear(h, h) for _ in range(layers)],
            nn.Linear(h, 5)
        )

    def forward(self, x):
        return self.m(x)


def demo_basic(args):
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    print(f"Start running basic DDP example on rank {rank}.")

    # create model and move it to GPU with id rank
    device_id = rank % torch.cuda.device_count()
    model = ToyModel(args.h, args.layers).to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])
    
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    
    for epoch in range(args.epochs):
        optimizer.zero_grad()
        # DDP moves this CPU batch to device_ids[0] before the forward pass
        outputs = ddp_model(torch.randn(20, 10))
        labels = torch.randn(20, 5).to(device_id)
        loss_fn(outputs, labels).backward()
        optimizer.step()

        if rank == 0:
            print(epoch)  # log the current iteration on rank 0 only

    dist.destroy_process_group()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--h', type=int, default=1024, help='hidden layer width')
    parser.add_argument('--layers', type=int, default=10, help='number of hidden MLP layers')
    parser.add_argument('--epochs', type=int, default=1000, help='epochs')
    args = parser.parse_args()
    print(vars(args))
    
    import time
    start_time = time.time()
    demo_basic(args)
    print(f'run time {time.time()-start_time:.5f}')
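
For a real run you will also want to save the trained weights. With DDP the common pattern is to write a checkpoint from rank 0 only and to access the wrapped model via .module; a minimal sketch, assuming a placeholder file name:

import torch
import torch.distributed as dist

def save_checkpoint(ddp_model, path="toy_model.pt"):
    # only rank 0 writes, so the ranks don't race on the same file
    if dist.get_rank() == 0:
        torch.save(ddp_model.module.state_dict(), path)
    # keep the other ranks from moving on (or exiting) before the file exists
    dist.barrier()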
