Pytorch分布式训练

参考文献:https://www.cnblogs.com/jfdwd/p/11196439.html

NCCL

NVIDIA Collective Communication Library (NCCL):NVIDIA集合通信库,负责加速GPU之间的集合通信原语(如all-reduce、broadcast等),它本身不是一个并行编程框架。利用NCCL可以加速多GPU训练中的通信开销。
下载安装链接:https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html
Pytorch中的torch.distributed包可以调用NCCL作为后端,下面展示单机多GPU的情况。
首先看全局的运行方法:

python -m torch.distributed.launch --nproc_per_node=GPU数量 train.py --arg1 --arg2 --arg3

注意:如果要同时运行另外一个分布式程序,则需要为其指定不同的主机地址和端口,否则会与之前程序所使用的默认地址和端口发生冲突:

python -m torch.distributed.launch --nproc_per_node=GPU数量  --master_addr 127.0.0.2 --master_port 29501 train.py --arg1 --arg2 --arg3

上述命令为使用torch.distributed.launch来运行我们包含有torch.distributed的训练脚本,其会传递一些环境变量,下面来看训练脚本:

import torch.distributed as dist
parser = argparse.ArgumentParser(description='Distributed Training')
# --local_rank is supplied automatically by torch.distributed.launch; it is the
# index of the GPU this particular process should drive.
parser.add_argument('--local_rank', type=int, default=0)

# WORLD_SIZE is exported by the launcher and equals the total number of
# processes; when absent we are in a plain single-process run.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1

if is_distributed:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    dist.barrier()
# train: wrap the model so gradients are all-reduced across processes during backward
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

torch.distributed.launch为我们启动了n个运行train.py的进程,n就是我们将要使用的GPU数量。这n个train.py进程会并行地运行,所以我们需要用torch.cuda.set_device(args.local_rank)为每个进程设定默认的GPU。

如下提供一个训练CIFAR10的文件:
执行代码:python -m torch.distributed.launch --nproc_per_node=2 main.py

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import time

import os
import argparse
import numpy as np

import models
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist

parser = argparse.ArgumentParser(description='Distributed Training')
# Supplied automatically by torch.distributed.launch: GPU index for this process.
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

# WORLD_SIZE is exported by the launcher; absent => single-process run.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1 

if is_distributed:
    torch.cuda.set_device(args.local_rank)  # bind this process to its own GPU
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    dist.barrier()  # wait until every process has finished initialization

# CIFAR-10: augmented pipeline for training, plain normalization for test.
_normalize = transforms.Normalize([0.49139968, 0.48215827, 0.44653124],
                                  [0.24703233, 0.24348505, 0.26158768])
trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True,
                                        transform=transforms.Compose([
                                            transforms.RandomCrop(32, padding=4),
                                            transforms.RandomHorizontalFlip(),
                                            transforms.ToTensor(),
                                            _normalize,
                                        ]))
testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           _normalize,
                                       ]))

# Bug fix: without a DistributedSampler every process iterates the FULL
# dataset, so N GPUs do N times the work on identical data. The sampler gives
# each rank a disjoint shard; shuffle must then be off (the sampler shuffles).
train_sampler = (torch.utils.data.distributed.DistributedSampler(trainset)
                 if is_distributed else None)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=(train_sampler is None),
                                          sampler=train_sampler,
                                          pin_memory=(torch.cuda.is_available()))

testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False,
                                         pin_memory=(torch.cuda.is_available()))
def reduce_tensor(tensor):
    """Return *tensor* averaged over all processes in the default group.

    The input tensor is left untouched; a clone is reduced and returned.
    Must be called only after init_process_group.
    """
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    # Bug fix: the original divided by a hard-coded 2, which is only correct
    # for exactly two GPUs. Use the actual world size instead.
    rt /= dist.get_world_size()
    return rt

# Standard cross-entropy classification loss.
criterion = nn.CrossEntropyLoss()
# 'models' is a project-local module; 'LeNet' is looked up by name — presumably
# a small CIFAR-10 CNN (TODO confirm against the models package).
model = getattr(models, 'LeNet')().cuda()
# Wrap the model so gradients are synchronized across processes during
# backward. NOTE(review): this wrap is unconditional — in a single-process run
# it requires an initialized process group; verify intended usage.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)
def train(epoch):
    """Run one epoch over ``trainloader`` and print the per-batch loss.

    In distributed mode ``reduced_loss`` is the loss averaged across all
    processes; otherwise the local loss is reused. The original referenced
    ``reduced_loss`` unconditionally, raising NameError in single-GPU runs.
    """
    print(args.local_rank)
    model.train()
    start_time = time.time()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Each GPU computes its own gradients; DDP all-reduces them during
        # backward so optimizer.step() applies the same update everywhere.
        loss.backward()
        optimizer.step()

        if is_distributed:
            # Average the per-GPU losses for logging; torch.distributed
            # synchronizes all processes inside all_reduce.
            reduced_loss = reduce_tensor(loss)
        else:
            # Bug fix: provide a value in the non-distributed case too.
            reduced_loss = loss
        print('GPU: %s, loss: %.3f, batch_size: %d, reduced_loss: %.3f' % (args.local_rank,
              loss.item(), targets.size()[0], reduced_loss.item()))
    # start_time was previously computed but never used; report epoch wall time.
    print('epoch %d took %.1fs' % (epoch, time.time() - start_time))
# Train for 100 epochs; each launched process executes this loop independently.
for i in range(100):
    train(i)

APEX

apex是一个Pytorch的扩展,用于辅助NVIDIA显卡的混合精度和分布式训练。
参考文档:https://nvidia.github.io/apex/
github地址:https://github.com/NVIDIA/apex
这里展示使用apex来实现分布式训练和混合精度训练,同时还能实现多卡的同步批标准化。其中,我们仍然使用torch.distributed库,但是并行方法采用apex的DistributedDataParallel对象。其中FP16和普通精度训练都被包含在代码中。
参考链接:https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import time

import os
import argparse
import numpy as np

import models
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist
import apex
import apex.amp as amp 
from apex.parallel import DistributedDataParallel as DDP 

parser = argparse.ArgumentParser(description='Distributed Training')
# Supplied automatically by torch.distributed.launch: GPU index for this process.
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument('--opt_level', type=str, default='O0',
                    help='O0:FP32,O1:Mixed Precision,O2:Almost FP16 Mixed Precision, O3:fp16 training')
parser.add_argument('--sync_bn', action='store_true',
                    help='enabling apex sync BN.')
# Forwarded verbatim to amp.initialize; None lets apex pick per-opt_level defaults.
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
args = parser.parse_args()

# WORLD_SIZE is exported by the launcher; absent => single-process run.
num_gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
is_distributed = num_gpus > 1 

if is_distributed:
    torch.cuda.set_device(args.local_rank)  # bind this process to its own GPU
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
# CIFAR-10: augmented pipeline for training, plain normalization for test.
_normalize = transforms.Normalize([0.49139968, 0.48215827, 0.44653124],
                                  [0.24703233, 0.24348505, 0.26158768])
trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True,
                                        transform=transforms.Compose([
                                            transforms.RandomCrop(32, padding=4),
                                            transforms.RandomHorizontalFlip(),
                                            transforms.ToTensor(),
                                            _normalize,
                                        ]))
testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           _normalize,
                                       ]))

# Bug fix: without a DistributedSampler every process iterates the FULL
# dataset, so N GPUs do N times the work on identical data. The sampler gives
# each rank a disjoint shard; shuffle must then be off (the sampler shuffles).
train_sampler = (torch.utils.data.distributed.DistributedSampler(trainset)
                 if is_distributed else None)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=(train_sampler is None),
                                          sampler=train_sampler,
                                          pin_memory=(torch.cuda.is_available()))

testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False,
                                         pin_memory=(torch.cuda.is_available()))
def reduce_tensor(tensor):
    """Return *tensor* averaged over all processes in the default group.

    The input tensor is left untouched; a clone is reduced and returned.
    Must be called only after init_process_group.
    """
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    # Bug fix: the original divided by a hard-coded 2, which is only correct
    # for exactly two GPUs. Use the actual world size instead.
    rt /= dist.get_world_size()
    return rt

# Standard cross-entropy classification loss, placed on the current GPU.
criterion = nn.CrossEntropyLoss().cuda()
# 'models' is a project-local module; 'LeNet' is looked up by name — presumably
# a small CIFAR-10 CNN (TODO confirm against the models package).
model = getattr(models, 'LeNet')()
if args.sync_bn:
    # Convert BatchNorm layers to apex SyncBN so batch statistics are computed
    # across all processes instead of per-GPU.
    model = apex.parallel.convert_syncbn_model(model)
model=model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)
# amp.initialize wraps model and optimizer for the chosen precision mode; it
# must run BEFORE the DDP wrap below (apex requirement).
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level=args.opt_level,
                                  keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                  loss_scale=args.loss_scale)
if is_distributed:
    # delay_allreduce=True: apex all-reduces gradients once at the end of
    # backward rather than overlapping per-bucket.
    model = DDP(model, delay_allreduce=True)

def train(epoch):
    """Run one apex/amp epoch over ``trainloader``, printing per-batch loss.

    In distributed mode ``reduced_loss`` is the loss averaged across all
    processes; otherwise the local loss is reused. The original referenced
    ``reduced_loss`` unconditionally, raising NameError in single-GPU runs.
    """
    model.train()
    start_time = time.time()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # amp scales the loss so FP16 gradients do not underflow, and unscales
        # them again before optimizer.step().
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()

        if is_distributed:
            reduced_loss = reduce_tensor(loss)
        else:
            # Bug fix: provide a value in the non-distributed case too.
            reduced_loss = loss
        print('GPU: %s, loss: %.3f, batch_size: %d, reduced_loss: %.3f' % (args.local_rank,
              loss.item(), targets.size()[0], reduced_loss.item()))
    # start_time was previously computed but never used; report epoch wall time.
    print('epoch %d took %.1fs' % (epoch, time.time() - start_time))
# Train for 100 epochs; each launched process executes this loop independently.
for i in range(100):
    train(i)
已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页