pytorch 多GPU训练

pytorch 提供两种多GPU训练方案:nn.DataParallel 和 nn.DistributedDataParallel. 

nn.DataParallel

(支持单机多卡)很容易使用,但是速度慢(主要原因是它采用parameter server 模式,一张主卡作为reducer,负载不均衡,主卡成为训练瓶颈)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Hyper-parameters for the toy DataParallel example: a linear layer mapping
# input_size features to output_size, fed in batches of batch_size drawn
# from a dataset of data_size random samples.
input_size = 5
output_size = 2
batch_size = 30
data_size = 100

# Primary device: first visible GPU if available, otherwise CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class RandomDataset(Dataset):
    """Toy dataset of `length` random feature vectors, each of dimension `size`."""

    def __init__(self, size, length):
        # Pre-generate every sample up front as one (length, size) tensor.
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # One sample per call; the DataLoader stacks them into batches.
        return self.data[index]
# DataLoader over the random dataset; yields shuffled batches of
# `batch_size` samples (the last batch may be smaller).
rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    """A single fully-connected layer used to demonstrate nn.DataParallel.

    forward() prints the shapes it sees so that, when wrapped in
    DataParallel, each replica's sub-batch size shows up in the log.
    """

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        # One linear projection: input_size -> output_size features.
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        # Under DataParallel each replica receives a slice of the batch.
        out = self.fc(input)
        print("\tIn Model: input size", input.size(),
              "output size", out.size())
        return out


# Build the model; when more than one GPU is visible, wrap it in
# DataParallel, which scatters each batch across the GPUs along dim 0
# and gathers the outputs back on the primary device.
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  # i.e. a batch of 30 is split evenly across the N available GPUs.
  model = nn.DataParallel(model)
model.to(device)

# Driver loop: the "Outside" print shows the full batch size, while the
# print inside Model.forward shows each replica's sub-batch size.
for data in rand_loader:
    input = data.to(device)
    output = model(input)
    print("Outside: input size", input.size(),
          "output_size", output.size())

nn.DistributedDataParallel

(支持单机多卡和多机多卡)采用All-reduce模式:
复制模型到多个GPU上,每个GPU通过一个进程来控制,进程之间互相通信,只有梯度信息需要在不同进程/GPU之间通信,所以瓶颈限制没有那么严重。
在训练时,每个进程/GPU load 自己的minibatch数据(所以要用distributedsampler), 每个GPU做自己独立的前向运算,反向传播时梯度all-reduce在各个GPU之间,各个节点得到平均梯度,保证各个GPU上的模型权重同步。 多进程之间同步信息通信是通过 distributed.init_process_group实现,找到主进程和总的进程数,总的进程数称为world_size。
 
一种方式是通过使用multiprocessing:
# 每个进程run一次train(i, args), i在(0 到 args.gpus-1)的范围。
def train(local_rank, args):
    """Entry point for one DDP worker process.

    Spawned once per GPU on this node by ``mp.spawn``; ``local_rank`` is
    the process index on this node (0 .. args.gpus - 1).
    """
    # Global rank of this process across all nodes.
    # BUG FIX: the node index is args.nr (this machine's ranking),
    # not args.nodes (the node count).
    rank = args.nr * args.gpus + local_rank
    # Join the process group; per the PyTorch docs the nccl backend is
    # the fastest for GPU training.
    # https://pytorch.org/docs/stable/distributed.html
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )

    # Same seed in every process so all replicas start from identical weights.
    torch.manual_seed(0)
    # BUG FIX: was `model = model()`, which calls the name being assigned.
    model = Model(input_size, output_size)  # construct your real model here
    # BUG FIX: `gpu` was undefined in this scope; the local GPU index is
    # local_rank (one process per GPU on this node).
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    batch_size = 100
    criterion = nn.CrossEntropyLoss().cuda(local_rank)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    # Wrap the model; gradients are all-reduced across processes.
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[local_rank])

    # Data loading code
    train_dataset = xxx  # placeholder: plug in your real Dataset here
    # DistributedSampler gives each process a disjoint shard of the data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,  # must be False: the sampler already shuffles/shards
        num_workers=args.num_workers,
        pin_memory=True,
        sampler=train_sampler)
    # ... training loop goes here ...



def main():
    """Parse CLI arguments and spawn one training process per local GPU."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1,
                        type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int,
                        metavar='N',
                        help='number of total epochs to run')  # BUG FIX: closing curly quote broke the string

    args = parser.parse_args()
    # Total number of processes participating in the job.
    args.world_size = args.gpus * args.nodes
    # Rendezvous address/port of the rank-0 process, read by
    # init_process_group(init_method='env://') in each worker.
    os.environ['MASTER_ADDR'] = 'xx.xx.xx.xx'  # BUG FIX: curly quotes -> ASCII
    os.environ['MASTER_PORT'] = 'xxxx'         # BUG FIX: curly quotes -> ASCII
    # Launch args.gpus processes; each runs train(local_rank, args).
    mp.spawn(train, nprocs=args.gpus, args=(args,))

另一种方式使用torch.distributed.launch:

def main():
    """Worker entry point for ``python -m torch.distributed.launch``.

    The launcher starts one process per GPU, passes ``--local_rank`` to
    each, and exports MASTER_ADDR/PORT, WORLD_SIZE and RANK in the
    environment, so init_process_group needs only the backend.
    """
    parser = argparse.ArgumentParser()
    # BUG FIX: the flag was written with unicode dashes/curly quotes.
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4,
                        help='DataLoader worker processes')
    args = parser.parse_args()  # BUG FIX: args was never parsed

    dist.init_process_group(backend='nccl')

    world_size = torch.distributed.get_world_size()
    torch.manual_seed(0)  # identical initial weights in every process
    # BUG FIX: was `model = model()`, which calls the name being assigned.
    model = Model(input_size, output_size)  # construct your real model here
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    model.cuda(args.local_rank)
    batch_size = 100
    # BUG FIX: `gpu` was undefined; this process owns GPU args.local_rank.
    criterion = nn.CrossEntropyLoss().cuda(args.local_rank)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    # Wrap the model; BUG FIX: device_ids must be a keyword list, the
    # original `device_ids[args.local_rank]` indexed an undefined name.
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[args.local_rank])

    # Data loading code
    train_dataset = xxx  # placeholder: plug in your real Dataset here
    # DistributedSampler gives each process a disjoint shard of the data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=world_size,
        rank=dist.get_rank()  # BUG FIX: `rank` was undefined here
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,  # must be False: the sampler already shuffles/shards
        num_workers=args.num_workers,
        pin_memory=True,
        sampler=train_sampler)


# python -m torch.distributed.launch --nproc_per_node=2 main.py

参考: https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
以下是使用 PyTorch 进行多 GPU 训练的示例代码:

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import transforms

# 定义模型
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# 定义训练函数
def train(model, device, train_loader, optimizer, criterion):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

# 定义测试函数
def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset), accuracy))

# 定义主函数
def main():
    # 设置超参数
    batch_size = 128
    epochs = 10
    lr = 0.01
    momentum = 0.9
    num_workers = 4
    num_gpus = torch.cuda.device_count()

    # 加载数据集
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
    test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # 初始化模型和优化器
    model = Net()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()

    # 训练和测试
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, criterion)
        test(model, device, test_loader, criterion)

if __name__ == '__main__':
    main()
```

这个示例代码可以在多个 GPU 上并行训练模型;`num_gpus` 由 `torch.cuda.device_count()` 自动得到,只有一个 GPU 时会自动退化为单卡训练。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值