DDP（DistributedDataParallel）细节

还好我不在意

已于 2023-08-04 15:07:22 修改

阅读量122

点赞数 1

文章标签：深度学习 pytorch 人工智能

于 2023-08-04 12:41:45 首次发布

本文链接：https://blog.csdn.net/m0_46294481/article/details/132100955

版权

以下链接可见具体细节

Distributed Data Parallel — PyTorch 2.0 documentation

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import os

def example(rank, world_size):
    """Run a single DDP training step in the worker process ``rank``.

    The local model's parameters are printed three times (before wrapping
    it in DDP, right after wrapping, and after one SGD step) together with
    the random input and labels used for the step.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index for the model and the tensors.
        world_size: Total number of processes taking part in the group.
    """
    # create default process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(5, 5).to(rank)
    print('model:',list(model.named_parameters()))
    # construct DDP model
    ddp_model = DDP(model,device_ids=[rank])
    # Print the wrapped module's parameters again so they can be compared
    # with the values printed before the DDP constructor ran.
    print(list(model.named_parameters()))
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)

    # forward pass (no seed is set, so every process draws its own data)
    data=torch.randn(2, 5).to(rank)
    print('data:',data)
    outputs = ddp_model(data)
    labels = torch.randn(2, 5).to(rank)
    print('labels:',labels)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()
    print('model:',list(model.named_parameters()))

    # Release the process group created above before the worker exits.
    dist.destroy_process_group()

def main():
    """Start the ``example`` worker processes and block until they finish."""
    # Number of processes to launch; the same value is handed to every
    # worker as its ``world_size`` argument.
    num_procs = 3
    mp.spawn(example, args=(num_procs,), nprocs=num_procs, join=True)

if __name__ == "__main__":
    # c10d's default "env" initialization mode requires these two
    # environment variables to be set before the workers are started.
    os.environ.update(MASTER_ADDR="localhost", MASTER_PORT="29500")
    main()

model: [('weight', Parameter containing:
tensor([[ 0.3045,  0.1964, -0.3424, -0.0541, -0.1097],
        [ 0.1195, -0.0218,  0.2623, -0.1955,  0.3437],
        [-0.4044,  0.0271,  0.3661, -0.4396, -0.2141],
        [-0.3810,  0.4220, -0.0514, -0.3390, -0.3115],
        [ 0.0033, -0.4471, -0.4336, -0.1539, -0.3841]], device='cuda:0',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3170, -0.3447, -0.1953,  0.1275, -0.1596], device='cuda:0',
       requires_grad=True))] 
model: [('weight', Parameter containing:
tensor([[-0.0103,  0.2450, -0.3801, -0.3852, -0.4201],
        [ 0.2583, -0.1970, -0.0857, -0.4087,  0.4442],
        [-0.2723,  0.2790, -0.4115, -0.4093,  0.1986],
        [-0.1748, -0.3412,  0.2581,  0.1599, -0.4466],
        [ 0.1322,  0.2249, -0.2637, -0.0051, -0.4192]], device='cuda:2',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.2870,  0.1566,  0.3332, -0.3366, -0.3582], device='cuda:2',
       requires_grad=True))] 
model: [('weight', Parameter containing:
tensor([[ 0.0037, -0.1383,  0.1032,  0.1924, -0.1857],
        [-0.0665, -0.3667, -0.0102,  0.2279,  0.0858],
        [ 0.2357, -0.1589,  0.4119,  0.3005, -0.1027],
        [ 0.3986,  0.2913, -0.2754,  0.2790, -0.2813],
        [-0.0271,  0.3241, -0.1089, -0.2947,  0.1021]], device='cuda:1',
       requires_grad=True)), ('bias', Parameter containing:
tensor([ 0.1550,  0.0868,  0.4109,  0.4064, -0.2531], device='cuda:1',
       requires_grad=True))] 
[('weight', Parameter containing:
tensor([[ 0.3045,  0.1964, -0.3424, -0.0541, -0.1097],
        [ 0.1195, -0.0218,  0.2623, -0.1955,  0.3437],
        [-0.4044,  0.0271,  0.3661, -0.4396, -0.2141],
        [-0.3810,  0.4220, -0.0514, -0.3390, -0.3115],
        [ 0.0033, -0.4471, -0.4336, -0.1539, -0.3841]], device='cuda:0',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3170, -0.3447, -0.1953,  0.1275, -0.1596], device='cuda:0',
       requires_grad=True))]
[('weight', Parameter containing:
tensor([[ 0.3045,  0.1964, -0.3424, -0.0541, -0.1097],
        [ 0.1195, -0.0218,  0.2623, -0.1955,  0.3437],
        [-0.4044,  0.0271,  0.3661, -0.4396, -0.2141],
        [-0.3810,  0.4220, -0.0514, -0.3390, -0.3115],
        [ 0.0033, -0.4471, -0.4336, -0.1539, -0.3841]], device='cuda:2',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3170, -0.3447, -0.1953,  0.1275, -0.1596], device='cuda:2',
       requires_grad=True))]

[('weight', Parameter containing:
tensor([[ 0.3045,  0.1964, -0.3424, -0.0541, -0.1097],
        [ 0.1195, -0.0218,  0.2623, -0.1955,  0.3437],
        [-0.4044,  0.0271,  0.3661, -0.4396, -0.2141],
        [-0.3810,  0.4220, -0.0514, -0.3390, -0.3115],
        [ 0.0033, -0.4471, -0.4336, -0.1539, -0.3841]], device='cuda:1',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3170, -0.3447, -0.1953,  0.1275, -0.1596], device='cuda:1',
       requires_grad=True))] 
data: tensor([[ 1.9884,  0.2205,  0.0892,  2.4214,  1.1334],
        [-0.9074, -0.3302, -0.0436,  0.1240,  0.7344]], device='cuda:2') 
data: tensor([[-0.3715,  3.0405,  1.1832,  0.0265, -0.2706],
        [ 0.0707,  0.4818,  1.6107,  0.0430, -0.3189]], device='cuda:0') 
data: tensor([[ 0.2532,  1.0779, -1.1392,  0.2004, -1.4275],
        [-3.2874, -1.6035, -0.4193, -0.3141,  1.2213]], device='cuda:1') 
labels: tensor([[ 1.1778, -1.7035, -1.1635,  0.6847,  0.0173],
        [ 0.8534, -0.2631, -0.5943,  1.0659,  0.4787]], device='cuda:2') 
labels: tensor([[-0.6421, -1.4290, -2.1635, -1.4473, -2.3120],
        [-0.2674, -0.2792, -0.2350, -1.1327,  1.3098]], device='cuda:0') 
labels: tensor([[-1.4693, -0.2071, -0.1675, -0.7679, -0.8032],
        [-0.4238, -0.6182,  1.1620,  0.0233, -1.5597]], device='cuda:1') 
model: [('weight', Parameter containing:
tensor([[ 0.3023,  0.1928, -0.3411, -0.0526, -0.1052],
        [ 0.1185, -0.0235,  0.2603, -0.1978,  0.3416],
        [-0.4027,  0.0215,  0.3630, -0.4380, -0.2129],
        [-0.3771,  0.4148, -0.0538, -0.3353, -0.3071],
        [ 0.0084, -0.4460, -0.4300, -0.1518, -0.3837]], device='cuda:0',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3157, -0.3462, -0.1968,  0.1254, -0.1591], device='cuda:0',
       requires_grad=True))]
model: [('weight', Parameter containing:
tensor([[ 0.3023,  0.1928, -0.3411, -0.0526, -0.1052],
        [ 0.1185, -0.0235,  0.2603, -0.1978,  0.3416],
        [-0.4027,  0.0215,  0.3630, -0.4380, -0.2129],
        [-0.3771,  0.4148, -0.0538, -0.3353, -0.3071],
        [ 0.0084, -0.4460, -0.4300, -0.1518, -0.3837]], device='cuda:2',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3157, -0.3462, -0.1968,  0.1254, -0.1591], device='cuda:2',
       requires_grad=True))]

model: [('weight', Parameter containing:
tensor([[ 0.3023,  0.1928, -0.3411, -0.0526, -0.1052],
        [ 0.1185, -0.0235,  0.2603, -0.1978,  0.3416],
        [-0.4027,  0.0215,  0.3630, -0.4380, -0.2129],
        [-0.3771,  0.4148, -0.0538, -0.3353, -0.3071],
        [ 0.0084, -0.4460, -0.4300, -0.1518, -0.3837]], device='cuda:1',
       requires_grad=True)), ('bias', Parameter containing:
tensor([-0.3157, -0.3462, -0.1968,  0.1254, -0.1591], device='cuda:1',
       requires_grad=True))]

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import os
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

def example(rank, world_size):
    """Print the samples that DistributedSampler yields in process ``rank``.

    Each process builds a random 8x5 tensor on its own GPU, prints it, and
    then iterates a batch-size-1 DataLoader driven by a non-shuffling
    DistributedSampler, printing every batch with its iteration index.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index for the dataset tensor.
        world_size: Total number of processes taking part in the group.
    """
    # create default process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # No seed is set, so every process draws a different random tensor;
    # only the row indices picked by the sampler are comparable across ranks.
    dataset=torch.randn(8, 5).to(rank)
    print('dataset:',dataset)
    # num_replicas/rank are not passed, so the sampler takes them from the
    # process group initialized above.
    sampler = DistributedSampler(dataset,shuffle=False)
    loader = DataLoader(dataset,batch_size=1,sampler=sampler)
    for i,data in enumerate(loader):
        print(i,data)

    # Release the process group created above before the worker exits.
    dist.destroy_process_group()
    

def main():
    """Start the ``example`` worker processes and block until they finish."""
    # Number of processes to launch; the same value is handed to every
    # worker as its ``world_size`` argument.
    num_procs = 4
    mp.spawn(example, args=(num_procs,), nprocs=num_procs, join=True)

if __name__ == "__main__":
    # c10d's default "env" initialization mode requires these two
    # environment variables to be set before the workers are started.
    os.environ.update(MASTER_ADDR="localhost", MASTER_PORT="12357")
    main()

dataset: tensor([[ 1.3948e+00, -3.7822e-01,  5.2887e-01,  1.3636e+00, -1.8920e+00],
        [-1.7722e-01,  5.6919e-01, -4.7340e-01,  7.9027e-01,  1.5672e-01],
        [ 1.9237e-01,  6.7848e-01, -5.6045e-01, -1.1554e+00,  7.5676e-01],
        [-9.0246e-01, -1.0382e+00, -5.0761e-01,  1.5271e-01, -3.3746e-01],
        [-1.2035e+00, -5.6039e-01, -1.3606e+00, -3.1048e-01, -6.2549e-01],
        [ 3.8156e-01, -4.7977e-01, -7.5969e-01, -1.6575e+00, -2.8292e-02],
        [ 6.6425e-01, -1.0442e+00,  1.7973e+00, -6.8186e-01, -9.0716e-01],
        [ 1.1209e-01,  1.8569e+00,  6.2515e-01,  5.5524e-04, -1.7088e-01]],
       device='cuda:0') 
0 tensor([[ 1.3948, -0.3782,  0.5289,  1.3636, -1.8920]], device='cuda:0') 
1 tensor([[-1.2035, -0.5604, -1.3606, -0.3105, -0.6255]], device='cuda:0') 
dataset: tensor([[-1.4152, -0.0952, -0.3020, -1.1259,  0.8200],
        [ 0.2737, -0.1275,  0.7300,  0.3648,  1.0146],
        [-0.9328,  0.9171,  0.0599,  1.5056, -1.0918],
        [-2.2928,  0.8857, -0.0789,  1.6129,  1.1632],
        [ 0.6055, -0.0182, -0.0836, -0.2351, -0.1364],
        [-0.3387, -0.7023, -0.3434, -0.2430,  1.4194],
        [ 1.8680, -1.2303,  0.1068,  0.2723, -0.8544],
        [-1.6527, -0.2950,  0.3333,  0.2765, -1.4153]], device='cuda:1') 
0 tensor([[ 0.2737, -0.1275,  0.7300,  0.3648,  1.0146]], device='cuda:1') 
1 tensor([[-0.3387, -0.7023, -0.3434, -0.2430,  1.4194]], device='cuda:1') 
dataset: tensor([[-5.8241e-01, -2.2155e-01, -1.0731e+00, -9.2801e-01,  3.3917e-01],
        [ 3.1482e-01, -6.0189e-01, -4.5408e-01, -7.0090e-03, -2.8336e-01],
        [-1.9568e+00,  1.1444e+00,  7.5926e-01,  6.3337e-01,  4.5700e-01],
        [ 1.4293e+00,  2.4193e+00,  4.7138e-01,  8.9208e-02,  1.6085e+00],
        [-2.4531e-01, -8.4192e-01,  1.1619e-01,  2.7609e-01,  5.0926e-01],
        [-3.7706e-01, -3.8997e-01,  6.1987e-02, -8.9578e-02,  1.7686e+00],
        [ 1.2479e+00, -2.2167e+00, -1.0408e+00, -2.0674e-01, -6.6368e-01],
        [-1.0032e+00,  1.4418e-03,  1.8756e-01, -4.3765e-01,  8.0244e-01]],
       device='cuda:2') 
0 tensor([[-1.9568,  1.1444,  0.7593,  0.6334,  0.4570]], device='cuda:2') 
1 tensor([[ 1.2479, -2.2167, -1.0408, -0.2067, -0.6637]], device='cuda:2') 
dataset: tensor([[ 1.6479, -0.9287,  1.4874, -1.4861,  1.3497],
        [ 0.6806,  0.5721,  1.2495, -1.0997, -1.1951],
        [-0.0758, -1.0465, -0.1006,  1.3309, -0.5498],
        [-0.0167,  1.9883,  0.5203,  2.4311, -0.7038],
        [-0.4381,  0.2445,  2.0624, -2.7274,  0.7429],
        [-0.2545,  1.4348, -0.6444,  1.0268, -1.7688],
        [-0.2316, -0.6672,  1.6953,  0.4538,  2.8950],
        [ 0.1620, -1.0992, -0.2960, -0.1933,  2.5532]], device='cuda:3') 
0 tensor([[-0.0167,  1.9883,  0.5203,  2.4311, -0.7038]], device='cuda:3') 
1 tensor([[ 0.1620, -1.0992, -0.2960, -0.1933,  2.5532]], device='cuda:3')

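由上面的输出可见，DistributedSampler（shuffle=False，world_size=4）让每个 rank 从自己的索引 rank 开始、以 world_size 为步长取样，各 GPU 取到的样本索引如下：

'cuda:0': 0,4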

'cuda:1': 1,5

'cuda:2': 2,6

'cuda:3': 3,7