[torch] DistributedDataParallel Training Framework + Gradient Accumulation Template

This article shows how to use PyTorch's DistributedDataParallel framework to run data-parallel training across multiple GPUs. With a custom sampler and one process per GPU, the data split becomes more flexible and training more efficient. It also shows how to configure and wrap the model for a distributed environment.

Distributed Data Parallel Framework

With DataParallel, the data was split across GPUs automatically, which I found inconvenient. With DDP you can design your own sampler and launch one process per GPU; the processes don't interfere with each other, which is much nicer to work with.
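By "design your own sampler" I mean that each process only yields the indices assigned to its rank. The demo script below simply uses the built-in DistributedSampler, which does exactly this partitioning; as a rough sketch of the idea, a hand-rolled version might look like this (RankSampler is a hypothetical name, not part of the script):

import torch
from torch.utils.data import Sampler

class RankSampler(Sampler):
    # hypothetical sketch: a strided, rank-aware index partition
    def __init__(self, dataset, rank, world_size):
        self.dataset = dataset
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        # rank r yields indices r, r + world_size, r + 2 * world_size, ...
        return iter(range(self.rank, len(self.dataset), self.world_size))

    def __len__(self):
        return len(range(self.rank, len(self.dataset), self.world_size))

The full DDP demo script: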

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

import torch
from torch import nn
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from itertools import accumulate

class Dataset(torch.utils.data.Dataset):
    def __init__(self, len):
        super(Dataset, self).__init__()
        self.data_len = len
    def __len__(self):
        return self.data_len
    def __getitem__(self, index):
        # each sample has a random number of rows (a multiple of 11, possibly 0)
        n = torch.randint(0, 10, (1, )).item() * 11
        return torch.full((n, 10), index).float() + torch.randn((10, )), index


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc = nn.Linear(10, 1)
    def forward(self, x, s):
        assert x.shape[0] % 11 == 0, f'expected x.shape[0] to be a multiple of 11, got {x.shape[0]}'
        max_length = max(s)
        batch_size = len(s)
        s = [0] + list(accumulate(s))
        x = self.fc(x).view(x.shape[0], -1)  # (n, 10) -> (n, 1)
        # pad each sample's per-row outputs to max_length (all-zero row for an empty sample)
        rows = []
        for i in range(1, len(s)):
            if s[i - 1] == s[i]:
                rows.append(torch.zeros((1, max_length), device=x.device))
            else:
                row = x[s[i - 1]:s[i]].view(-1)
                pad = torch.zeros((max_length - row.shape[0],), device=x.device)
                rows.append(torch.cat([row, pad]).reshape(1, max_length))
        x = torch.cat(rows, dim=0)
        assert x.shape == (batch_size, max_length), f'expected shape ({batch_size}, {max_length}), got {x.shape}'
        return torch.max(x, dim=1).values

def collate_fn(batch):
    # concatenate the variable-length samples along dim 0 and keep each sample's row count
    (img, target), sizes = list(zip(*batch)), [x.shape[0] for x, _ in batch]
    return torch.cat(img, dim=0).float(), torch.tensor(target).float(), sizes

# hyper parameters
batch_size = 4
print_rank = 0
train_print_iter = 100
valid_print_iter = 100

# 1) initialize the default process group (NCCL backend for multi-GPU training)
torch.distributed.init_process_group(backend="nccl")

# 2) bind each process to its own GPU
# local_rank = torch.distributed.get_rank()  # alternative: query the rank from torch.distributed
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

train_dataset = Dataset(900)
valid_dataset = Dataset(100)

# 3) use DistributedSampler so each process only sees its own shard of the data
train_loader = DataLoader(dataset=train_dataset,
                            batch_size=batch_size,
                            shuffle=False, 
                            sampler=DistributedSampler(train_dataset),
                            collate_fn=collate_fn)
valid_loader = DataLoader(dataset=valid_dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn)

# 4) move the model to its GPU before wrapping it
model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # 5) convert BatchNorm layers to SyncBatchNorm first, then wrap with DDP
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[local_rank],
                                                      output_device=local_rank)

# dataset sizes (used when averaging the logged loss)
train_len = len(train_dataset)
valid_len = len(valid_dataset)


for epoch in range(10000):
    # per-epoch accumulators for logging
    logger = {'train_loss': 0.0,
                'valid_loss': 0.0}
    train_loader.sampler.set_epoch(epoch)  # reshuffle the DistributedSampler each epoch
    for i, (img, target, s) in enumerate(train_loader):
        optimizer.zero_grad()
        img = img.cuda()
        target = target.cuda()
        output = model(img, s)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        logger['train_loss'] += loss.item() * img.shape[0]  # .item() avoids keeping the autograd graph alive
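
    # --- hypothetical epoch-end reporting (not in the original script) ---
    # each rank only saw its own shard of the data, so sum the per-rank loss
    # totals across processes before averaging, and print on one rank only
    total_loss = torch.tensor(logger['train_loss'], device=device)
    torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM)
    if local_rank == print_rank:
        print(f'epoch {epoch}: train loss {total_loss.item() / train_len:.4f}')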
    
"""

python3 -m torch.distributed.launch --nproc_per_node=2 /home/Guanjq/Work/GradAccumulate/test_ddp.py

"""

Gradient Accumulation Template

class GradAccumulate:
    def __init__(self, model, optimizer, accumulation_steps):
        self.model = model
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.loss = nn.CrossEntropyLoss()
        self.step_count = 0  # counts calls to step_accumulate

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def backward(self, outputs, targets):
        loss = self.loss(outputs, targets)
        loss = loss / self.accumulation_steps
        loss.backward()

    def step_accumulate(self, outputs, targets):
        self.backward(outputs, targets)
        self.step_count += 1
        # only update the weights (and clear the accumulated gradients)
        # every accumulation_steps calls; otherwise keep accumulating
        if self.step_count % self.accumulation_steps == 0:
            self.step()
            self.zero_grad()
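
A minimal usage sketch (model, loader, and accumulation_steps=4 below are placeholder choices, not part of the template):

# hypothetical usage of GradAccumulate; model and loader are placeholders
model = nn.Linear(10, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
trainer = GradAccumulate(model, optimizer, accumulation_steps=4)

trainer.zero_grad()
for inputs, targets in loader:
    outputs = model(inputs)
    # scales the loss by 1/accumulation_steps, backpropagates, and only calls
    # optimizer.step() + zero_grad() on every accumulation_steps-th call
    trainer.step_accumulate(outputs, targets)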
            