Distributed Data Parallel Framework
With DataParallel, the data is split across GPUs automatically, which turned out to be quite inconvenient. With DistributedDataParallel (DDP) you can design your own sampler and launch one independent process per GPU, so the processes do not interfere with each other; this is much nicer to work with. A small illustration of the sampler-side sharding is given below, followed by the full training script.
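Before the full script, a minimal standalone sketch (my own illustration, not part of the original script) of how DistributedSampler shards dataset indices across ranks; no GPUs or process group are needed when num_replicas and rank are passed explicitly:

from torch.utils.data.distributed import DistributedSampler

dataset = list(range(10))  # anything with __len__ works as a map-style dataset here
for rank in range(2):
    sampler = DistributedSampler(dataset, num_replicas=2, rank=rank, shuffle=False)
    print(rank, list(sampler))
# rank 0 -> [0, 2, 4, 6, 8]
# rank 1 -> [1, 3, 5, 7, 9]

The full DDP training script follows.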
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # must be set before CUDA is initialized
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from itertools import accumulate


class Dataset(torch.utils.data.Dataset):
    """Toy dataset: each sample is a variable-length (n, 10) tensor, with its index as the target."""
    def __init__(self, length):
        super(Dataset, self).__init__()
        self.data_len = length

    def __len__(self):
        return self.data_len

    def __getitem__(self, index):
        n = torch.randint(0, 10, (1,)).item() * 11   # sample length is a multiple of 11 (possibly 0)
        return torch.full((n, 10), index).float() + torch.randn((10,)), index

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc = nn.Linear(10, 1)

    def forward(self, x, s):
        # x is the concatenation of all samples in the batch; s holds each sample's length
        assert x.shape[0] % 11 == 0, f'expected x.shape[0] to be a multiple of 11, got {x.shape[0]}'
        max_length = max(s)
        batch_size = len(s)
        s = [0] + list(accumulate(s))          # prefix sums: sample i occupies rows s[i-1]:s[i]
        x = self.fc(x).view(x.shape[0], -1)    # (n, 10) -> (n, 1)
        # Re-split the flat output per sample and zero-pad each row to max_length
        x = torch.cat([
            torch.cat(
                [x[s[i - 1]:s[i]].view(-1), torch.zeros((max_length - s[i] + s[i - 1])).to(x.device)]
            ).reshape(1, max_length) if s[i - 1] != s[i] else torch.zeros((1, max_length)).to(x.device)
            for i in range(1, len(s))
        ], dim=0)
        assert x.shape == (batch_size, max_length), f'expected shape ({batch_size}, {max_length}), got {x.shape}'
        return torch.max(x, dim=1).values

def collate_fn(batch):
    # Concatenate the variable-length samples along dim 0 and remember each sample's length
    (img, target), sizes = list(zip(*batch)), [x.shape[0] for x, _ in batch]
    return torch.cat(img, dim=0).float(), torch.tensor(target).float(), sizes

# hyper-parameters
batch_size = 4
print_rank = 0
train_print_iter = 100
valid_print_iter = 100

# 1) Initialize the default process group
torch.distributed.init_process_group(backend="nccl")

# 2) Bind each process to its own GPU
# local_rank = torch.distributed.get_rank()
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

train_dataset = Dataset(900)
valid_dataset = Dataset(100)
# 3) Use DistributedSampler so each process sees a disjoint shard of the training set
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=False,             # shuffling is handled by the sampler
                          sampler=DistributedSampler(train_dataset),
                          collate_fn=collate_fn)
valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=batch_size,
                          collate_fn=collate_fn)
# 4) Move the model to its GPU before wrapping
model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")

# Convert BatchNorm layers to SyncBatchNorm before wrapping (a no-op here: this model has none)
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
# 5) Wrap the model with DistributedDataParallel
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[local_rank],
                                                  output_device=local_rank)
# dataset sizes, kept for averaging the logged loss
train_len = len(train_dataset)
valid_len = len(valid_dataset)

for epoch in range(10000):
    # per-epoch accumulators for logging
    logger = {'train_loss': 0.0,
              'valid_loss': 0.0}
    # reshuffle the sampler's shard assignment every epoch
    train_loader.sampler.set_epoch(epoch)
    for i, (img, target, s) in enumerate(train_loader):
        optimizer.zero_grad()
        img = img.to(device)
        target = target.to(device)
        output = model(img, s)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        logger['train_loss'] += loss.item() * target.shape[0]  # .item() avoids keeping the autograd graph; weight by batch size
"""
python3 -m torch.distributed.launch --nproc_per_node=2 /home/Guanjq/Work/GradAccumulate/test_ddp.py
"""
Gradient Accumulation Template
class GradAccumulate:
    def __init__(self, model, optimizer, accumulation_steps):
        self.model = model
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.loss = nn.CrossEntropyLoss()
        self.call_count = 0  # backward calls since the last optimizer step

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def backward(self, outputs, targets):
        # Scale the loss so the accumulated gradient matches a single large-batch update
        loss = self.loss(outputs, targets) / self.accumulation_steps
        loss.backward()

    def step_accumulate(self, outputs, targets):
        self.backward(outputs, targets)
        self.call_count += 1
        # Step and clear the gradients only after enough micro-batches have accumulated;
        # in between, the gradients must be kept, not zeroed.
        if self.call_count % self.accumulation_steps == 0:
            self.step()
            self.zero_grad()
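A usage sketch for the template above; the model, optimizer, and fake data here are placeholders of my own, not part of the original notes:

import torch
from torch import nn

# Hypothetical usage of GradAccumulate; everything below is illustrative only.
model = nn.Linear(10, 3)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
acc = GradAccumulate(model, optimizer, accumulation_steps=4)

acc.zero_grad()
for step in range(20):
    inputs = torch.randn(8, 10)            # fake mini-batch of features
    targets = torch.randint(0, 3, (8,))    # fake class labels
    outputs = model(inputs)
    acc.step_accumulate(outputs, targets)  # optimizer.step() fires every 4th call

Between steps the wrapper keeps accumulating the scaled gradients, so the effective batch size is accumulation_steps times the mini-batch size.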