In an IDE such as PyCharm, it is hard to launch code the way described in 从单卡到多卡(DDP使用方式,附代码)(一), i.e., via the usual DDP launcher; in practice you end up running on a single GPU. So how can you start a multi-GPU run from inside PyCharm?
Below we use torch.multiprocessing to drive DDP. With this approach both single-GPU and multi-GPU programs can be run directly from the IDE; the only thing you need to set is the CUDA_VISIBLE_DEVICES environment variable. The code is as follows:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp
import os


class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc = nn.Linear(28 * 28, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc(x)
        return x


def main_worker(rank, world_size):
    # Initialize the DDP environment:
    # set the environment variables required for distributed training
    os.environ['MASTER_ADDR'] = 'localhost'  # or the IP address of the master node
    os.environ['MASTER_PORT'] = '12355'      # pick an unused port
    torch.cuda.set_device(rank)
    dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)

    # Model, loss function, data loading, etc.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    transform = transforms.ToTensor()
    train_dataset = datasets.MNIST(root='./', train=True, download=False, transform=transform)
    # DistributedSampler shards the training set so each process sees a different subset
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)
    test_dataset = datasets.MNIST('./', train=False, download=False, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=1000)

    model = SimpleNN().to(device)
    model = DDP(model, device_ids=[rank])
    optimizer = optim.SGD(model.parameters(), lr=0.01 * world_size)

    # Training loop
    for epoch in range(5):
        model.train()
        train_sampler.set_epoch(epoch)  # so shuffling differs across epochs
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

    # Evaluate and save the model on rank 0 only
    if rank == 0:
        model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
        print(
            f"\nTest set: Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset)}%)\n")
        # Save the underlying model's weights (without the DDP 'module.' prefix)
        torch.save(model.module.state_dict(), 'ddp_model.pth')

    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True)
When using multiprocessing.spawn, the first argument of the target function (here, main_worker) is supplied automatically by spawn and is the index (rank) of the current process; args only provides the remaining arguments. nprocs is the number of processes to launch, which here equals the number of GPUs to use.
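To make that calling convention concrete, here is a minimal, self-contained sketch (demo_worker, world_size, and message are illustrative names, not part of the training script above): spawn invokes the function as fn(i, *args) for every i in range(nprocs).

import torch.multiprocessing as mp

def demo_worker(rank, world_size, message):
    # rank (0 .. nprocs-1) is injected automatically by mp.spawn;
    # world_size and message come from the args=(...) tuple
    print(f"process {rank}/{world_size}: {message}")

if __name__ == '__main__':
    # spawn calls demo_worker(0, 2, 'hello') and demo_worker(1, 2, 'hello')
    mp.spawn(demo_worker, args=(2, 'hello'), nprocs=2, join=True)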
With the default CUDA_VISIBLE_DEVICES=0 the script runs on a single GPU; with CUDA_VISIBLE_DEVICES=1,2 it runs on the two specified GPUs. In the latter case the visible GPUs are re-indexed inside the process as cuda:0 and cuda:1, so torch.cuda.set_device(rank) still addresses the right device. Either way, you can see that the training results come out essentially the same.
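You can set CUDA_VISIBLE_DEVICES in PyCharm's Run/Debug Configuration (the Environment variables field), or, as a sketch shown below (the value '1,2' is just an example device list, not a requirement of the script), directly at the very top of the script before CUDA is initialized.

import os

# Must be set before CUDA is initialized; putting it ahead of the torch import is safest.
# With '1,2', the two physical GPUs are exposed to the process as cuda:0 and cuda:1.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '1,2')

import torch
print(torch.cuda.device_count())  # reflects only the visible GPUs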