A well-written article on this topic, for reference: https://zhuanlan.zhihu.com/p/393648544

Because GPU memory is limited and the batch size could not be increased, I turned to torch.nn.parallel.DistributedDataParallel. Pulling together many references, I put together a version of the code that actually runs (oddly, it suddenly started working and I am not sure what was wrong before).

Only 3 main steps need to be added (marked step1/step2/step3 in the code below):
import torch, torchvision, argparse
from torchvision import transforms
if __name__ == '__main__':
    # step1: initialize the process group
    # rank numbers the individual hosts/processes (similar in spirit to local_rank); world_size is how many
    # hosts take part (set it to 1 if there is only one).
    # 'nccl' is the communication backend; init_method specifies how the machines talk to each other.
    torch.distributed.init_process_group(backend='nccl')  # , init_method='env://', rank=0, world_size=1
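    # (Note: after init_process_group, each process can query its own identity with
    #  torch.distributed.get_rank() and torch.distributed.get_world_size().)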
    parse = argparse.ArgumentParser()
    # When the script is launched with torch.distributed.launch, this argument really is filled in
    # automatically and corresponds to the GPU index on the current node (see the launch command
    # at the end of this post).
    parse.add_argument('--local_rank', type=int, default=-1)
    args = parse.parse_args()
    torch.cuda.set_device(args.local_rank)
    device = torch.device('cuda', args.local_rank)

    batch_size = int(1024 / 4)
    epochs = 20
    lr = 0.001

    print(f"If distributed training succeeds, this line will be printed {torch.cuda.device_count()} times!\n")

    model = torchvision.models.resnet18().to(device)
    # step2: distribute (wrap) the model
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
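    # (DistributedDataParallel keeps a full model replica in every process and all-reduces the
    #  gradients during backward, so all replicas stay in sync while each process only works on
    #  its own shard of the data.)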
    train_dataset = torchvision.datasets.CIFAR10(
        root='/root/localdisk/wxx/dataset',
        train=True,
        download=False,
        transform=transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
            ),
        ]))
    # step3: distribute the data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        shuffle=True,
    )
    # This batch_size is the per-GPU batch size; the effective global batch size is
    # batch_size * torch.cuda.device_count().
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=4,
        pin_memory=False,
        sampler=train_sampler,
    )
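    # (DistributedSampler hands each process a disjoint 1/world_size shard of the dataset, so one
    #  epoch on a single GPU iterates over roughly len(train_dataset) / world_size samples.)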
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=lr * 2,  # learning rate scaled up manually here, presumably to match the larger effective batch size
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )
    if args.local_rank == 0:
        print(" ======= Training ======= \n")

    model.train()
    for ep in range(1, epochs + 1):
        train_loss = correct = total = 0
        # In distributed mode, call set_epoch() at the start of every epoch, before creating the
        # DataLoader iterator, so that shuffling works correctly across epochs. Otherwise the
        # iterator yields the data in the same order every epoch.
        train_loader.sampler.set_epoch(ep)
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()

            if args.local_rank == 0 and ((idx + 1) % 25 == 0 or (idx + 1) == len(train_loader)):
                print(
                    " == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        ep,
                        epochs,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )

    if args.local_rank == 0:
        print("\n ======= Training Finished ======= \n")