References:
CUDA Automatic Mixed Precision examples — PyTorch master documentation
pytorch scheduler汇总 — AI大魔王的博客 (CSDN)
Code:
import argparse
import math
import os
import sys
from pathlib import Path
import torch.distributed as dist
import torch
import torchvision
from torch import nn
from torch.optim import lr_scheduler
from torchvision import transforms
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # script root directory (pattern borrowed from YOLOv5)
if str(ROOT) not in sys.path:
sys.path.append(str(ROOT)) # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv('RANK', -1))
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))  # default to 1 when not launched via a distributed launcher
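# Set by torchrun / torch.distributed.launch: LOCAL_RANK is the GPU index on this node,
# RANK the global process index, WORLD_SIZE the total number of processes.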
def parse_opt():
parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=-1, help='Automatic DDP Multi-GPU argument, do not modify')
    parser.add_argument('--freeze', nargs='+', type=int, default=[2], help='Freeze layers: backbone=10, first3=0 1 2')
    parser.add_argument('--weights', type=str, default=ROOT / 'weights/yolov5s.pt',
                        help='initial weights path')  # pretrained-weights path (unused in this demo)
    parser.add_argument('--epochs', type=int, default=1, help='total training epochs')
    parser.add_argument('--cos-lr', action='store_true', default=True, help='cosine LR scheduler')  # note: default=True makes this flag effectively always on
args = parser.parse_args()
print(args.local_rank)
return args
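# Note: depending on the PyTorch version, torch.distributed.launch passes --local_rank
# as a CLI argument and/or sets the LOCAL_RANK env var (torchrun always sets the env var).
# This script reads the env var; the argparse flag mainly keeps the launcher's extra
# argument from crashing the parser.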
def main(args):
    getpid_X = os.getpid()
    print(f'current process id: {getpid_X}')
    print(f'local_rank: {LOCAL_RANK}, rank: {RANK}, world_size: {WORLD_SIZE}')
    if LOCAL_RANK != -1:
        print(f'local_rank: {LOCAL_RANK}')
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        # prefer NCCL for GPU training; fall back to gloo if NCCL is unavailable
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
data_set = torchvision.datasets.MNIST('~/DATA/', train=True,
transform=trans, target_transform=None, download=True)
train_sampler = torch.utils.data.distributed.DistributedSampler(data_set)
data_loader_train = torch.utils.data.DataLoader(dataset=data_set,
batch_size=256,
sampler=train_sampler,
num_workers=2,
pin_memory=True)
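    # DistributedSampler gives each rank a disjoint shard of the dataset, so the
    # effective global batch size per optimizer step is 256 * WORLD_SIZE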
    net = torchvision.models.resnet18(num_classes=10)
    num_ftrs = net.fc.in_features
    net.fc = nn.Linear(num_ftrs, 10)  # redundant here (num_classes=10 already), kept to show the head-replacement pattern
    net = net.cuda()  # lands on this rank's GPU thanks to torch.cuda.set_device above
    # Freeze training: parameters in the listed top-level modules stop receiving gradients
    # freeze = [f'model.{x}.' for x in (args.freeze if len(args.freeze) > 1 else range(args.freeze[0]))]  # YOLOv5-style layer selection
    freeze = ['layer1', 'layer2']
    for k, v in net.named_parameters():
        v.requires_grad = True  # train all layers by default
        # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented out: gave erratic training results)
        # if any(x in k for x in freeze):
        if k.split('.')[0] in freeze:  # match the top-level module name
            print(f'freezing {k}')
            v.requires_grad = False
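    # for resnet18 the check above freezes names like 'layer1.0.conv1.weight' and
    # 'layer2.1.bn2.bias', while conv1, bn1, layer3, layer4 and fc remain trainable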
    # wrap the model with DistributedDataParallel; freezing must happen before wrapping,
    # since DDP registers gradient hooks at construction time
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[LOCAL_RANK],
                                                    output_device=LOCAL_RANK)
criterion = torch.nn.CrossEntropyLoss()
opt = torch.optim.SGD(net.parameters(), lr=0.001)
    # LR schedule: cosine annealing decay (YOLOv5-style) or linear decay
    print(f'cosine LR schedule enabled: {args.cos_lr}')
    if args.cos_lr:
        lf = lambda x: ((1 - math.cos(x * math.pi / args.epochs)) / 2) * (0.01 - 1) + 1  # cosine: 1 -> 0.01
    else:
        lf = lambda x: (1 - x / args.epochs) * (1.0 - 0.01) + 0.01  # linear: 1 -> 0.01
    scheduler = lr_scheduler.LambdaLR(opt, lr_lambda=lf)
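    # sanity check: lf(0) = 1.0 and lf(args.epochs) = 0.01, so with the base lr of 0.001
    # the learning rate decays from 1e-3 down to 1e-5 over the course of training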
    # automatic mixed precision (AMP)
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    for epoch in range(args.epochs):
        train_sampler.set_epoch(epoch)  # reshuffle so each rank sees a different shard order every epoch
        for i, data in enumerate(data_loader_train):
            opt.zero_grad()
            images, labels = data
            images = images.repeat(1, 3, 1, 1)  # MNIST is single-channel; resnet18 expects 3 channels
            # move the batch onto this process's GPU
            print(f'before move, rank {LOCAL_RANK}: images: {images.device}, labels: {labels.device}')
            images = images.to(LOCAL_RANK, non_blocking=True)
            labels = labels.to(LOCAL_RANK, non_blocking=True)
            print(f'after move, rank {LOCAL_RANK}: images: {images.device}, labels: {labels.device}')
            # forward pass under mixed precision
            with torch.cuda.amp.autocast(True):
                outputs = net(images)
                loss = criterion(outputs, labels)
            # backward with loss scaling; unscale before clipping so the norm is computed on true gradients
            scaler.scale(loss).backward()
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=10.0)
            scaler.step(opt)
            scaler.update()
            if i % 10 == 0:
                print("loss: {}".format(loss.item()))
        # the lambda takes an epoch index, so step the scheduler once per epoch;
        # stepping it per batch (as originally written) made the LR swing wildly
        scheduler.step()
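    # tear down the default process group once training completes
    if LOCAL_RANK != -1:
        dist.destroy_process_group()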
if __name__ == "__main__":
args = parse_opt()
main(args)
Training on Kaggle:
!python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 --node_rank=0 --master_addr="localhost" --master_port=12355 ../input/hellow16/dist_test1.py
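Note: torch.distributed.launch has been deprecated in favor of torchrun in recent PyTorch releases. torchrun always exports LOCAL_RANK/RANK/WORLD_SIZE as environment variables, which is exactly what this script reads, so an equivalent launch line should be (untested here, same arguments assumed):
!torchrun --nproc_per_node=2 --nnodes=1 --node_rank=0 --master_addr="localhost" --master_port=12355 ../input/hellow16/dist_test1.py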
During the test run the gradients exploded; the cause was the automatic learning-rate adjustment (the scheduler was originally stepped once per batch while the lambda is defined per epoch, which is fixed in the code above). As for which learning-rate schedule to use in practice, I have not found a good reference.
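A cheap way to debug this kind of learning-rate problem is to dry-run the scheduler without any training and print the learning rate it would use at each step. A minimal sketch, assuming the same cosine lambda as above (epochs=10 is an arbitrary value chosen for illustration):

import math
import torch
from torch.optim import lr_scheduler

epochs = 10  # arbitrary, for illustration only
opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.001)
lf = lambda x: ((1 - math.cos(x * math.pi / epochs)) / 2) * (0.01 - 1) + 1  # same cosine lambda as the training script
scheduler = lr_scheduler.LambdaLR(opt, lr_lambda=lf)
for epoch in range(epochs):
    print(epoch, scheduler.get_last_lr())  # LR the optimizer would use this epoch
    opt.step()        # step the optimizer first to avoid the scheduler-order warning
    scheduler.step()

Stepped once per epoch, this shows a smooth decay from 1e-3 to 1e-5; stepped once per batch instead, the cosine argument overshoots epochs almost immediately and the factor oscillates between 1.0 and 0.01, which plausibly explains the instability observed above.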