1. Setting the random seed
```python
import os
import random

import numpy as np
import torch

seed = 1234
os.environ['PYTHONHASHSEED'] = str(seed)   # fix Python hash randomization
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)                    # CPU RNG
torch.cuda.manual_seed(seed)               # current GPU
torch.cuda.manual_seed_all(seed)           # all GPUs
torch.backends.cudnn.benchmark = False     # disable the cuDNN autotuner (non-deterministic)
torch.backends.cudnn.deterministic = True  # force deterministic cuDNN kernels
```
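If the DataLoader uses multiple workers, the seeds above are not enough on their own. A minimal sketch of worker seeding via `worker_init_fn` and `generator` (assuming the `seed` variable above and some `dataset` object):

```python
from torch.utils.data import DataLoader

def seed_worker(worker_id):
    # derive each worker's NumPy/random seed from torch's initial seed
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)

loader = DataLoader(dataset, batch_size=2, num_workers=4,
                    worker_init_fn=seed_worker, generator=g)
```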
2. Loading parameters
- For hyperparameters, the most common approach is `argparse`:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int, help='node rank for distributed training')
parser.add_argument('--nproc_per_node', default=2, type=int, help='number of processes/GPUs per node')
parser.add_argument('--nnode', default=1, type=int, help='number of nodes')
parser.add_argument('--node_rank', default=0, type=int)
args = parser.parse_args()
```
- Parameters can also be written in a YAML file and loaded like this:

```python
import yaml

# config.yaml:
# model:
#   batch_size: 2
with open('config.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

batch_size = cfg['model']['batch_size']
```
3. Data processing
- Skipping dirty samples in a custom `Dataset`: when a sample fails to load inside `__getitem__`, simply fall through to the next index with `return self.__getitem__(index + 1)` (see the sketch after this list).
- There is also an approach that modifies the library source files directly, but I never managed to get it working.
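A minimal, self-contained sketch of the skip pattern above (the `DirtyTolerantDataset` name and its in-memory sample list are hypothetical):

```python
import torch
from torch.utils.data import Dataset

class DirtyTolerantDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        try:
            data = self.samples[index]
            return torch.as_tensor(data, dtype=torch.float32)
        except Exception:
            # dirty sample: silently try the next one (wrap around to stay in range)
            return self.__getitem__((index + 1) % len(self.samples))
```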
4. Model optimization
- Learning-rate decay strategy. Most papers use warmup + cosine decay; in practice, people mostly just babysit the model and tune the learning rate by hand.

```python
import math
import torch.optim as optim

if lr_decay_type == 'warmup_step':
    # linear warmup for the first `warmup_step` epochs, then cosine decay floored at 0.1
    t, T = warmup_step, epochs
    lr_lambda = lambda epoch: (0.9 * epoch / t + 0.1) if epoch < t \
        else 0.1 if 0.5 * (1 + math.cos(math.pi * (epoch - t) / (T - t))) < 0.1 \
        else 0.5 * (1 + math.cos(math.pi * (epoch - t) / (T - t)))
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
elif lr_decay_type == 'cosine_anneal':
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
elif lr_decay_type == 'linear':
    scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=1, end_factor=0.05, total_iters=epochs * 3117)
else:  # step (effectively constant here because of the huge step_size)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=int(1e9), gamma=0.1, last_epoch=-1)
```
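A minimal usage sketch for the schedulers above, stepping once per epoch (the `train_one_epoch` helper is hypothetical; the `linear` branch above was sized for per-iteration stepping, so step it inside the batch loop instead):

```python
for epoch in range(epochs):
    train_one_epoch(model, train_loader, optimizer, criterion)
    scheduler.step()  # advance the schedule once per epoch
    print(f"epoch {epoch}: lr = {optimizer.param_groups[0]['lr']:.6f}")
```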
- Distributed training + mixed precision to speed up training:

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from apex import amp
from apex.parallel import DistributedDataParallel

# environment variables
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'   # two GPUs
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '23456'
os.environ['RANK'] = '0'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def load_data(data_mode):
    dataset = Dataset(...)
    # data_single = next(iter(dataset))
    # use num_workers to load batches in parallel
    if data_mode == 'train' or data_mode == 'val':
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, drop_last=True,
                            num_workers=4, pin_memory=True, sampler=sampler)
    else:
        loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, drop_last=True,
                            num_workers=0, pin_memory=True)
    return loader

def main_worker(local_rank, args):
    # initialize the process group
    global_rank = local_rank + args.node_rank * args.nproc_per_node
    world_size = args.nnode * args.nproc_per_node
    dist.init_process_group(backend='nccl', init_method='env://',
                            rank=global_rank, world_size=world_size)

    # load data and model
    train_loader = load_data(data_mode='train')
    model = init_model()
    para_num = sum([param.nelement() for param in model.parameters()])
    print('Number of parameters: %.2fM' % (para_num / 1e6))

    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    # apex recommends calling amp.initialize before wrapping with DistributedDataParallel
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    model = DistributedDataParallel(model)

    criterion = Loss(...)
    criterion.cuda(local_rank)

    train()
    dist.destroy_process_group()

def main():
    mp.spawn(main_worker, nprocs=2, args=(args,))

if __name__ == '__main__':
    main()
```
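apex's amp module has been superseded by PyTorch's built-in AMP; a minimal sketch of the equivalent mixed-precision step with `torch.cuda.amp` (assuming `model`, `optimizer`, `criterion`, and `train_loader` are set up as above):

```python
from torch.cuda import amp

scaler = amp.GradScaler()
for inputs, targets in train_loader:
    inputs, targets = inputs.cuda(), targets.cuda()
    optimizer.zero_grad()
    with amp.autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)          # unscale gradients, then apply the update
    scaler.update()                 # adjust the scale factor for the next step
```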
- Monitoring gradients and autograd anomalies:

```python
# enable autograd anomaly detection for the forward pass
torch.autograd.set_detect_anomaly(True)

# inspect parameters and their gradients before/after the update
for name, parms in model.named_parameters():
    print('-->name:', name)
    print('-->para:', parms)
    print('-->grad_requires:', parms.requires_grad)
    print('-->grad_value:', parms.grad)

# detect anomalies during the backward pass
from apex import amp  # or: from torch.cuda import amp
with torch.autograd.detect_anomaly():
    if use_amp:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward(retain_graph=True)
```
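In practice anomaly detection usually ends up pointing at a non-finite gradient; a minimal sketch of an explicit check right after `backward()` (the error message format is my own):

```python
for name, parms in model.named_parameters():
    if parms.grad is not None and not torch.isfinite(parms.grad).all():
        raise RuntimeError(f'non-finite gradient in {name}')
```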
- Resuming training:

```python
# load the pretrained model weights
pretrained_dict = torch.load(pretrained_model_path + pretrained_model, map_location='cpu')['state_dict']
model.load_state_dict(pretrained_dict, strict=False)

# load the pretrained optimizer state
pretrained_dict = torch.load(pretrained_model_path + pretrained_model, map_location='cpu')['optimizer']
optimizer.load_state_dict(pretrained_dict)
for param_group in optimizer.param_groups:
    # override the learning rate separately
    param_group['lr'] = lr
del pretrained_dict

# save the trained model
checkpoint = {
    # 'model': Model(),  # not recommended when the model is large
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}
checkpoint_path = output_path + 'checkpoint_better.pkl'
torch.save(checkpoint, checkpoint_path)
```
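To resume the schedule as well as the weights, the checkpoint can also carry the epoch counter and scheduler state; a minimal sketch under that assumption (the `epoch` and `scheduler` variables come from your training loop):

```python
# save
checkpoint = {
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
    'epoch': epoch,
}
torch.save(checkpoint, checkpoint_path)

# resume
checkpoint = torch.load(checkpoint_path, map_location='cpu')
scheduler.load_state_dict(checkpoint['scheduler'])
start_epoch = checkpoint['epoch'] + 1
```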
- Other tips:
  - "17 ways to speed up PyTorch training"
  - "Efficient training on a single GPU"
  - adamw_apex_fused
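The last item, adamw_apex_fused, refers to apex's fused AdamW kernel (this is also the name the HuggingFace Trainer uses for it); a minimal sketch of using the apex optimizer directly (assumption: apex is built with its CUDA extensions, and the lr/weight_decay values are only examples):

```python
from apex.optimizers import FusedAdam

# adam_w_mode=True makes FusedAdam behave as AdamW (decoupled weight decay)
optimizer = FusedAdam(model.parameters(), lr=1e-4, weight_decay=0.01, adam_w_mode=True)
```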