UNet train.py walkthrough

import argparse
import logging
import sys
from pathlib import Path

import torch
import wandb
import torch.nn.functional as F
from torch import optim, nn
from torch.utils.data import random_split, DataLoader
from tqdm import tqdm

from evaluate import evaluate
from unet import Unet
from utils import dice_loss
from utils import *

dir_img = Path('../../new20211123/JPEGImages')
dir_mask = Path('../../new20211123/heatmap')
dir_checkpoint = Path('../../new20211123/checkpoints/')
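
The training loop below reads each batch as batch['image'] and batch['mask'], so whatever dataset class is used must return a dict with a float image tensor of shape [C, H, W] and an integer mask tensor of shape [H, W]. The real BasicDataset / CarvanaDataset live in utils and also handle rescaling by img_scale; the ToyMaskDataset below is only a hypothetical stand-in to make that contract concrete, not part of train.py.

# Hypothetical sketch of the dict interface the training loop expects;
# the real datasets read images and heatmap masks from disk and rescale them by img_scale.
import torch
from torch.utils.data import Dataset

class ToyMaskDataset(Dataset):
    def __init__(self, n_samples: int = 8, size: int = 64, n_classes: int = 2):
        self.n_samples, self.size, self.n_classes = n_samples, size, n_classes

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        image = torch.rand(3, self.size, self.size)                      # RGB image, [C, H, W]
        mask = torch.randint(0, self.n_classes, (self.size, self.size))  # per-pixel class index, [H, W]
        return {'image': image, 'mask': mask}
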
def train_net(net,
              device,
              epochs: int = 5,
              batch_size: int = 1,
              learning_rate: float = 0.001,
              val_percent: float = 0.1,
              save_checkpoint: bool = True,
              img_scale: float = 0.5,
              amp: bool = False):
    # 1. Create the dataset
    # Try the Carvana-style dataset first; fall back to the generic BasicDataset if that fails
    try:
        dataset = CarvanaDataset(dir_img, dir_mask, img_scale)
    except (AssertionError, RuntimeError):
        dataset = BasicDataset(dir_img, dir_mask, img_scale)

    # 2. Split into train / validation partitions
    # Compute the train / val sizes from the val_percent ratio
    n_val = int(len(dataset) * val_percent)
    n_train = len(dataset) - n_val
    # Randomly split the dataset into disjoint train and val subsets (fixed seed for reproducibility)
    train_set, val_set = random_split(dataset, [n_train, n_val], generator=torch.Generator().manual_seed(0))

    # 3. Create data loaders
    # Wrap the train / val subsets in DataLoaders
    loader_args = dict(batch_size=batch_size, num_workers=4, pin_memory=True)
    train_loader = DataLoader(train_set, shuffle=True, **loader_args)
    val_loader = DataLoader(val_set, shuffle=False, drop_last=True, **loader_args)

    # (Initialize logging)
    experiment = wandb.init(project='U-Net', resume='allow', anonymous='must')
    experiment.config.update(dict(epochs=epochs, batch_size=batch_size, learning_rate=learning_rate,
                                  val_percent=val_percent, save_checkpoint=save_checkpoint, img_scale=img_scale,
                                  amp=amp))
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_checkpoint}
        Device:          {device.type}
        Images scaling:  {img_scale}
        Mixed Precision: {amp}
    ''')

    # 4. Set up the optimizer, the loss, the learning rate scheduler and the loss scaling for AMP
    # RMSprop (root mean square propagation) damps the oscillations of plain gradient descent and speeds up convergence
    optimizer = optim.RMSprop(net.parameters(), lr=learning_rate, weight_decay=1e-8, momentum=0.9)
    # ReduceLROnPlateau lowers the learning rate once the monitored metric stops improving;
    # mode='max' because the scheduler is fed the validation Dice score, which we want to maximize
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2)  # goal: maximize Dice score
    # GradScaler enables automatic mixed precision: it scales the loss up to prevent gradient underflow and speeds up training
    grad_scaler = torch.cuda.amp.GradScaler(enabled=amp)
    # Cross-entropy loss
    criterion = nn.CrossEntropyLoss()
    global_step = 0

    # 5. Begin training
    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img') as pbar:
            for batch in train_loader:
                images = batch['image']
                true_masks = batch['mask']

                assert images.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {images.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'

                # Move images and masks to the target device (GPU memory)
                images = images.to(device=device, dtype=torch.float32)
                true_masks = true_masks.to(device=device, dtype=torch.long)

                # Run the forward pass under autocast (automatic mixed precision) to speed up training
                with torch.cuda.amp.autocast(enabled=amp):
                    masks_pred = net(images)
                    # The loss is the sum of the cross-entropy loss and the Dice loss
                    loss = criterion(masks_pred, true_masks) \
                            + dice_loss(F.softmax(masks_pred, dim=1).float(),
                                        F.one_hot(true_masks, net.n_classes).permute(0, 3, 1, 2).float(),
                                        multiclass=True)

                # Zero the gradients (set_to_none=True frees them instead of filling with zeros)
                optimizer.zero_grad(set_to_none=True)
                # Backpropagate on the scaled loss (GradScaler handles the AMP loss scaling)
                grad_scaler.scale(loss).backward()
                # Unscale the gradients and take an optimizer step
                grad_scaler.step(optimizer)
                # Update the scale factor for the next iteration
                grad_scaler.update()

                # Advance the progress bar by the number of images in the batch
                pbar.update(images.shape[0])
                global_step += 1
                # Accumulate the batch loss into the epoch loss
                epoch_loss += loss.item()
                # Log the training loss to wandb
                experiment.log({
                    'train loss': loss.item(),
                    'step': global_step,
                    'epoch': epoch
                })
                # Show the current batch loss in the progress bar postfix
                pbar.set_postfix(**{'loss (batch)': loss.item()})

                # Evaluation round
                # Run validation roughly 10 times per epoch,
                # i.e. every n_train // (10 * batch_size) training steps
                division_step = (n_train // (10 * batch_size))
                if division_step > 0:
                    if global_step % division_step == 0:
                        histograms = {}
                        for tag, value in net.named_parameters():
                            # Log histograms of the weights and gradients
                            tag = tag.replace('/', '.')
                            histograms['Weights/' + tag] = wandb.Histogram(value.data.cpu())
                            histograms['Gradients/' + tag] = wandb.Histogram(value.grad.data.cpu())

                        # Dice score on the validation set
                        val_score = evaluate(net, val_loader, device)
                        # Adjust the learning rate based on the validation Dice score
                        scheduler.step(val_score)

                        # Log the validation metrics, sample predictions and histograms
                        logging.info('Validation Dice score: {}'.format(val_score))
                        experiment.log({
                            'learning rate': optimizer.param_groups[0]['lr'],
                            'validation Dice': val_score,
                            'images': wandb.Image(images[0].cpu()),
                            'masks': {
                                'true': wandb.Image(true_masks[0].float().cpu()),
                                'pred': wandb.Image(torch.softmax(masks_pred, dim=1).argmax(dim=1)[0].float().cpu()),
                            },
                            'step': global_step,
                            'epoch': epoch,
                            **histograms
                        })
        # Save a checkpoint at the end of each epoch
        if save_checkpoint:
            Path(dir_checkpoint).mkdir(parents=True, exist_ok=True)
            # Save the model weights for this epoch
            torch.save(net.state_dict(), str(dir_checkpoint / 'checkpoint_epoch{}.pth'.format(epoch + 1)))
            logging.info(f'Checkpoint {epoch + 1} saved!')
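
The combined loss above feeds dice_loss the softmax probabilities of shape [N, C, H, W] together with a one-hot target of the same shape (F.one_hot yields [N, H, W, C], hence the permute to [N, C, H, W]). The utils implementation of dice_loss is not shown in this post; the sketch below is a minimal multiclass Dice loss that matches how it is called here, not necessarily the author's exact code.

# Minimal sketch of a multiclass Dice loss consistent with the call in train_net;
# the real implementation lives in utils.
import torch
from torch import Tensor

def dice_coeff(probs: Tensor, target: Tensor, epsilon: float = 1e-6) -> Tensor:
    # probs / target: [N, H, W] predicted probabilities and one-hot targets for a single class
    inter = 2 * (probs * target).sum(dim=(-1, -2))
    union = probs.sum(dim=(-1, -2)) + target.sum(dim=(-1, -2))
    return ((inter + epsilon) / (union + epsilon)).mean()

def dice_loss(probs: Tensor, target: Tensor, multiclass: bool = True) -> Tensor:
    # probs / target: [N, C, H, W]; Dice loss = 1 - mean Dice coefficient over classes
    if multiclass:
        dice = torch.stack([dice_coeff(probs[:, c], target[:, c])
                            for c in range(probs.shape[1])]).mean()
    else:
        dice = dice_coeff(probs, target)
    return 1 - dice
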


def get_args():
    parser = argparse.ArgumentParser(description='Train the UNet on images and target masks')
    parser.add_argument('--epochs', '-e', metavar='E', type=int, default=5, help='Number of epochs')
    parser.add_argument('--batch-size', '-b', dest='batch_size', metavar='B', type=int, default=1, help='Batch size')
    parser.add_argument('--learning-rate', '-l', metavar='LR', type=float, default=0.00001,
                        help='Learning rate', dest='lr')
    parser.add_argument('--load', '-f', type=str, default=False, help='Load model from a .pth file')
    parser.add_argument('--scale', '-s', type=float, default=0.5, help='Downscaling factor of the images')
    parser.add_argument('--validation', '-v', dest='val', type=float, default=10.0,
                        help='Percent of the data that is used as validation (0-100)')
    parser.add_argument('--amp', action='store_true', default=False, help='Use mixed precision')

    return parser.parse_args()
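
A quick way to see what get_args produces (the flag values below are just an example):

# Example: parse a sample command line, equivalent to running
#   python train.py --epochs 10 --batch-size 2 --learning-rate 1e-4 --scale 0.5 --amp
import sys
sys.argv = ['train.py', '--epochs', '10', '--batch-size', '2',
            '--learning-rate', '1e-4', '--scale', '0.5', '--amp']
args = get_args()
print(args.epochs, args.batch_size, args.lr, args.scale, args.val, args.amp)
# -> 10 2 0.0001 0.5 10.0 True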

if __name__ == '__main__':
    args = get_args()

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'Using device {device}')

    # Change here to adapt to your data
    # n_channels=3 for RGB images
    # n_classes is the number of probabilities you want to get per pixel
    net = Unet(n_channels=3, n_classes=2, bilinear=True)

    logging.info(f'Network:\n'
                 f'\t{net.n_channels} input channels\n'
                 f'\t{net.n_classes} output channels (classes)\n'
                 f'\t{"Bilinear" if net.bilinear else "Transposed conv"} upscaling')

    if args.load:
        net.load_state_dict(torch.load(args.load, map_location=device))
        logging.info(f'Model loaded from {args.load}')

    net = net.to(device=device)
    try:
        train_net(net=net,
                  epochs=args.epochs,
                  batch_size=args.batch_size,
                  learning_rate=args.lr,
                  device=device,
                  img_scale=args.scale,
                  val_percent=args.val / 100,
                  amp=args.amp)

    except KeyboardInterrupt:
        torch.save(net.state_dict(), 'INTERRUPTED.pth')
        logging.info('Saved interrupt')
        sys.exit(0)
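
One last note on evaluate(net, val_loader, device): the training loop treats it as returning a single scalar Dice score over the validation set (higher is better), which is why the scheduler is stepped with it in 'max' mode. The actual implementation lives in evaluate.py and is not reproduced in this post; the sketch below only illustrates that assumed contract.

# Sketch of the behaviour the training loop assumes from evaluate();
# the real code is in evaluate.py.
import torch
import torch.nn.functional as F

@torch.no_grad()
def evaluate(net, dataloader, device):
    net.eval()
    dice_total, num_batches = 0.0, 0
    for batch in dataloader:
        images = batch['image'].to(device=device, dtype=torch.float32)
        true_masks = batch['mask'].to(device=device, dtype=torch.long)
        masks_pred = net(images)
        pred_onehot = F.one_hot(masks_pred.argmax(dim=1), net.n_classes).permute(0, 3, 1, 2).float()
        true_onehot = F.one_hot(true_masks, net.n_classes).permute(0, 3, 1, 2).float()
        # Dice over the foreground classes only (channel 0 assumed to be background)
        inter = 2 * (pred_onehot[:, 1:] * true_onehot[:, 1:]).sum(dim=(-1, -2))
        union = pred_onehot[:, 1:].sum(dim=(-1, -2)) + true_onehot[:, 1:].sum(dim=(-1, -2))
        dice_total += ((inter + 1e-6) / (union + 1e-6)).mean().item()
        num_batches += 1
    net.train()
    return dice_total / max(num_batches, 1)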
