【笔记】ray.tune:超参数最优化(3)

生成的文件:

 

Code:

from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler


# 定义神经网络模型
class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)  # 参数待指定
        self.fc2 = nn.Linear(l1, l2)  # 参数待指定
        self.fc3 = nn.Linear(l2, 10)  # 参数待指定

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# 封装数据加载过程,传递全局数据路径,以保证不同实验间共享数据路径
def load_data(data_dir="./image1"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    return trainset



# 封装训练脚本
# config参数用于指定超参数
# checkpoint_dir参数用于存储检查点
# data_dir参数用于指定数据加载和存储路径
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    # 模型实例化
    net = Net(config["l1"], config["l2"])  # 2个超参数

    # 这种写法保证没有GPU可用时模型也可以训练
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            # 将模型封装到nn.DataParallel中以支持多GPU并行训练
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)  # 1个超参数


    # 用于存储检查点
    if checkpoint_dir:
        # 模型的状态、优化器的状态
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        ptimizer.load_state_dict(optimizer_state)

    trainset = load_data(data_dir)
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),  # 1个超参数
        shuffle=True,
        num_workers=8)

    for epoch in range(4):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0

        # 训练循环
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # # 保存检查点
        # ray.tune.checkpoint_dir(step) #返回检查点路径
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)
        # tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
        tune.report(loss=0, accuracy=1)
    print("Finished Training")


def main(num_samples=3, max_num_epochs=4, gpus_per_trial=2):
    # 全局文件路径
    data_dir = os.path.abspath("./images")
    # 加载训练数据
    load_data(data_dir)
    # 配置超参数搜索空间
    # 每次实验,Ray Tune会随机采样超参数组合,并行训练模型,找到最优参数组合
    config = {
        # 自定义采样方法
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),   # [2 , 9)
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        # 随机分布采样
        "lr": tune.loguniform(1e-4, 1e-1),
        # 从类别型值中随机选择
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    # ASHAScheduler会根据指定标准提前中止坏实验
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    # 在命令行打印实验报告
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    # 执行训练过程
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        # 指定训练资源
        resources_per_trial={"cpu": 12, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    # 找出最佳实验
    best_trial = result.get_best_trial("loss", "min", "last")
    # 打印最佳实验的参数配置
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))


if __name__ == "__main__":
    # Script entry point: sample 3 configurations with max_num_epochs=4.
    # You can change the number of GPUs per trial here; 0 requests no GPU
    # in the per-trial resources.
    main(num_samples=3, max_num_epochs=4, gpus_per_trial=0)

注:

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

程序猿的探索之路

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值