PyTorch Learning (11): Checkpoints

When training a large model, a mid-run power failure can throw away hours or days of work: on the next run you would have to start over from epoch 0. To recover quickly from such accidents, the training loop should periodically save a checkpoint containing the epoch counter, the network weights, the optimizer state, and the learning-rate scheduler state (epoch, net, optimizer, scheduler).

This post shows how to save the best model during training and how to load it back automatically.
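
In essence a checkpoint is just a Python dictionary of state_dicts that torch.save pickles to disk and torch.load reads back. Here is a minimal, self-contained sketch of the pattern (the tiny linear model and the file name are placeholders, not part of the program below):

import torch
from torch import nn

# stand-ins: any nn.Module / optimizer / scheduler works the same way
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda e: 1 / (e + 1))
epoch = 3

checkpoint = {"epoch": epoch,
              "net": model.state_dict(),
              "optimizer": optimizer.state_dict(),
              "lr_schedule": scheduler.state_dict()}
torch.save(checkpoint, "ckpt.pth")             # pickle the dict to disk

ckpt = torch.load("ckpt.pth")                  # read it back after a restart
model.load_state_dict(ckpt["net"])
optimizer.load_state_dict(ckpt["optimizer"])
scheduler.load_state_dict(ckpt["lr_schedule"])
start_epoch = ckpt["epoch"] + 1                # continue from the next epoch
print(start_epoch)                             # 4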

1. First, look at the code

1.1 Saving the best model

The code that saves the best model looks like this:

        # save only after the first third of training, and only when the
        # test loss improves on the best value seen so far
        if epoch > int(num_epoches/3) and test_loss_value < min_loss_val:
            min_loss_val = test_loss_value
            checkpoint = {"epoch": epoch,
                          "net": model.state_dict(),
                          "optimizer": optimer.state_dict(),
                          "lr_schedule": scheduler_1.state_dict()}

            if not os.path.isdir(r'tf_logs/' + "save_module"):
                os.makedirs("tf_logs/" + "save_module")
            PATH = r'tf_logs/' + "save_module" + "/ckpt_best_%s.pth" % (str(epoch + 1))
            torch.save(checkpoint, PATH)
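
An optional refinement, not in the original script: min_loss_val itself can be stored in the same dictionary, so that a resumed run also restores the best-loss threshold instead of resetting it:

            checkpoint = {"epoch": epoch,
                          "net": model.state_dict(),
                          "optimizer": optimer.state_dict(),
                          "lr_schedule": scheduler_1.state_dict(),
                          "min_loss_val": min_loss_val}  # restore on resume: min_loss_val = checkpointResume["min_loss_val"]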

Because min_loss_val is defined as a global variable, every function that assigns to it must declare it again with global min_loss_val; otherwise Python raises

 local variable 'min_loss_val' referenced before assignment 
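
A stripped-down illustration of the error and its fix, with hypothetical names unrelated to the training script:

counter = 0

def broken():
    counter = counter + 1   # UnboundLocalError: local variable 'counter' referenced before assignment

def fixed():
    global counter          # use the module-level variable instead of creating a local one
    counter = counter + 1

fixed()
print(counter)   # 1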

1.2 Automatically loading the model

# Find the checkpoint file with the largest epoch number in a folder
# (relies on the os and re modules imported in the full program below)
def getBestModuleFilename(browser):
    file_name = browser               # e.g. "tf_logs/save_module"
    filenames = os.listdir(file_name)
    best_file = ""
    best_epoch = -1
    for name in filenames:
        match = re.search(r"\d+", name)   # the epoch number in "ckpt_best_<n>.pth"
        if match and int(match.group()) > best_epoch:
            best_epoch = int(match.group())
            best_file = name
    print(best_file)
    return best_file                  # "" when the folder holds no checkpoint
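
For example, with files ckpt_best_4.pth, ckpt_best_7.pth and ckpt_best_9.pth in the folder, the function prints and returns ckpt_best_9.pth. A quick self-contained check (the temporary folder is only for illustration):

import os, tempfile

folder = tempfile.mkdtemp()
for n in (4, 7, 9):
    open(os.path.join(folder, "ckpt_best_%d.pth" % n), "w").close()
print(getBestModuleFilename(folder))   # -> ckpt_best_9.pth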

Resume = False

    if not Resume:
        start_epoch = 0
    else:
        # find the .pth file with the largest epoch number
        path_checkpoint = r'tf_logs/' + "save_module"
        best_path_checkpoint = getBestModuleFilename(path_checkpoint)
        if best_path_checkpoint == "":
            return
        else:
            # load the file itself, not the folder path
            checkpointResume = torch.load(os.path.join(path_checkpoint, best_path_checkpoint))
            start_epoch = checkpointResume["epoch"] + 1   # resume from the epoch after the saved one
            model.load_state_dict(checkpointResume["net"])
            optimer.load_state_dict(checkpointResume["optimizer"])
            scheduler_1.load_state_dict(checkpointResume["lr_schedule"])
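
One caveat: if the checkpoint was written on a GPU machine but is reloaded on a CPU-only one (or the other way around), torch.load needs a map_location argument. Assuming the device variable from the full program below, something like this should work:

            checkpointResume = torch.load(os.path.join(path_checkpoint, best_path_checkpoint),
                                          map_location=device)   # remap saved tensors onto the current device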

2. Complete code

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import numpy as np
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import LambdaLR
import os
import re

cur_pwd_path = os.getcwd()

# Find the checkpoint file with the largest epoch number in a folder
def getBestModuleFilename(browser):
    file_name = browser               # e.g. "tf_logs/save_module"
    filenames = os.listdir(file_name)
    best_file = ""
    best_epoch = -1
    for name in filenames:
        match = re.search(r"\d+", name)   # the epoch number in "ckpt_best_<n>.pth"
        if match and int(match.group()) > best_epoch:
            best_epoch = int(match.group())
            best_file = name
    print(best_file)
    return best_file                  # "" when the folder holds no checkpoint

tensor = torch.randn(3, 3)
bTensor = type(tensor) == torch.Tensor
print(bTensor)
print("tensor is on ", tensor.device)
# move the data to the GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
if torch.cuda.is_available():
    tensor = tensor.to(device)
    print("tensor is on ", tensor.device)
# move the data back to the CPU (compare device.type; a torch.device never equals a plain string)
if tensor.device.type == 'cuda':
    tensor = tensor.to(torch.device("cpu"))
    print("tensor is on", tensor.device)
if tensor.device.type == 'cpu' and torch.cuda.is_available():
    tensor = tensor.to(torch.device("cuda:0"))
    print("tensor is on", tensor.device)


trainning_data = datasets.MNIST(root="data", train=True, transform=ToTensor(), download=True)
print(len(trainning_data))
test_data = datasets.MNIST(root="data", train=False, transform=ToTensor(), download=False)  # train=False: use the real test split

train_loader = DataLoader(trainning_data, batch_size=64,shuffle=True)
test_loader = DataLoader(test_data, batch_size=64,shuffle=True)

print(len(train_loader))    # number of batches
print(len(trainning_data))  # total number of images
# for x, y in train_loader:
#     print(x.shape)
#     print(y.shape)


class MinistNet(nn.Module):
    def __init__(self):
        super().__init__()
        # self.flat = nn.Flatten()
        self.conv1 = nn.Conv2d(1,1,3,1,1)
        self.hideLayer1 = nn.Linear(28*28,256)
        self.hideLayer2 = nn.Linear(256,10)
    def forward(self,x):
        x = self.conv1(x)
        x = x.view(-1,28*28)
        x = self.hideLayer1(x)
        x = torch.sigmoid(x)
        x = self.hideLayer2(x)
        # x = nn.Sigmoid(x)
        return x

model = MinistNet()
model = model.to(device)
cuda = next(model.parameters()).device
print(model)
criterion = nn.CrossEntropyLoss()
optimer = torch.optim.RMSprop(model.parameters(), lr=0.001)

scheduler_1 = LambdaLR(optimer, lr_lambda=lambda epoch: 1/(epoch+1))

num_epoches = 10
min_loss_val = 100000   # best (lowest) test loss so far; train() updates it via `global`
Resume = False

def train():
    global min_loss_val
    start_epoch = -1
    if not Resume:
        start_epoch = 0
    else:
        # find the .pth file with the largest epoch number
        path_checkpoint = r'tf_logs/' + "save_module"
        best_path_checkpoint = getBestModuleFilename(path_checkpoint)
        if best_path_checkpoint == "":
            return
        else:
            # load the file itself, not the folder path
            checkpointResume = torch.load(os.path.join(path_checkpoint, best_path_checkpoint))
            start_epoch = checkpointResume["epoch"] + 1   # resume from the epoch after the saved one
            model.load_state_dict(checkpointResume["net"])
            optimer.load_state_dict(checkpointResume["optimizer"])
            scheduler_1.load_state_dict(checkpointResume["lr_schedule"])

    train_losses = []
    train_acces = []
    eval_losses = []
    eval_acces = []
    # training
    tensorboard_ind = 0
    for epoch in range(start_epoch, num_epoches):   # start from start_epoch so a resumed run continues
        model.train()   # switch back to train mode every epoch (model.eval() is called below)
        batchsizeNum = 0
        train_loss = 0
        train_acc = 0
        train_correct = 0
        for x,y in train_loader:
            # print(epoch)
            # print(x.shape)
            # print(y.shape)
            x = x.to(device)   # use the chosen device instead of hard-coding 'cuda'
            y = y.to(device)
            bte = type(x)==torch.Tensor
            bte1 = type(y)==torch.Tensor
            A = x.device
            B = y.device
            pred_y = model(x)
            loss = criterion(pred_y,y)
            optimer.zero_grad()
            loss.backward()
            optimer.step()
            loss_val = loss.item()
            batchsizeNum = batchsizeNum +1
            train_acc += (pred_y.argmax(1) == y).type(torch.float).sum().item()
            train_loss += loss.item()
            tensorboard_ind += 1
        train_losses.append(train_loss / len(train_loader))   # average loss per batch
        train_acces.append(train_acc / len(trainning_data))   # accuracy over all samples


        # evaluation
        test_loss_value = 0
        model.eval()
        with torch.no_grad():
            num_batch = len(test_loader)   # number of batches, to average the per-batch losses
            numSize = len(test_data)       # number of samples, for the accuracy
            test_loss, test_correct = 0, 0
            for x,y in test_loader:
                x = x.to(device)
                y = y.to(device)
                pred_y = model(x)
                test_loss += criterion(pred_y, y).item()
                test_correct += (pred_y.argmax(1) == y).type(torch.float).sum().item()
            test_loss /= num_batch
            test_correct /= numSize
            eval_losses.append(test_loss)
            eval_acces.append(test_correct)
            test_loss_value = test_loss
            print("test result:",100 * test_correct,"%  avg loss:",test_loss)
        scheduler_1.step()
        # checkpoint: keep the model with the lowest test loss so far
        if epoch > int(num_epoches/3) and test_loss_value < min_loss_val:
            min_loss_val = test_loss_value
            checkpoint = {"epoch": epoch,
                          "net": model.state_dict(),
                          "optimizer": optimer.state_dict(),
                          "lr_schedule": scheduler_1.state_dict()}

            if not os.path.isdir(r'tf_logs/' + "save_module"):
                os.makedirs("tf_logs/" + "save_module")
            PATH = r'tf_logs/' + "save_module" + "/ckpt_best_%s.pth" % (str(epoch + 1))
            torch.save(checkpoint, PATH)

# Press the green button in the gutter to run the script.

if __name__ == '__main__':
    train()
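
To exercise the resume path, run the script once with Resume = False so that checkpoints accumulate under tf_logs/save_module, then set Resume = True and run it again; training should pick up from the epoch after the best saved checkpoint.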

3. Run results
