使用GPU训练示例
问题说明:
此代码虽然可以运行,但据我观察,每轮保存模型时都会一点点占用缓存,以至于我的电脑数次崩溃死机。
解决方案:
我使用的方法是:每轮保存新模型后,删除上一轮保存的模型文件,这样缓存占用就能保持稳定,不会持续泄漏而导致电脑崩溃卡死。当然,一直训练下去必然会出现过拟合,所以应当只保存最好的模型,即在loss值最小时保存。但这些方法并未应用到下面的代码中。
以下代码作为示例,运行2轮看看即可。
GPU调用语句(.to(device))的放置位置:开头创建device之后,依次用于模型、损失函数,以及训练与测试时的数据(图像、标签)。
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from model import Tudui
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# CIFAR10 train and test splits stored under ./dataset (downloaded when
# requested by download=True), with every image converted by ToTensor().
train_data = torchvision.datasets.CIFAR10(
    "./dataset",
    transform=torchvision.transforms.ToTensor(),
    train=True,
    download=True,
)
test_data = torchvision.datasets.CIFAR10(
    "./dataset",
    transform=torchvision.transforms.ToTensor(),
    train=False,
    download=True,
)

# Dataset sizes; the training loop uses them as accuracy denominators.
train_len = len(train_data)
val_len = len(test_data)
print("训练数据集合{} = 50000".format(train_len))
print("测试数据集合{} = 10000".format(val_len))

# Both loaders shuffle, load in the main process (num_workers=0), use
# mini-batches of two samples and drop a trailing incomplete batch.
train_loader = DataLoader(
    train_data,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)
test_loader = DataLoader(
    test_data,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)

# Model and loss function are created and moved to the selected device.
tudui = Tudui().to(device)
loss_fn = nn.CrossEntropyLoss().to(device)

# Plain SGD over all model parameters.
learning_rate = 1e-4
optimizer = torch.optim.SGD(tudui.parameters(), lr=learning_rate)

# Global step counters: `train` counts optimizer steps, `val` counts
# finished evaluation passes. `epoch` is the number of epochs to run.
train = val = 0
epoch = 1000

# TensorBoard event files are written to the "logs" directory.
writer = SummaryWriter("logs")
# Main loop: for each epoch run one training pass and one evaluation pass,
# log scalars to TensorBoard, and write a checkpoint.
#
# Reads and updates the module-level step counters `train` (optimizer steps)
# and `val` (finished evaluation passes). The writer is closed in `finally`
# so already-logged events are flushed even if the run is interrupted.
try:
    for i in range(epoch):
        print()
        print("第{}轮训练开始".format(i + 1))

        # ---------------- training pass ----------------
        tudui.train(mode=True)
        train_correct = 0
        train_seen = 0
        # Loss of the most recent mini-batch; stays NaN only if the loader
        # yields no batches at all.
        loss_value = float("nan")
        for imgs, targets in train_loader:
            imgs = imgs.to(device)
            targets = targets.to(device)

            outputs = tudui(imgs)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train += 1
            # Convert to a plain Python float so that log lines and
            # TensorBoard receive a number rather than a tensor object.
            loss_value = loss.item()
            print("\r训练次数:{},Loss:{}".format(train, loss_value), end="")

            train_correct += (outputs.argmax(1) == targets).sum()
            train_seen += targets.size(0)

            if train % 4000 == 0:
                print("训练次数:{},Loss:{}".format(train, loss_value))
                writer.add_scalar("train", loss_value, train)
        print()
        # "Loss" here is the last mini-batch's loss. Accuracy is divided by
        # the number of samples actually processed, which stays correct when
        # the loader drops an incomplete final batch.
        print("Loss:{}, 准确率:{}".format(
            loss_value, int(train_correct) / max(train_seen, 1)))

        # ---------------- evaluation pass ----------------
        tudui.eval()
        total_test_loss = 0.0
        val_correct = 0
        val_seen = 0
        with torch.no_grad():
            for imgs, targets in test_loader:
                imgs = imgs.to(device)
                targets = targets.to(device)

                outputs = tudui(imgs)
                total_test_loss += loss_fn(outputs, targets).item()

                val_correct += (outputs.argmax(1) == targets).sum()
                val_seen += targets.size(0)
                print("\r测试集的Loss:{}".format(total_test_loss), end="")
        print()
        print("整体测试集的Loss:{}, 准确率{}".format(
            total_test_loss, int(val_correct) / max(val_seen, 1)))
        # Log the loss accumulated over the whole evaluation pass, not just
        # the loss of the final batch.
        writer.add_scalar("val", total_test_loss, val)
        val += 1

        # NOTE: this pickles the entire model object into a new file every
        # epoch, so disk usage grows linearly with the number of epochs.
        torch.save(tudui, "tudui_{}.pth".format(i))
        print("模型已保存")
finally:
    writer.close()