Mnist手写数据集的模型搭建及GPU训练(基于pytorch)

1. 用意

pytorch看了有一段时间了,可是怎么熟练操作pytorch呢,于是就用手写数据集来办事,键盘滚起来!
四颗GPU训练
两个类完成训练

代码直接copy过去,立即跑通,但电脑上必须有GPU
代码直接copy过去,立即跑通,但电脑上必须有GPU
代码直接copy过去,立即跑通,但电脑上必须有GPU

2.结果展示

2.1 四颗GPU
在这里插入图片描述

2.2 训练过程
在这里插入图片描述
在这里插入图片描述

2.3 训练结果
在这里插入图片描述

3.代码如下

import os
import random
import torch
import numpy as np
import torch.nn as nn
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(  # 输入图片 shape (1, 28, 28)
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=5,
                stride=1,
                padding=2,
            ),  # 卷积后的shape (16, 28, 28)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),   # 池化之后的shape (16, 14, 14)
        )
        self.conv2 = nn.Sequential(  # input shape (16, 14, 14)
            nn.Conv2d(16, 32, 5, 1, 2),  # output shape (32, 14, 14)
            nn.ReLU(),
            nn.MaxPool2d(2),  # output shape (32, 7, 7)
        )
        self.out = nn.Linear(32 * 7 * 7, 10)  # 全连接层,输出为10

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten the output of conv2 to (batch_size, 32 * 7 * 7)
        output = self.out(x)
        return output


class TrainMnistDataset:
    def __init__(self, epoch=1000, lr=0.001,  batch_size=50, data_root_dir="./datasets/", model_save_dir="./model_save_dir",
                 pretraind_model_path=False):
        self.data_root_dir = data_root_dir
        self.batch_size = batch_size
        self.epoch = epoch
        self.lr = lr
        self.model_save_dir = model_save_dir
        self.pretraind_model_path = pretraind_model_path

        if not os.path.exists(self.data_root_dir):
            os.mkdir(self.data_root_dir)
        if not os.path.exists(self.model_save_dir):
            os.mkdir(self.model_save_dir)

        self.seed_set()
        os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'

    def seed_set(self, seed=1):
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    def download_mnist_set(self):
        download_mnist = False
        if not (os.path.exists(self.data_root_dir)) or not os.listdir(self.data_root_dir):
            # not mnist dir or mnist is empyt dir
            download_mnist = True
        return download_mnist

    def plot_image(self, image, label):
        plt.imshow(image, cmap='gray')
        plt.title('%i' % label)
        plt.savefig("mnist-1.jpg")
        plt.show()

    def get_mnist_dataset(self):
        download_mnist_set = self.download_mnist_set()
        train_data = torchvision.datasets.MNIST(root=self.data_root_dir, train=True,
            transform=torchvision.transforms.ToTensor(),download=download_mnist_set)
        val_data   = torchvision.datasets.MNIST(root=self.data_root_dir, train=False,
            transform=torchvision.transforms.ToTensor())
        print("训练集图片和标签的shape: ", train_data.data.size(), train_data.targets.size())
        # torch.Size([60000, 28, 28]), torch.Size([60000])
        print("训练集图片和标签的shape: ", val_data.data.size(), val_data.targets.size())
        # torch.Size([10000, 28, 28]) torch.Size([10000])
        # self.plot_image(train_data.data[0].numpy(), train_data.targets[0])
        return train_data, val_data

    def pretrained_model_load(self, model):
        print('Loading weights into state dict...')
        model_dict = model.state_dict()
        pretrained_dict = torch.load(self.pretraind_model_path)
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k])==np.shape(v)}
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)
        return model

    def model_load(self):
        model = CNN()
        print(model)
        return model

    def save_train_information(self, message):
        path = os.path.join(self.model_save_dir, "train_info.txt")
        with open(path, 'a') as f:
            f.write(message)
            f.write("\n")

    def fit_one_epoch(self, model, train_loader, val_loader, epoch_size_train, epoch_size_val,
                               optimizer, loss_func, epoch_cur):

        # (一) 训练模型
        print("start training")
        model = model.train()  # Sets the module in training mode
        total_loss_train = 0
        for iteration, (images_batch, labels_batch) in enumerate(train_loader):
            # images_batch shape: torch.Size([50, 1, 28, 28]), labels_batch shape: torch.Size([50])
            # 1.1 判断在当前epoch下,是否遍历完训练集图片
            if iteration >= epoch_size_train:
                break

            # 1.2 进行no_grad和cuda加速
            images_batch = images_batch.cuda()
            labels_batch = labels_batch.cuda()
            assert images_batch.requires_grad==False, "images_batch.requires_grad should be set False "
            assert labels_batch.requires_grad==False, "labels_batch.requires_grad should be set False "

            # 1.3 前向传播与loss计算
            output = model(images_batch)  # output shape : (50, 10)
            loss = loss_func(output, labels_batch)  # output shape: (50, 10), labels_batch shape:(50,)

            # 1.4 之前梯度置零、反向传播计算当前梯度,权重更新
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 1.5 相关信息打印
            total_loss_train += loss.data.cpu().numpy()
            if iteration % 50 == 0:
                labels_batch_pred = torch.max(output, 1)[1].data.cpu().numpy()  # (50,)
                accuracy = float((labels_batch_pred == labels_batch.data.cpu().numpy()).astype(int).sum()) / float(self.batch_size)
                print(f"Epoch {(epoch_cur + 1):04d}/{self.epoch:04d} ||"
                      + f" 训练集【{(iteration + 1):04d}/{epoch_size_train:04d}】 ||"
                      + f" total_loss: {(total_loss_train / (iteration + 1)):.4f} ||"
                      + f" accuracy: {accuracy:.4f}")

        # (二) 验证模型
        print('Start Validation')
        total_loss_val = 0
        model = model.eval()  # Sets the module in eval mode
        for iteration, (images_batch, labels_batch) in enumerate(val_loader):
            # 2.1判断在当前epoch下,是否遍历完验证集图片
            if iteration >= epoch_size_val:
                break

            # 2.2 cuda加速和进行no_grad判断
            images_batch = images_batch.cuda()
            labels_batch = labels_batch.cuda()

            # 2.3 前向传播与loss计算
            output = model(images_batch)  # output shape : (50, 10)
            loss = loss_func(output, labels_batch)  # output shape: (50, 10), labels_batch shape:(50,)
            total_loss_val += loss.data.cpu().numpy()

            # 2.4 相关信息打印
            if iteration % 50 == 0:
                labels_batch_pred = torch.max(output, 1)[1].data.cpu().numpy()  # (50,)
                accuracy = float((labels_batch_pred == labels_batch.data.cpu().numpy()).astype(int).sum()) / float(
                    self.batch_size)
                print(f"Epoch {(epoch_cur + 1):04d}/{self.epoch:04d} ||"
                      + f" 验证集【{(iteration + 1):04d}/{epoch_size_val:04d}】 ||"
                      + f" total_loss: {(total_loss_val / (iteration + 1)):.4f} ||"
                      + f" accuracy: {accuracy:.4f}")

        # (三) 模型保存及相关信息保存
        print(f"train loss: {total_loss_train/epoch_size_train: .4f}|| val loss: {total_loss_val/epoch_size_val:.4f} ")
        info = f"Epoch{(epoch_cur+1):04d}-train_Loss{total_loss_train/(epoch_size_train):.4f}-Val_Loss{total_loss_val/(epoch_size_val+1):.4f}.pth"
        torch.save(model.module.state_dict(), os.path.join(self.model_save_dir, info))
        self.save_train_information(info)

    def train(self):
        # 1.获取数据集
        train_data, val_data = self.get_mnist_dataset()
        train_loader = Data.DataLoader(dataset=train_data, batch_size=self.batch_size, shuffle=True)  # (50, 1, 28, 28)
        val_loader = Data.DataLoader(dataset=val_data, batch_size=self.batch_size, shuffle=True)

        # 2.模型加载与GPU加速
        model = self.model_load()
        if self.pretraind_model_path:
            model = self.pretrained_model_load(model)
        model = torch.nn.DataParallel(model).cuda()
        print("GPU加速了...赶快起飞吧..........!!!!!!!!!!   " * 3)

        # 3.设置优化器和损失函数
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
        loss_func = nn.CrossEntropyLoss()

        # 4.训练模型
        epoch_size_train = max(1, train_data.data.size()[0])//self.batch_size
        epoch_size_val   = max(1, val_data.data.size()[0])//self.batch_size
        for epoch_cur in range(self.epoch):
            self.fit_one_epoch(model, train_loader, val_loader, epoch_size_train, epoch_size_val,
                               optimizer, loss_func, epoch_cur)

def run():
    data_root_dir = "/home/panxy/datasets/mnist/"
    trian_mnist = TrainMnistDataset(data_root_dir=data_root_dir)
    trian_mnist.train()

if __name__ == "__main__":
    run()

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
以下是使用 PyTorch 框架基于 MNIST 手写数字识别模型的代码示例: ```python import torch import torch.nn as nn import torch.optim as optim from torchvision import datasets, transforms # 定义模型结构 class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(1, 10, kernel_size=5) self.conv2 = nn.Conv2d(10, 20, kernel_size=5) self.fc1 = nn.Linear(320, 50) self.fc2 = nn.Linear(50, 10) def forward(self, x): x = nn.functional.relu(nn.functional.max_pool2d(self.conv1(x), 2)) x = nn.functional.relu(nn.functional.max_pool2d(self.conv2(x), 2)) x = x.view(-1, 320) x = nn.functional.relu(self.fc1(x)) x = self.fc2(x) return nn.functional.log_softmax(x, dim=1) # 定义训练函数 def train(model, device, train_loader, optimizer, epoch): model.train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = nn.functional.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % 100 == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.dataset), 100. * batch_idx / len(train_loader), loss.item())) # 定义测试函数 def test(model, device, test_loader): model.eval() test_loss = 0 correct = 0 with torch.no_grad(): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) test_loss += nn.functional.nll_loss(output, target, reduction='sum').item() pred = output.argmax(dim=1, keepdim=True) correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format( test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset))) # 设置训练参数 batch_size = 64 test_batch_size = 1000 epochs = 10 lr = 0.01 momentum = 0.5 seed = 1 log_interval = 100 # 检查是否支持 GPU use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # 加载数据集 torch.manual_seed(seed) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ]) train_loader = torch.utils.data.DataLoader( datasets.MNIST('../data', train=True, download=True, transform=transform), batch_size=batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader( datasets.MNIST('../data', train=False, transform=transform), batch_size=test_batch_size, shuffle=True) # 初始化模型和优化器 model = Net().to(device) optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum) # 训练模型 for epoch in range(1, epochs + 1): train(model, device, train_loader, optimizer, epoch) test(model, device, test_loader) ``` 这段代码会下载 MNIST 数据集并进行训练和测试,训练过程中会输出每个 batch 的 loss 值,测试过程会输出准确率。你可以根据需要调整训练参数,例如学习率、批量大小等,以优化模型性能。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值