5.9--GoogLeNet

LL.。

已于 2024-09-26 15:26:01 修改

阅读量498

点赞数 5

分类专栏：动手学深度学习文章标签：深度学习 cnn 人工智能

于 2024-09-17 15:59:12 首次发布

本文链接：https://blog.csdn.net/qq_53243414/article/details/142314914

版权

动手学深度学习专栏收录该内容

16 篇文章 0 订阅

订阅专栏

摘要

GoogLeNet网络的创新点是在VGG和NiN网络的基础上提出了Inception模块。
在这里插入图片描述
Inception模块由四条并行路径组成：第一条路径只使用1×1卷积；第二条路径先用1×1卷积减少通道数，再使用3×3卷积；第三条路径先用1×1卷积减少通道数，再使用5×5卷积；第四条路径先做3×3最大池化，再进行1×1卷积。四条路径都通过合适的填充使输出特征图的高和宽与输入相同，最后在通道维上拼接。在GoogLeNet中，每组（若干个）Inception块之后添加一个3×3、步幅为2的最大池化层。
解决的问题是通过Inception模块提取不同尺度的特征图，并将它们融合起来，这样就能有效地识别不同范围的图像细节。

5.9.1 导入相关库

import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
# Add ".." (resolved against the current working directory) to the import search path.
sys.path.append("..")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(torch.__version__)
print(device)

在这里插入图片描述

5.9.2 Inception块

Inception继承自nn.Module；像函数一样调用其实例（例如blk(x)）时，nn.Module的__call__会自动调用forward函数。

class Inception(nn.Module):
    """Inception block: four parallel branches concatenated along the channel axis.

    Args:
        in_c: number of input channels.
        c1: output channels of branch 1 (a single 1x1 convolution).
        c2: sequence whose items 0 and 1 are the output channels of the 1x1
            and the 3x3 convolution of branch 2.
        c3: sequence whose items 0 and 1 are the output channels of the 1x1
            and the 5x5 convolution of branch 3.
        c4: output channels of the 1x1 convolution that follows the 3x3 max
            pooling in branch 4.
    """

    def __init__(self, in_c, c1, c2, c3, c4):
        super().__init__()
        mid2, out2 = c2[0], c2[1]
        mid3, out3 = c3[0], c3[1]
        # Branch 1: 1x1 convolution only.
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        # Branch 2: 1x1 convolution, then 3x3 convolution (padding keeps H and W).
        self.p2_1 = nn.Conv2d(in_c, mid2, kernel_size=1)
        self.p2_2 = nn.Conv2d(mid2, out2, kernel_size=3, padding=1)
        # Branch 3: 1x1 convolution, then 5x5 convolution (padding keeps H and W).
        self.p3_1 = nn.Conv2d(in_c, mid3, kernel_size=1)
        self.p3_2 = nn.Conv2d(mid3, out3, kernel_size=5, padding=2)
        # Branch 4: 3x3 max pooling with stride 1, then 1x1 convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate their outputs on dim 1."""
        branch1 = F.relu(self.p1_1(x))

        hidden2 = F.relu(self.p2_1(x))
        branch2 = F.relu(self.p2_2(hidden2))

        hidden3 = F.relu(self.p3_1(x))
        branch3 = F.relu(self.p3_2(hidden3))

        # No activation between the pooling and the 1x1 convolution.
        pooled = self.p4_1(x)
        branch4 = F.relu(self.p4_2(pooled))

        # Concatenate along the channel dimension.
        return torch.cat((branch1, branch2, branch3, branch4), dim=1)

5.9.3 GoogleNet模型

GoogLeNet架构
在这里插入图片描述

下面的b3、b4、b5是三个由多个Inception块串联而成的模块，其中各Inception块的通道数配置各不相同。

# Stage 1: 7x7 stride-2 convolution on a single-channel input (64 output
# channels), ReLU, then 3x3 stride-2 max pooling.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 2: 1x1 convolution (64 -> 64), 3x3 convolution (64 -> 192), then 3x3
# stride-2 max pooling.
# Each convolution is followed by a ReLU: without a non-linearity in between,
# the two stacked convolutions collapse into a single linear map.
# NOTE: the ReLU layers have no parameters, but they shift the positional
# indices inside this Sequential (the 3x3 convolution is now b2[2], not b2[1]).
b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1),
                   nn.ReLU(),
                   nn.Conv2d(64, 192, kernel_size=3, padding=1),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

# Stage 3: two Inception blocks (192 -> 256 -> 480 channels), then 3x3
# stride-2 max pooling.
b3 = nn.Sequential(
    Inception(in_c=192, c1=64, c2=(96, 128), c3=(16, 32), c4=32),
    Inception(in_c=256, c1=128, c2=(128, 192), c3=(32, 96), c4=64),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

class GlobalAvgPool2d(nn.Module):
    """Global average pooling over all dimensions after the first two."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Using the input's own trailing size as the pooling window averages
        # each channel down to a single value.
        window = x.shape[2:]
        return F.avg_pool2d(x, kernel_size=window)

b5模块与NiN类似，在最后添加了全局平均池化层，将每个通道的高和宽变为1；随后的nn.Flatten()默认从第1维（通道维）展平到最后一维，并保留第0维（批量维），因此b5的输出形状为(批量大小, 1024)。

# Stage 4: five Inception blocks (480 -> 512 -> 512 -> 512 -> 528 -> 832
# channels), then 3x3 stride-2 max pooling.
b4 = nn.Sequential(
    Inception(in_c=480, c1=192, c2=(96, 208), c3=(16, 48), c4=64),
    Inception(in_c=512, c1=160, c2=(112, 224), c3=(24, 64), c4=64),
    Inception(in_c=512, c1=128, c2=(128, 256), c3=(24, 64), c4=64),
    Inception(in_c=512, c1=112, c2=(144, 288), c3=(32, 64), c4=64),
    Inception(in_c=528, c1=256, c2=(160, 320), c3=(32, 128), c4=128),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 5: two Inception blocks (832 -> 832 -> 1024 channels), global average
# pooling, then flattening to one feature vector per sample.
b5 = nn.Sequential(
    Inception(in_c=832, c1=256, c2=(160, 320), c3=(32, 128), c4=128),
    Inception(in_c=832, c1=384, c2=(192, 384), c3=(48, 128), c4=128),
    GlobalAvgPool2d(),
    nn.Flatten(),
)

测试GoogleNet网络每个小模块的输出
b3由2个Inception组成，b4由5个Inception组成，b5由2个Inception组成

# Full network: the five stages followed by a 10-way linear classifier
# (the last Inception block in b5 outputs 384 + 384 + 128 + 128 = 1024 channels).
net = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.Linear(1024, 10),
)

# Feed one random 1x96x96 sample through the network stage by stage and
# print the output shape after each stage.
X = torch.rand((1, 1, 96, 96))
for blk in net:
    X = blk(X)
    print('output shape: ', X.shape)

在这里插入图片描述
下面是我自己画的b1、b2、b3的流程图

5.9.4 获取数据和训练模型

def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Download Fashion-MNIST if necessary and return train/test data loaders.

    Args:
        batch_size: number of samples per mini-batch for both loaders.
        resize: optional size handed to ``torchvision.transforms.Resize`` and
            applied before the images are converted to tensors. ``None`` (or
            any falsy value) keeps the images at their original size.
        root: directory in which the dataset is stored / downloaded.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects; only
        the training loader shuffles its data.
    """
    # Honour `resize`: previously the argument was accepted but ignored, so a
    # caller asking for e.g. resize=96 silently got un-resized images.
    steps = []
    if resize:
        steps.append(torchvision.transforms.Resize(size=resize))
    steps.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(steps)

    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, download=True, transform=transform)

    # On Windows use 0 workers, i.e. no extra processes for loading data.
    num_workers = 0 if sys.platform.startswith('win') else 4

    train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_iter, test_iter

计算测试数据集的准确率

def evaluate_accuracy(data_iter, net, device=None):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches, where ``y`` holds the
            class indices compared against ``argmax(dim=1)`` of the output.
        net: either a ``torch.nn.Module`` or a plain callable. A plain
            callable with a parameter named ``is_training`` is called with
            ``is_training=False``; this path does not move data to a device.
        device: device the batches are moved to when ``net`` is a module.
            Defaults to the device of the module's first parameter.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)
    if device is None and is_module:
        # If no device is given, use the device the model lives on.
        device = next(net.parameters()).device

    if is_module:
        # Switch to evaluation mode once (this disables dropout) instead of
        # toggling on every batch, and remember the caller's mode so it can be
        # restored rather than unconditionally forcing training mode.
        was_training = net.training
        net.eval()
        takes_flag = False
    else:
        # Hand-written model: check once whether it accepts `is_training`.
        code = getattr(net, '__code__', None)
        takes_flag = code is not None and 'is_training' in code.co_varnames

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    y_hat = net(X.to(device))
                    acc_sum += (y_hat.argmax(dim=1) == y.to(device)).float().sum().item()
                elif takes_flag:
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    """Train ``net`` with cross-entropy loss and print statistics after every epoch.

    Args:
        net: model to train; it is moved to ``device`` first.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: not used inside this function.
        optimizer: optimizer that updates the parameters of ``net``.
        device: device on which the model and the batches are placed.
        num_epochs: number of passes over ``train_iter``.
    """
    net = net.to(device)
    print("training on ", device)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        epoch_start = time.time()
        loss_total = 0.0
        correct_total = 0.0
        sample_count = 0
        batch_total = 0
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            logits = net(X)
            batch_loss = criterion(logits, y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Accumulate running statistics as plain Python numbers.
            loss_total += batch_loss.cpu().item()
            correct_total += (logits.argmax(dim=1) == y).sum().cpu().item()
            sample_count += y.shape[0]
            batch_total += 1

        test_acc = evaluate_accuracy(test_iter, net)
        mean_loss = loss_total / batch_total
        train_acc = correct_total / sample_count
        elapsed = time.time() - epoch_start
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, mean_loss, train_acc, test_acc, elapsed))

进行训练

# If an "out of memory" error is raised, reduce batch_size or resize.
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size, resize=96)

# Adam with a fixed learning rate for 10 epochs.
lr = 0.001
num_epochs = 10
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

在这里插入图片描述

5.9.5 观察模型计算量、参数量、最大显存

from thop import profile
# Use thop to measure the compute cost and parameter count of `net` for a
# single 1x96x96 input placed on `device`.
# NOTE(review): thop's own examples name the first return value `macs`;
# confirm whether the number printed as "FLOPs" is actually a MAC count.
X = torch.randn(1, 1, 96, 96)
X = X.to(device)
flops, params = profile(net, inputs=(X,))

print(f"FLOPs: {flops}")
print(f"参数量: {params}")

# `device` may be the CPU, where there are no CUDA allocator statistics, so
# only query them when actually running on a GPU and report 0 otherwise.
# NOTE(review): per the PyTorch docs this is the peak since program start, so
# it presumably reflects any earlier training run too, not only the profiling
# pass above - confirm.
if device.type == 'cuda':
    max_memory_allocated = torch.cuda.max_memory_allocated()
else:
    max_memory_allocated = 0
print(f"Max Memory Allocated: {max_memory_allocated / 1024**2:.2f} MB")

在这里插入图片描述

总结

GoogLeNet模型减少了显存的占用率，能更有效地识别不同范围的图像细节，同时可以为不同尺寸的滤波器分配不同数量的参数（即不同的通道数）。
与之前的模型相比，训练集和测试集上准确率高，训练模型时间较短且参数量适中。

AlexNet与GoogLeNet网络比较：

AlexNet网络结构
GoogLeNet网络结构

对比两个网络可以看出，在前两个卷积池化阶段中，GoogLeNet使用了更小的卷积核，并且添加了一个1×1卷积层用来降低通道数量，这种方法不仅能降低模型的参数量，还能进一步提取更抽象的特征。GoogLeNet网络中的Inception模块汇聚了1×1卷积、3×3卷积、5×5卷积和最大池化等操作，发挥这些方法各自的优点，得到包含各个方面的特征，再将这些方法得到的特征图在通道维上拼接起来。至于为什么使用多个Inception模块，以及每个Inception模块中4条通路的通道数为什么那样设计，这可能是作者通过大量实验得出来的。