摘要
GoogLeNet网络的创新点是在VGG和NIN网络的基础上提出了Inception模块。
Inception模块由四条并行路径组成:第一条路径使用1×1卷积;第二条路径先用1×1卷积减少通道数,再进行3×3卷积;第三条路径先用1×1卷积减少通道数,再进行5×5卷积;第四条路径先进行3×3最大池化,再进行1×1卷积。四条路径都通过适当的填充保持特征图的高和宽不变,最后在通道维上连结输出。在b3、b4这两个由多个Inception块串联而成的模块末尾,各添加一个3×3、步幅为2的最大池化层。
解决的问题是通过Inception模块提取不同尺度的特征图,并将它们融合起来,这样就能有效地识别不同范围的图像细节。
5.9.1 导入相关库
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
sys.path.append("..")
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the PyTorch version and the selected device.
print(torch.__version__)
print(device)
5.9.2 Inception块
Inception类继承自nn.Module,调用实例(例如blk(X))时会自动执行其forward函数。
class Inception(nn.Module):
    """Inception block: four parallel branches joined along the channel axis.

    Every branch uses stride 1 with padding chosen so that the height and
    width of the input are preserved; only the channel count changes.

    Args:
        in_c: number of input channels.
        c1: output channels of branch 1 (a single 1x1 convolution).
        c2: indexable pair; ``c2[0]`` is the width of the 1x1 reduction and
            ``c2[1]`` the output channels of the following 3x3 convolution.
        c3: indexable pair; ``c3[0]`` is the width of the 1x1 reduction and
            ``c3[1]`` the output channels of the following 5x5 convolution.
        c4: output channels of branch 4 (3x3 max pool, then 1x1 convolution).
    """

    def __init__(self, in_c, c1, c2, c3, c4):
        super().__init__()
        # Branch 1: 1x1 convolution only.
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        # Branch 2: 1x1 convolution followed by a 3x3 convolution.
        self.p2_1 = nn.Conv2d(in_c, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Branch 3: 1x1 convolution followed by a 5x5 convolution.
        self.p3_1 = nn.Conv2d(in_c, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Branch 4: 3x3 max pooling followed by a 1x1 convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate their outputs on dim 1."""
        branch1 = F.relu(self.p1_1(x))

        reduced_3x3 = F.relu(self.p2_1(x))
        branch2 = F.relu(self.p2_2(reduced_3x3))

        reduced_5x5 = F.relu(self.p3_1(x))
        branch3 = F.relu(self.p3_2(reduced_5x5))

        pooled = self.p4_1(x)
        branch4 = F.relu(self.p4_2(pooled))

        # Concatenate along the channel dimension.
        return torch.cat((branch1, branch2, branch3, branch4), dim=1)
5.9.3 GoogLeNet模型
GoogLeNet架构
下面的b3、b4、b5是三个由多个Inception块串联而成的模块,各Inception块的通道数配置不同。
# Stage 1: 7x7 stride-2 convolution on a single-channel input, ReLU, then a
# 3x3 stride-2 max pool.
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 2: 1x1 convolution, then a 3x3 convolution that widens the features
# to 192 channels, then a 3x3 stride-2 max pool.
# Each convolution is followed by a ReLU: without a non-linearity in between,
# the two stacked convolutions would collapse into a single linear map.
# NOTE(review): inserting the ReLU layers shifts the Sequential indices of the
# second convolution (1 -> 2), so state_dict keys saved from the previous
# layout will not load without remapping.
b2 = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(64, 192, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Stage 3: two Inception blocks (192 -> 256 -> 480 channels) followed by a
# 3x3 stride-2 max pool.
b3 = nn.Sequential(
    Inception(in_c=192, c1=64, c2=(96, 128), c3=(16, 32), c4=32),
    Inception(in_c=256, c1=128, c2=(128, 192), c3=(32, 96), c4=64),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
class GlobalAvgPool2d(nn.Module):
    """Global average pooling over all dimensions after the first two.

    Implemented as an average pool whose window equals the input's trailing
    (height, width) size, so each channel is reduced to a single value.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Average ``x`` over its trailing spatial dimensions."""
        spatial_size = x.shape[2:]
        return F.avg_pool2d(x, kernel_size=spatial_size)
b5模块与NIN类似,在最后添加了全局平均池化层,将每个通道的高和宽变为1;nn.Flatten()默认保留第0维(批量维),把第1维到最后一维展平成一维,因此输出形状为(批量大小, 通道数)。
# Stage 4: five Inception blocks (480 -> 512 -> 512 -> 512 -> 528 -> 832
# channels) followed by a 3x3 stride-2 max pool. Each tuple below is one
# block's (in_c, c1, c2, c3, c4) configuration, built in order.
b4 = nn.Sequential(
    *(
        Inception(in_c, c1, c2, c3, c4)
        for in_c, c1, c2, c3, c4 in (
            (480, 192, (96, 208), (16, 48), 64),
            (512, 160, (112, 224), (24, 64), 64),
            (512, 128, (128, 256), (24, 64), 64),
            (512, 112, (144, 288), (32, 64), 64),
            (528, 256, (160, 320), (32, 128), 128),
        )
    ),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 5: two Inception blocks (832 -> 832 -> 1024 channels), global average
# pooling, then flattening.
b5 = nn.Sequential(
    Inception(in_c=832, c1=256, c2=(160, 320), c3=(32, 128), c4=128),
    Inception(in_c=832, c1=384, c2=(192, 384), c3=(48, 128), c4=128),
    GlobalAvgPool2d(),
    nn.Flatten(),
)
测试GoogLeNet网络每个小模块的输出
b3由2个Inception组成,b4由5个Inception组成,b5由2个Inception组成
# Full network: the five stages followed by a 10-way linear classifier on the
# 1024 channels produced by b5.
net = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.Linear(1024, 10),
)

# Push one random 1x1x96x96 input through the network stage by stage and
# print the output shape after each stage.
X = torch.rand(1, 1, 96, 96)
for blk in net:
    X = blk(X)
    print('output shape: ', X.shape)
下面是我自己画的b1、b2、b3的流程图
5.9.4 获取数据和训练模型
def load_data_fashion_mnist(batch_size, resize=None, root='~/Datasets/FashionMNIST'):
    """Download Fashion-MNIST if needed and return train/test data loaders.

    Args:
        batch_size: number of samples per batch for both loaders.
        resize: optional target size handed to ``torchvision.transforms.Resize``
            before the image is converted to a tensor; when falsy the images
            keep their native size.
        root: directory where the dataset is stored / downloaded.

    Returns:
        A ``(train_iter, test_iter)`` pair of ``DataLoader`` objects. The
        training loader shuffles its data; the test loader does not.
    """
    # Build the preprocessing pipeline. The resize step was previously
    # commented out, so a caller-supplied `resize` was silently ignored.
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(trans)

    mnist_train = torchvision.datasets.FashionMNIST(
        root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(
        root=root, train=False, download=True, transform=transform)

    # On Windows, use no extra worker processes for data loading; elsewhere use 4.
    num_workers = 0 if sys.platform.startswith('win') else 4

    train_iter = torch.utils.data.DataLoader(
        mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(
        mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter
计算测试数据集的准确率
def evaluate_accuracy(data_iter, net, device=None):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: iterable of ``(X, y)`` batches, where ``y`` holds the
            target class index of every sample in ``X``.
        net: either a ``torch.nn.Module`` or a plain callable. A plain
            callable is invoked on the batch as-is (no device transfer) and
            receives ``is_training=False`` when its code object lists a
            variable with that name.
        device: device used for the forward pass of a ``torch.nn.Module``.
            Defaults to the device of the module's first parameter, or the
            CPU when the module has no parameters.

    Returns:
        The fraction of correctly classified samples as a float, or ``0.0``
        when ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)
    takes_flag = False
    was_training = False
    if is_module:
        if device is None:
            # Read the device from the first parameter without materialising
            # the whole parameter list.
            first_param = next(net.parameters(), None)
            device = first_param.device if first_param is not None else torch.device('cpu')
        # Switch to evaluation mode once (this disables dropout) and remember
        # the caller's mode so it can be restored afterwards.
        was_training = net.training
        net.eval()
    else:
        # Loop-invariant: does the custom model take an `is_training` flag?
        code = getattr(net, '__code__', None)
        takes_flag = code is not None and 'is_training' in code.co_varnames

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    predicted = net(X.to(device)).argmax(dim=1)
                    acc_sum += (predicted == y.to(device)).float().sum().cpu().item()
                elif takes_flag:
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            # Restore whichever mode the module was in before evaluation.
            net.train(was_training)

    return acc_sum / n if n else 0.0
def train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs):
    """Train ``net`` with cross-entropy loss and report metrics once per epoch.

    Args:
        net: model to train; it is moved to ``device`` first.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: accepted for signature compatibility; not read here.
        optimizer: optimizer whose ``zero_grad``/``step`` drive the updates.
        device: device the model and each batch are moved to.
        num_epochs: number of passes over ``train_iter``.
    """
    net = net.to(device)
    print("training on ", device)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        loss_total = 0.0
        correct_total = 0.0
        seen = 0
        num_batches = 0
        epoch_start = time.time()

        for features, labels in train_iter:
            features = features.to(device)
            labels = labels.to(device)

            logits = net(features)
            batch_loss = criterion(logits, labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Accumulate the running statistics as Python numbers.
            loss_total += batch_loss.cpu().item()
            correct_total += (logits.argmax(dim=1) == labels).sum().cpu().item()
            seen += labels.shape[0]
            num_batches += 1

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, loss_total / num_batches, correct_total / seen,
                 test_acc, time.time() - epoch_start))
进行训练
# If an "out of memory" error is reported, reduce batch_size or resize.
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size=batch_size, resize=96)

# Optimisation settings: Adam over all parameters of the network.
lr = 0.001
num_epochs = 10
optimizer = optim.Adam(net.parameters(), lr=lr)

train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
5.9.5 观察模型计算量、参数量、最大显存
from thop import profile

# Use thop to count FLOPs and parameters for a single 1x1x96x96 input that is
# created on the CPU and then moved to the selected device.
X = torch.randn(1, 1, 96, 96).to(device)
flops, params = profile(net, inputs=(X,))
print(f"FLOPs: {flops}")
print(f"参数量: {params}")

# Peak memory figure reported by torch.cuda.max_memory_allocated(), divided
# by 1024**2 for display in MB.
max_memory_allocated = torch.cuda.max_memory_allocated()
print(f"Max Memory Allocated: {max_memory_allocated / 1024**2:.2f} MB")
总结
GoogLeNet模型减少了显存的占用率,更有效识别不同范围的图像细节,同时为不同的滤波器分配不同的参数。
与之前的模型相比,训练集和测试集上准确率高,训练模型时间较短且参数量适中。
AlexNet与GoogLeNet网络比较:
- AlexNet网络结构
- GoogLeNet网络结构
对比两个网络可以看出,在前两个卷积池化中,GoogLeNet使用更小的卷积核并且添加了一个1×1卷积核用来降低通道数量,通过这种方法不仅能降低模型的参数量,还能进一步提取更抽象的特征。GoogLeNet网络中的Inception模块汇聚了1×1卷积核、3×3卷积核、5×5卷积核和最大池化等,发挥这些方法的优点得到包含各个方面的特征,再将这些方法得到的特征图叠加起来。至于为什么使用多个Inception模块,以及为什么每个Inception模块中4个通路的通道数是那样设计的,这可能是作者通过大量实验得出来的。