I. Principle
VGG16 is a deep network of stacked convolutional and fully connected layers, with 16 weight layers in total: 13 convolutional layers and 3 fully connected layers. VGG16 introduced the idea of performing all convolutions with very small 3x3 kernels. This design choice lets the network go deeper, increasing its representational capacity, and by stacking several 3x3 convolutional layers before each pooling layer, VGG16 gains more non-linear transformations and stronger abstraction ability. Its network structure is shown in the figure below:
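To see why stacked 3x3 kernels are attractive, note that two stacked 3x3 convolutions cover the same 5x5 receptive field as a single 5x5 convolution while using fewer weights and inserting an extra non-linearity between them. A minimal arithmetic sketch (the channel count C = 256 is an arbitrary example; biases are ignored):
# Two stacked 3x3 convolutions vs. one 5x5 convolution, C channels in and out.
C = 256
params_3x3_stack = 2 * (3 * 3 * C * C)  # 18 * C^2 weights
params_5x5 = 5 * 5 * C * C              # 25 * C^2 weights
print(params_3x3_stack, params_5x5)     # 1179648 vs. 1638400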
II. Training and Testing
(1) Model fine-tuning:
1. An nn.BatchNorm2d(x) batch-normalization layer is added after every convolutional layer. Batch normalization mitigates internal covariate shift during training, which speeds up convergence and improves the model's stability and accuracy. By normalizing each channel's features to a common mean and variance, all channels end up with similar distributions, which benefits training and generalization.
2. The final three fully connected layers are replaced by a single fully connected layer (see the parameter-count sketch after the structure diagrams below).
3. An nn.AvgPool2d(kernel_size=1, stride=1) average-pooling layer is appended at the end of the feature extractor. Note that with a 1x1 kernel and stride 1 this layer is effectively an identity operation here: after the five max-pooling stages, a 32x32 CIFAR-10 input has already been reduced to a 1x1x512 feature map, so the layer acts as a placeholder where a larger average pool would be used for bigger inputs.
Structure of the original VGG16 model:
Structure of fine-tuned VGG16 model 1:
Structure of fine-tuned VGG16 model 2:
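To make modification 2 concrete, the following sketch compares the parameter counts of the two classifier heads. The original head uses the ImageNet VGG16 layer sizes (a 7x7x512 input to the first fully connected layer); the single-layer head matches the VGG class defined in models.py below.
import torch.nn as nn

# Original VGG16 head (ImageNet configuration): three fully connected layers.
original_head = nn.Sequential(
    nn.Linear(512 * 7 * 7, 4096), nn.ReLU(inplace=True), nn.Dropout(),
    nn.Linear(4096, 4096), nn.ReLU(inplace=True), nn.Dropout(),
    nn.Linear(4096, 1000),
)
# Fine-tuned head for CIFAR-10: one fully connected layer, as in models.py.
tuned_head = nn.Linear(512, 10)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(original_head))  # 123,642,856 parameters
print(count(tuned_head))     # 5,130 parameters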
(2) Parameter settings:
1. Data augmentation. Random cropping with pixel padding, horizontal flipping, and normalization are used. Data augmentation should be applied in moderation rather than stacked excessively.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    # Use the same CIFAR-10 statistics as in training; the ImageNet values
    # (0.485, ...) used originally would shift the test distribution.
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
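The mean (0.4914, 0.4822, 0.4465) is the per-channel mean of the CIFAR-10 training set, and the std values (0.2023, 0.1994, 0.2010) are the ones popularized by the pytorch-cifar reference code. A minimal sketch for recomputing the statistics yourself (the dataset path is the one used later in this post; the plain per-pixel std comes out slightly larger than the popular values):
import torch
import torchvision
import torchvision.transforms as transforms

# Load the raw training set (no augmentation) and stack it into one
# (50000, 3, 32, 32) tensor to compute per-channel statistics.
raw = torchvision.datasets.CIFAR10(root=r'D:\CV\pytorch\dataset\cifar-10-batches-py',
                                   train=True, download=True,
                                   transform=transforms.ToTensor())
data = torch.stack([img for img, _ in raw])
print(data.mean(dim=(0, 2, 3)))  # ~ (0.4914, 0.4822, 0.4465)
print(data.std(dim=(0, 2, 3)))   # ~ (0.2470, 0.2435, 0.2616)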
2. Dataset preparation. batch_size is set to 128.
# Raw strings avoid '\c', '\p', etc. being treated as escape sequences in Windows paths.
trainset = torchvision.datasets.CIFAR10(root=r'D:\CV\pytorch\dataset\cifar-10-batches-py',
                                        train=True, download=True, transform=transform_train)
trainLoader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
testset = torchvision.datasets.CIFAR10(root=r'D:\CV\pytorch\dataset\cifar-10-batches-py',
                                       train=False, download=True, transform=transform_test)
testLoader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)
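As a quick sanity check (a sketch, assuming the loaders above), one training batch should yield image and label tensors of the expected shapes:
images, labels = next(iter(trainLoader))
print(images.shape, labels.shape)  # torch.Size([128, 3, 32, 32]) torch.Size([128])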
3. Optimizer.
Cross-entropy loss is used as the loss function; the optimizer is stochastic gradient descent, optim.SGD(), with an initial learning rate of 0.01 and momentum=0.9. A cosine-annealing learning-rate scheduler gradually lowers the learning rate over training, which helps the optimization settle into a better minimum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) # 0.1,0.01
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
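Cosine annealing sets the learning rate at step t to lr(t) = eta_min + (lr_0 - eta_min) * (1 + cos(pi * t / T_max)) / 2, with eta_min = 0 by default. A standalone sketch of the schedule used here (note that T_max=200 is longer than the 100 training epochs below, so only the first half of the cosine is traversed):
import torch
import torch.optim as optim

# Trace the cosine-annealed learning rate over 200 steps with a dummy
# optimizer so the real training state is untouched.
dummy = optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.01)
sched = optim.lr_scheduler.CosineAnnealingLR(dummy, T_max=200)
lrs = []
for _ in range(200):
    lrs.append(dummy.param_groups[0]['lr'])
    dummy.step()
    sched.step()
print(lrs[0], lrs[100], round(lrs[199], 6))  # 0.01, 0.005, ~0.0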
The full code is shown below:
models.py
'''VGG11/13/16/19 in Pytorch.'''
import torch
import torch.nn as nn

cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
class VGG(nn.Module):
    def __init__(self, vgg_name):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        self.classifier = nn.Linear(512, 10)  # single fully connected layer (modification 2)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten the 1x1x512 feature map
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x),  # batch normalization (modification 1)
                           nn.ReLU(inplace=True)]
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]  # modification 3 (identity for a 1x1 map)
        return nn.Sequential(*layers)
def test():
    net = VGG('VGG11')
    x = torch.randn(2, 3, 32, 32)
    y = net(x)
    print(y.size())
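Calling test() should print torch.Size([2, 10]): two samples, each mapped to logits over the ten CIFAR-10 classes.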
main.py
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.nn import init
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
from models import *
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    # Same CIFAR-10 statistics as training (see the note in section (2) above).
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
trainset = torchvision.datasets.CIFAR10(root=r'D:\CV\pytorch\dataset\cifar-10-batches-py',
                                        train=True, download=True, transform=transform_train)
trainLoader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
testset = torchvision.datasets.CIFAR10(root=r'D:\CV\pytorch\dataset\cifar-10-batches-py',
                                       train=False, download=True, transform=transform_test)
testLoader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)
writer = SummaryWriter(r'D:\CV\pytorch\pytorch-cifar-master\logs_vgg16_100ep')
net = VGG('VGG16')
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) # 0.1,0.01
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
total_times = 100
accuracy_rate = []
net.to(device)
for epoch in range(total_times):
    net.train()
    running_loss = 0.0
    total_train_correct = 0
    total_train_samples = 0
    for i, (data, labels) in enumerate(trainLoader, 0):
        data = data.to(device)
        labels = labels.to(device)
        outputs = net(data)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # accumulate the loss once per batch
        _, pred = outputs.max(1)
        total_train_correct += (pred == labels).sum().item()
        total_train_samples += data.shape[0]
    train_loss = running_loss / len(trainLoader)
    train_accuracy = total_train_correct / total_train_samples
    writer.add_scalar('Train/Loss', train_loss, epoch)
    writer.add_scalar('Train/Accuracy', train_accuracy, epoch)
    print('epoch[%d] train_loss: %.4f, train_acc: %.4f' % (epoch + 1, train_loss, train_accuracy))
    net.eval()
    correct = 0   # number of correctly classified test images
    total = 0     # total number of test images
    losses = []   # per-batch test losses
    with torch.no_grad():
        for data in testLoader:
            images, labels = data
            images = images.to(device)
            outputs = net(images).cpu()
            # The index of the largest logit is the predicted class label.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            losses.append(criterion(outputs, labels).item())
    accuracy = 100 * correct / total
    accuracy_rate.append(accuracy)
    mean_loss = sum(losses) / len(losses)  # average test loss for this epoch
    writer.add_scalar('Test/Loss', mean_loss, epoch)
    writer.add_scalar('Test/Accuracy', accuracy, epoch)
    print(f'epoch[{epoch + 1}] test_loss: {mean_loss:.4f} test_acc: {accuracy:.2f}%')
    scheduler.step()
writer.close()
torch.save(net.state_dict(), r'D:\CV\pytorch\pytorch-cifar-master\res\VGG16_100epoch.pth')
accuracy_rate = np.array(accuracy_rate)
times = np.linspace(1, total_times, total_times)
plt.xlabel('epoch')
plt.ylabel('test accuracy (%)')
plt.plot(times, accuracy_rate)
plt.show()
print(accuracy_rate)
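After training, the saved weights can be reloaded for inference. A minimal sketch, assuming the same imports and definitions as main.py and the checkpoint path used in torch.save above:
# Reload the trained weights and classify a single test image.
net = VGG('VGG16')
net.load_state_dict(torch.load(r'D:\CV\pytorch\pytorch-cifar-master\res\VGG16_100epoch.pth',
                               map_location=device))
net.to(device)
net.eval()
with torch.no_grad():
    image, label = testset[0]                    # already normalized by transform_test
    logits = net(image.unsqueeze(0).to(device))  # add the batch dimension
    print(logits.argmax(1).item(), label)        # predicted class vs. ground truth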
(3) Training and test results and summary
We trained the fine-tuned VGG16 model on the CIFAR10 dataset for 100 epochs; the best test accuracy reached 91.33%, a clear improvement over the original VGG16 model. Comparing and analyzing the fine-tuned models, we draw the following conclusions.
The figure below shows the run output in PyCharm. Because of limited compute, my batch_size was set to 64, although 128 or 256 is generally preferable.