VGG卷积神经网络最大的特点是由多个vgg_block堆叠而成。每个vgg_block包含若干个卷积层:第一个卷积层负责把通道数从输入通道数变为输出通道数,其余卷积层的输入、输出通道数保持不变。卷积核大小一般为3×3、填充为1,因此卷积层不改变特征图大小;特征图的缩放由窗口大小为2、步长为2的最大池化层完成,每经过一个vgg_block,特征图的高和宽各减半。
vgg_block实现:
def vgg_block(conv_num, in_channel, out_channel):
    """Build one VGG stage: ``conv_num`` conv + ReLU pairs, then a max-pool.

    Args:
        conv_num: Number of convolution layers in the stage.
        in_channel: Channel count of the tensor entering the stage.
        out_channel: Channel count produced by every convolution in the stage.

    Returns:
        An ``nn.Sequential`` holding the conv/ReLU pairs followed by the pool.
    """
    # Only the first convolution changes the channel count; every later one
    # maps out_channel -> out_channel.
    widths = [in_channel] + [out_channel] * conv_num
    stage = []
    for src, dst in zip(widths[:-1], widths[1:]):
        stage += [
            # 3x3 kernel with padding 1 keeps the spatial size unchanged.
            nn.Conv2d(src, dst, kernel_size=3, padding=1),
            nn.ReLU(),
        ]
    # Kernel 2 / stride 2 pooling halves the height and width.
    stage.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*stage)
VGG根据网络深度的不同分为VGG11、VGG13、VGG16和VGG19等几个版本,每个VGG网络都由5个vgg_block和3个全连接层组成,名称中的数字即卷积层与全连接层的总层数。
VGG网络实现:
def vgg(in_channel, conv_arch, num_classes=2):
    """Assemble a VGG network from a stage specification.

    Args:
        in_channel: Channel count of the input images.
        conv_arch: Iterable of ``(conv_num, out_channel)`` pairs, one per
            convolutional stage, in order.
        num_classes: Size of the final linear layer's output. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        An ``nn.Sequential`` made of the convolutional stages, a flatten
        layer and a three-layer fully connected classifier.
    """
    conv_blocks = []
    for conv_num, out_channel in conv_arch:
        conv_blocks.append(vgg_block(conv_num, in_channel, out_channel))
        # The next stage consumes what this stage produced.
        in_channel = out_channel

    # After the loop ``in_channel`` is the width of the last stage, so the
    # classifier input no longer assumes a 512-channel final stage.
    # NOTE: the 7 * 7 spatial size is still fixed; it matches five halvings
    # of a 224 x 224 input (224 / 2**5 = 7). Other input sizes or stage
    # counts need a different value here.
    feature_dim = in_channel * 7 * 7

    return nn.Sequential(
        *conv_blocks,
        nn.Flatten(),
        nn.Linear(feature_dim, 4096),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(4096, num_classes),
    )
def vgg11(channel_num):
    """Build the VGG11 variant for images with ``channel_num`` channels."""
    # (convs per stage, stage width): 1 + 1 + 2 + 2 + 2 = 8 conv layers.
    stage_spec = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
    return vgg(channel_num, stage_spec)
def vgg13(channel_num):
    """Build the VGG13 variant for images with ``channel_num`` channels."""
    # (convs per stage, stage width): 2 + 2 + 2 + 2 + 2 = 10 conv layers.
    stage_spec = ((2, 64), (2, 128), (2, 256), (2, 512), (2, 512))
    return vgg(channel_num, stage_spec)
def vgg16(channel_num):
    """Build the VGG16 variant for images with ``channel_num`` channels."""
    # (convs per stage, stage width): 2 + 2 + 3 + 3 + 3 = 13 conv layers.
    stage_spec = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
    return vgg(channel_num, stage_spec)
def vgg19(channel_num):
    """Build the VGG19 variant for images with ``channel_num`` channels."""
    # (convs per stage, stage width): 2 + 2 + 4 + 4 + 4 = 16 conv layers.
    stage_spec = ((2, 64), (2, 128), (4, 256), (4, 512), (4, 512))
    return vgg(channel_num, stage_spec)
模型构建完成后,定义一个训练脚本,使用自定义数据集进行训练。
def train():
    """Train VGG11 on the ``ImageFolder`` dataset under ``dataset/dogs``.

    Runs 10 epochs with Adam (lr 1e-4) and cross-entropy loss. After each
    epoch the model is evaluated on the ``val`` split, and its ``state_dict``
    is written to ``vgg11.pt`` whenever validation accuracy improves.

    Raises:
        FileNotFoundError: If the dataset root directory does not exist.
    """
    # Use the first GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('using device {}'.format(device))

    # Preprocessing. Random augmentation is applied to the training split
    # only; validation uses a deterministic resize + center crop so that the
    # measured accuracy (and therefore the "best" checkpoint) is reproducible.
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),   # random scale + crop
            transforms.RandomHorizontalFlip(),   # random horizontal flip
            transforms.ToTensor(),
            normalize,
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    # Load the dataset.
    batch_size = 32
    data_path = 'dataset/dogs'
    # Raise explicitly instead of using ``assert``, which is removed when
    # Python runs with -O.
    if not os.path.exists(data_path):
        raise FileNotFoundError("{} does not exist".format(data_path))
    train_dataset = datasets.ImageFolder(
        root=os.path.join(data_path, 'train'),
        transform=data_transform['train'],
    )
    val_dataset = datasets.ImageFolder(
        root=os.path.join(data_path, 'val'),
        transform=data_transform['val'],
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=8
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=5, shuffle=False, num_workers=8
    )

    model = vgg11(channel_num=3)
    model.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    epochs = 10
    save_path = 'vgg11.pt'
    best_acc = 0.0
    steps = len(train_loader)

    for epoch in range(epochs):
        # ---- training ----
        model.train()
        running_loss = 0.0
        for step, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            # ``.item()`` yields a plain float, so the log shows a number
            # rather than a tensor repr.
            batch_loss = loss.item()
            running_loss += batch_loss
            print('epoch:{},step:{}/{},loss:{}'.format(
                epoch + 1, step + 1, steps, batch_loss))
        print('epoch:{}, train_loss:{}'.format(
            epoch + 1, running_loss / max(steps, 1)))

        # ---- validation ----
        model.eval()
        correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images.to(device))
                predict = torch.argmax(outputs, dim=1)
                correct += torch.eq(predict, labels.to(device)).sum().item()
        val_acc = correct / len(val_dataset)
        print('epoch:{}, acc:{}'.format(epoch + 1, val_acc))

        # Keep only the weights of the best-performing epoch so far.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), save_path)

    print("finish train")
# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    train()