A VGG16 Image Classification Model (with hand-written dataset-loading code to deepen your understanding)

Preface

A recent project required a deep learning model for an image classification task. My personal dataset is fairly simple, so I chose the VGG16 model; once the dataset grows later on, I will switch to VGG19.

Contents

  • 1. The VGG16 network
  • 2. Training and points to note
  • 3. Testing: classifying images with your own model

Main text

1. The VGG16 network

Without further ado, here is the network definition:

import torch.nn as nn
class Vgg16Net(nn.Module):
    def __init__(self):
        super(Vgg16Net, self).__init__()

        self.layer1 = nn.Sequential(
            # (in_channels, out_channels, kernel_size). e.g. input 32*32*3 -> (32 + 2*1 - 3)/1 + 1 = 32, output 32*32*64
            nn.Conv2d(3, 64, 3, padding=1), 
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # (in_channels, out_channels, kernel_size). input: 32*32*64, conv: 3x3x64 kernels, output: 32*32*64
            nn.Conv2d(64, 64, 3, padding=1), 
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)   # input: 32*32*64, output: 16*16*64 (2x2 pooling halves H and W)
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(256, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.layer5 = nn.Sequential(
            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_layer = nn.Sequential(
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.layer5,
        )

        self.fc = nn.Sequential(
            nn.Linear(512, 4096),  # 512 = flattened conv features for a 32x32 input (1*1*512)
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(4096, 1000),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(1000, 82),  # 82 output classes (the number of classes in this dataset)
        )

    def forward(self, x):
        x = self.conv_layer(x)
        x = x.view(x.size(0), -1)  # flatten: (N, 512, 1, 1) -> (N, 512) for 32x32 inputs
        x = self.fc(x)
        return x
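
As a quick sanity check (a minimal sketch, not part of the original script), you can run a dummy 32x32 batch through the network and confirm the output has one score per each of the 82 classes:

import torch

model = Vgg16Net()
x = torch.randn(1, 3, 32, 32)  # dummy batch: one 32x32 RGB image
print(model(x).shape)          # expected: torch.Size([1, 82])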

2. Training and points to note

Note: most online tutorials read the dataset with torch's built-in utilities. That is certainly convenient, but it also skips over some small pitfalls, which will not help you later when you move on to pixel-level classification networks.
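
For reference, here is a minimal sketch of the built-in route those tutorials take, assuming a hypothetical ImageFolder-style layout with one sub-folder per class; everything below is what we will instead write by hand:

import torch
from torchvision import datasets, transforms

# Hypothetical layout: ./data/train/<class_name>/<image>.png
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),  # HWC uint8 image -> CHW float tensor in [0, 1]
])
train_set = datasets.ImageFolder('./data/train', transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
# ImageFolder maps folder names to integer labels, and DataLoader hands
# them back as tensors -- the two details we reproduce manually below.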

We all know that an image must be converted to a tensor before it is fed into the network, but what about the image's label? The flowchart below walks through the process.
[Figure: training flowchart — image data Y → forward pass → prediction y; y is compared against Y's label to compute the loss, which is backpropagated.]

As the figure shows, the image data Y goes through a forward pass to produce the prediction y; comparing Y's label with y tells us whether the prediction is correct, and from that comparison we compute the loss and backpropagate.

Here our image Y is a tensor, so the prediction y is also a tensor. Since the loss is computed from Y's label together with the prediction y, Y's label must be a tensor as well.
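
Concretely, that means text labels must first be mapped to integer class indices and then wrapped in tensors. A minimal sketch (the class names are made up):

import torch

class_names = ['person', 'dog', 'cat']  # hypothetical label strings
class_to_idx = {name: i for i, name in enumerate(class_names)}

label_tensor = torch.tensor([class_to_idx['dog']])  # tensor([1])
# nn.CrossEntropyLoss expects exactly this: a tensor of integer class indices.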

import cv2
import random
from tqdm import tqdm
from model import Vgg16Net

import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
import torchvision.models as models


Epochs = 2000
learning_rate = 0.001

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = Vgg16Net().to(device)
# model = models.vgg19(pretrained=False).to(device)
# model = models.resnext101_32x8d(pretrained=False).to(device)
model.train()
summary(model, (3, 32, 32))
loss_func = nn.CrossEntropyLoss()

# Read image paths and labels from the training list
with open('./data/train_label.txt', 'r') as f:
    train = f.readlines()

# Helper that records whether one prediction is correct (used for the average accuracy)
def test_ACC(img, label, Accuracy):
    with torch.no_grad():
        out = model(img)
    _, pre_tensor = torch.max(out, 1)

    if pre_tensor.item() == label.item():
        Accuracy.append(1)
    else:
        Accuracy.append(0)

train_data = []
train_label = []

# Read the training data
for i in range(len(train)):
    labels = []

    # Parse "<path> <label>" lines. Some of my file names contain a space,
    # hence the three-token case -- you may not need this for your own dataset.
    lists = train[i].strip('\n').split(' ')
    if len(lists) == 3:
        img_path, label = lists[0] + ' ' + lists[1], lists[2]
    elif len(lists) == 2:
        img_path, label = lists[0], lists[1]

    # Read the image with OpenCV
    img = cv2.imread(img_path)
    img = cv2.resize(img, (32, 32))
    # Array -> tensor; div(255.0) scales pixels to [0, 1]; unsqueeze(0) adds a batch dimension
    img = torch.from_numpy(img).to(device).div(255.0).unsqueeze(0)
    img = img.permute(0, 3, 1, 2)  # reorder dimensions: NHWC -> NCHW
    train_data.append(img.float())
    labels.append(int(label))
    # The next line is the key point: if your labels are words (Chinese or English),
    # you must first map each label to a number, e.g.
    # ['person', 'dog', 'cat'] -> [1, 2, 3] (1 = person, 2 = dog, 3 = cat),
    # and then convert those numbers to tensors!
    # Note: the label must be a tensor too -- a small detail you never notice
    # when you rely on torch's built-in dataset readers.
    train_label.append(torch.tensor(labels).to(device))

# The test set is read exactly the same way as the training set above.
# (Assumption: the test list lives in './data/test_label.txt' -- a hypothetical
# path -- and uses the same "<path> <label>" format as the training list.)
with open('./data/test_label.txt', 'r') as f:
    test = f.readlines()

test_data = []
test_label = []

for i in range(len(test)):
    labels = []
    lists = test[i].strip('\n').split(' ')
    if len(lists) == 3:
        img_path, label = lists[0] + ' ' + lists[1], lists[2]
    elif len(lists) == 2:
        img_path, label = lists[0], lists[1]
    img = cv2.imread(img_path)
    img = cv2.resize(img, (32, 32))
    img = torch.from_numpy(img).to(device).div(255.0).unsqueeze(0)  # array -> tensor; scale pixels to [0, 1]; add batch dim
    img = img.permute(0, 3, 1, 2)  # NHWC -> NCHW
    test_data.append(img.float())
    labels.append(int(label))
    test_label.append(torch.tensor(labels).to(device))

# Start training
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
for i in range(1, Epochs + 1):
    batch = 1
    ACC_Loss = 0
    for step in tqdm(range(0, 256), desc='Epoch: ' + str(i), postfix={'lr': learning_rate}, colour='red'):
        # Draw one random training sample per step (effective batch size 1);
        # see the mini-batch sketch after this section.
        rand_num = random.randint(0, len(train_data) - 1)
        out = model(train_data[rand_num])
        loss = loss_func(out, train_label[rand_num])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ACC_Loss += loss.item()  # accumulate, then average after the loop
        batch += 1
    ACC_Loss /= (batch - 1)  # mean loss over this epoch's 256 steps

    log_train = 'Train Epoch: ' + str(i) + ', Learning_rate: ' + str(learning_rate) + ', Batch: ' + str(batch - 1) + ', ACC_LOSS: ' + str(ACC_Loss)
    print(log_train)

    # Every 10 epochs, run a test pass and append the results to log.txt,
    # making it easy to review and pick the best model afterwards.
    if i % 10 == 0:
        model.eval()
        Accuracy = []
        for num_b in tqdm(range(0, len(test_data)), desc='Test: '):
            test_ACC(test_data[num_b], test_label[num_b], Accuracy)
        num = sum(Accuracy)  # number of correct predictions
        test_log = 'Test: Data Volume: {}, Accuracy: {:.2f} %'.format(len(Accuracy), num / len(Accuracy) * 100)
        print(test_log + '\n')
        with open('log.txt', 'a') as f:
            f.write(log_train + '\n' + test_log + '\n')
        model.train()  # back to training mode (BatchNorm/Dropout active again)

        torch.save(model.state_dict(), './VGG16_model/' + str(i) + '.pt')
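
The loop above feeds one image per step, i.e. an effective batch size of 1. If you want real mini-batches, one sketch of the idea (the batch size of 32 is a made-up choice; this reuses train_data, train_label, model and loss_func from the script above) is to stack preprocessed tensors along the batch dimension:

import random
import torch

batch_size = 32  # hypothetical
idx = random.sample(range(len(train_data)), batch_size)
batch_imgs = torch.cat([train_data[j] for j in idx], dim=0)     # shape: (32, 3, 32, 32)
batch_labels = torch.cat([train_label[j] for j in idx], dim=0)  # shape: (32,)
out = model(batch_imgs)              # one forward pass for all 32 images
loss = loss_func(out, batch_labels)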

3. Testing: classifying images with your own model

import cv2
import time
import json
import torch
from model import Vgg16Net

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

def predict(img, label):
    # Build the VGG16 network and load the trained weights
    model = Vgg16Net().to(device)
    model.load_state_dict(torch.load('VGG16_model/best_97.88.pt'))
    model.eval()  # inference mode: freeze BatchNorm stats, disable Dropout
    cv2.imshow('img', img)
    cv2.waitKey(1)  # give the window a chance to render
    img = torch.from_numpy(img).to(device).div(255.0).unsqueeze(0).float()
    img = img.permute(0, 3, 1, 2)  # NHWC -> NCHW, same as during training
    with torch.no_grad():
        out = model(img)
    _, pre_tensor = torch.max(out, 1)
    try:
        print(label[str(pre_tensor.item())])
    except KeyError:
        print('predict failed')

if __name__ == '__main__':
    # Load my label lookup table (class index -> class name)
    with open('./label.json', 'r') as f:
        label = json.load(f)

    # Read the image to classify
    path = 'data/123.png'
    s_time = time.time()
    img = cv2.imread(path)
    img = cv2.resize(img, (32, 32))  # must match the 32x32 input size used in training
    predict(img, label)
    e_time = time.time()
    print('\nTime: {:.2f} s\n'.format(e_time - s_time))

That wraps up the VGG16 image classification task. The points to keep in mind are:

  1. Not only the images but also their labels must be converted to tensors;
  2. Remember to permute the image dimensions when loading: the required layout depends on your network (PyTorch convolutions expect NCHW);
  3. Save the training logs to a file during training, so that afterwards you can pick the best model that has not overfitted, or take a nearby checkpoint for transfer training; a loading sketch follows below.
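
For point 3, a minimal sketch of picking a saved checkpoint back up (the file name is hypothetical; choose whichever epoch scored best in log.txt):

import torch
from model import Vgg16Net

model = Vgg16Net()
model.load_state_dict(torch.load('./VGG16_model/100.pt'))  # hypothetical epoch-100 checkpoint
model.train()  # resume training (fine-tuning / transfer) from these weights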