在学习CS231n计算机视觉课程的过程中,了解到了AlexNet、GoogLeNet以及ResNet等模型的相关内容,因此查阅了相关的文献和博客。
其中,AlexNet模型主要运用了以下五个方法:
- 使用了ReLU激活函数加速收敛
- 使用GPU并行,加速训练,也为之后的分组卷积理论奠定基础
- 提出了局部响应归一化(LRN),增强了模型的泛化能力
- 使用交叠池化防止过拟合
  一般的池化操作中,kernel-size和stride是相等的,而交叠池化指的是stride < kernel-size
- 使用Dropout(随机失活)减轻过拟合
AlexNet网络模型总共只有五个卷积层和三个全连接层。论文中将模型均分为两块,分别部署到两个GPU上以加快模型的训练(只在特定的卷积层上进行数据交互);但是在只有单个GPU的情况下,就没必要拆分了。
数据增强采用了两种方式
- 通过对训练集的图片进行随机的水平镜像,随机的剪切来扩充训练集
- 通过对RGB像素值做PCA(主成分分析),然后沿各主成分方向添加服从高斯分布的随机扰动,来改变训练图片的颜色和光照强度
AlexNet网络结构:
import torch
import torch.nn as nn
import torch.nn.functional as F
class AlexNet(nn.Module):
    """AlexNet-style CNN for 3 x 224 x 224 images.

    ``features`` stacks five convolutions (each followed by ReLU) and three
    overlapping max-pooling stages, ending in a 256 x 6 x 6 feature map.
    ``classifiler`` (attribute name kept exactly as spelled, so existing
    references and state-dict keys stay valid) maps the flattened features to
    ``num_classes`` logits through three fully connected layers.
    """

    def __init__(self, num_classes=1000):
        """Build the layers.

        Args:
            num_classes: number of output classes, i.e. the width of the last
                linear layer.
        """
        super(AlexNet, self).__init__()
        self.num_classes = num_classes
        self.features = nn.Sequential(
            # (3, 224, 224) -> (96, 55, 55)
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            # (96, 55, 55) -> (96, 27, 27); overlapping pooling: stride < kernel_size
            nn.MaxPool2d(kernel_size=3, stride=2),
            # (96, 27, 27) -> (256, 27, 27)
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            # (256, 27, 27) -> (256, 13, 13)
            nn.MaxPool2d(kernel_size=3, stride=2),
            # (256, 13, 13) -> (384, 13, 13)
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # (384, 13, 13) -> (384, 13, 13)
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # (384, 13, 13) -> (256, 13, 13)
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # (256, 13, 13) -> (256, 6, 6)
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifiler = nn.Sequential(
            # Dropout() defaults to p=0.5
            nn.Dropout(),
            nn.Linear(in_features=6 * 6 * 256, out_features=4096),
            # Without an activation the stacked Linear layers would reduce to a
            # single affine map. ReLU and Dropout are grouped in one slot so
            # the Linear layers keep indices 1 / 3 / 5 and the state-dict keys
            # do not change.
            nn.Sequential(nn.ReLU(inplace=True), nn.Dropout(0.5)),
            nn.Linear(in_features=4096, out_features=4096),
            nn.Sequential(nn.ReLU(inplace=True), nn.Dropout(0.5)),
            nn.Linear(in_features=4096, out_features=self.num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch of images.

        Args:
            x: image tensor; the layer sizes assume 3 x 224 x 224 inputs.

        Returns:
            Tensor with ``num_classes`` values per row.
        """
        x = self.features(x)
        # Flatten every 256 x 6 x 6 feature map into one row.
        x = x.reshape(-1, 6 * 6 * 256)
        x = self.classifiler(x)
        return x
#
# try:
# from torch.hub import load_state_dict_from_url
# except:
# from torch.utils.model_zoo import load_url as load_state_dict_from_url
#
# model_url = {
# 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
# }
def alexnet(pretrain=False, show=True, **kwargs):
    """Build an ``AlexNet`` and optionally load downloaded weights into it.

    Args:
        pretrain: if True, download the checkpoint referenced below and load
            it into the model. Defaults to False.
        show: passed as ``progress`` to the download helper (whether to show
            a download progress bar).
        **kwargs: forwarded unchanged to the ``AlexNet`` constructor
            (e.g. ``num_classes``).

    Returns:
        The constructed model.
    """
    model_in = AlexNet(**kwargs)
    if pretrain:
        # Resolved locally: the module-level import and URL table are only
        # present as commented-out code, so relying on them raised NameError.
        try:
            from torch.hub import load_state_dict_from_url
        except ImportError:
            from torch.utils.model_zoo import load_url as load_state_dict_from_url
        model_url = {
            'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
        }
        state_dict = load_state_dict_from_url(model_url['alexnet'], progress=show)
        # NOTE(review): this checkpoint is published for torchvision's AlexNet;
        # confirm that its keys and tensor shapes match this file's AlexNet
        # (whose head attribute is spelled `classifiler`), otherwise
        # load_state_dict raises a RuntimeError.
        model_in.load_state_dict(state_dict)
    return model_in
#
# device = torch.device("cuda:0" if torch.cuda.is_available else "cpu")
# x = torch.randn((1, 3, 224, 224))
# net = alexnet()
# net.to(device)
# x = net.forward(x.cuda())
#
# torch.save(net.state_dict, r"D:\QQPCmgr\Desktop\a.pth")
训练模型:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
from net import alexnet
# Resize the training images, then randomly flip and crop them to the
# 224 x 224 network input size.
mytransforms = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
])
trainset = CIFAR10(download=False, root=r'D:\QQPCmgr\Desktop\data', transform=mytransforms)
# Small train / hold-out splits. Subset objects (unlike zip iterators over the
# raw arrays) can be indexed, have a length, and return transformed samples.
Ctrainset = torch.utils.data.Subset(trainset, range(3000))
Ctestset = torch.utils.data.Subset(trainset, range(3000, 3500))


class CTraindata(Dataset):
    """Dataset wrapper that exposes the (sample, target) pairs of an indexable dataset."""

    def __init__(self, trainset):
        """Store the wrapped dataset.

        Args:
            trainset: any object supporting ``len()`` and integer indexing
                whose items start with ``(sample, target)``.
        """
        super(CTraindata, self).__init__()
        self.trainset = trainset

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.trainset)

    def __getitem__(self, index):
        """Return the ``(sample, target)`` pair stored at ``index``."""
        # Index only once: every lookup re-runs the random transform pipeline.
        item = self.trainset[index]
        return item[0], item[1]


# Hyper-parameters
epoch = 50
num_classes = 10
batch_size = 64
lr = 1e-4
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# Train on the 3000-sample split so that Ctestset stays unseen during training.
trainLoader = DataLoader(dataset=CTraindata(Ctrainset),
                         batch_size=batch_size,
                         shuffle=False)
path = 'D://Desktop/data/'

if __name__ == '__main__':
    device = ("cuda:0" if torch.cuda.is_available() else "cpu")
    net = alexnet(num_classes=num_classes)
    net.train()
    certrion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)
    net.to(device)
    for i in range(epoch):
        for index, data in enumerate(trainLoader, 0):
            optimizer.zero_grad()
            X, y = data
            X = X.to(device)
            # BCEWithLogitsLoss needs floating-point targets; one_hot returns integers.
            y = F.one_hot(y, num_classes).float().to(device)
            y_pre = net(X)
            loss = certrion(y_pre, y)
            loss.backward()
            optimizer.step()
        # Loss of the last mini-batch of this epoch.
        print("[epoch %d, loss %f]" % (i + 1, loss.item()))

    # Evaluate on the hold-out split: dropout off, no gradient tracking.
    net.eval()
    testLoader = DataLoader(dataset=CTraindata(Ctestset),
                            batch_size=batch_size,
                            shuffle=False)
    correct = 0
    with torch.no_grad():
        for CtestX, CY in testLoader:
            CY_pre = net(CtestX.to(device))
            correct += (CY_pre.argmax(dim=1).cpu() == CY).sum().item()
    accuracy = correct / len(Ctestset)
    # Save the weights themselves (state_dict() must be called); the file name
    # starts with the first five characters of the accuracy.
    torch.save(net.state_dict(), path + str(accuracy)[:5] + '_' + str(epoch) + '.pth')
预测模型:
# The class defined in net.py is spelled `AlexNet`; names are case-sensitive.
from net import AlexNet
import torch
import torch.nn as nn
import torch.nn.functional as F
import PIL.Image as Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import numpy as np

device = ("cuda:0" if torch.cuda.is_available() else "cpu")
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# One output per class name, so the head matches the trained checkpoint.
net = AlexNet(num_classes=len(classes))
# Path of the trained checkpoint (.pth holding a state dict); fill in before running.
weight_path = ""
# load_state_dict copies the weights into the model; assigning to
# net.state_dict would only overwrite the method and leave the weights random.
net.load_state_dict(torch.load(weight_path, map_location=device))
net.to(device)
net.eval()

if __name__ == '__main__':
    img_path = str(input())
    # Force three channels so grayscale / RGBA files fit the first convolution.
    img = Image.open(img_path).convert("RGB")
    # Bring the image to the 224 x 224 network input size. Resize(224) alone
    # only fixes the shorter side; resize-then-crop mirrors the training
    # pipeline (Resize(256) followed by a 224 crop).
    mytransforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ])
    # Add the batch dimension expected by the network.
    imgX = mytransforms(img).unsqueeze(0).to(device)
    with torch.no_grad():
        # The training script optimises BCEWithLogitsLoss, so a sigmoid turns
        # each logit into that class's score in [0, 1].
        y = torch.sigmoid(net(imgX))[0].cpu().numpy()
    best = int(np.argmax(y))
    print("The result class is %s, and the accuracy is %f" % (classes[best], y[best]))