基于PyTorch实现LeNet、AlexNet、ResNet完成MNIST手写字体识别任务
引言
本文基于PyTorch实现LeNet、AlexNet、ResNet完成MNIST手写字体识别任务。1节中说明了数据集的处理方法,2节说明了算法的设计思路,3节给出了LeNet、AlexNet、ResNet的实现代码,最后分析了采用PyTorch框架与自己手写的深度学习框架的不同,4节中对各个卷积网络进行训练,给出了训练结果和可视化结果,其中对LeNet网络结构进行了一些不同的调整和训练,5节中是本项目的一些总结,文末在附录中给出了本项目的所有代码。
1 数据集
简便起见,直接使用torchvision.datasets中封装好的MNIST()来加载MNIST的ubyte格式的数据集,定义数据集加载函数load_data_mnist,代码如下:
def load_data_mnist(batch_size, resize=None, num_workers=4, root="../data"):
    """Build the MNIST training and test data loaders.

    Args:
        batch_size: Number of samples per mini-batch.
        resize: Optional size handed to ``transforms.Resize``; a falsy value
            skips resizing.
        num_workers: Number of worker processes used by each ``DataLoader``.
        root: Directory passed to ``torchvision.datasets.MNIST`` as the
            dataset location (``download=True`` is always set).

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    steps = [transforms.ToTensor()]
    if resize:
        # The resize step is placed ahead of the tensor conversion.
        steps.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(steps)
    mnist_train = torchvision.datasets.MNIST(root=root, train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.MNIST(root=root, train=False, transform=trans, download=True)
    train_loader = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
    test_loader = data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers)
    return train_loader, test_loader
2 算法设计
2.1 train_one_epoch
定义train_one_epoch函数,实现对所有数据的一次训练迭代,返回该次迭代的训练损失和数据集ACC,代码如下:
def train_one_epoch(net, train_iter, device, optimizer, loss):
    """Run one optimisation pass over every batch of ``train_iter``.

    Args:
        net: Model to train; it is switched to training mode once up front.
        train_iter: Iterable of ``(X, y)`` mini-batches that supports ``len``.
        device: Device each batch is moved to before the forward pass.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.

    Returns:
        ``(mean_batch_loss, accuracy)``: the per-batch loss averaged over the
        number of batches, and the fraction of samples whose arg-max
        prediction equals the label.

    Raises:
        ZeroDivisionError: If ``train_iter`` contains no batches.
    """
    # The mode does not change inside the loop, so set it once instead of
    # once per batch.
    net.train()
    total_loss = 0.0
    correct = 0.0
    seen = 0
    for X, y in train_iter:
        optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        y_hat = net(X)
        batch_loss = loss(y_hat, y)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
        correct += (torch.argmax(y_hat, dim=1) == y).sum().item()
        seen += len(y)
    return total_loss / len(train_iter), correct / seen
2.2 evaluate_accuracy
定义evaluate_accuracy函数,在每次调用train_one_epoch函数之后,用它在测试集(验证集)上评估模型的ACC并返回,代码实现如下:
def evaluate_accuracy(net, data_iter, device=None):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        net: Model to evaluate; it is put into evaluation mode.
        data_iter: Iterable of ``(X, y)`` mini-batches.
        device: Device the batches are moved to. When ``None``, the device of
            the model's first parameter is used.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    net.eval()
    # Compare against None explicitly so a falsy but valid device
    # specification (for example the index 0) is not silently replaced.
    if device is None:
        device = next(iter(net.parameters())).device
    correct, seen = 0.0, 0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            predicted = torch.argmax(net(X), dim=1)
            correct += torch.sum(predicted == y).item()
            seen += len(y)
    return correct / seen
2.3 train
定义train函数,实现对模型整个训练过程的控制,首先对网络的卷积和全连接层采用xavier初始化,定义优化器、损失函数,随后进行num_epochs次训练迭代,对每个epoch的迭代,依次调用train_one_epoch函数和evaluate_accuracy函数并记录训练过程,最后用matplotlib对训练过程可视化,具体的实现代码如下:
def train(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD, log per-epoch metrics and plot the curves.

    Args:
        net: Model to train; its linear and convolutional weights are
            re-initialised with Xavier-uniform values before training.
        train_iter: Training mini-batch iterable.
        test_iter: Evaluation mini-batch iterable.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for ``torch.optim.SGD``.
        device: Device the model and batches are moved to.

    Side effects:
        Appends one line per epoch to a timestamped ``results*.txt`` file in
        the working directory, prints the same line, and shows a matplotlib
        figure with training loss, training accuracy and test accuracy.
    """
    def init_weights(m):
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    # File that stores the training and evaluation metrics of this run.
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    trl, tra, tea = [], [], []
    for epoch in range(num_epochs):
        train_l, train_acc = train_one_epoch(net, train_iter, device, optimizer, loss)
        test_acc = evaluate_accuracy(net, test_iter, device)
        trl.append(train_l)
        tra.append(train_acc)
        tea.append(test_acc)
        summary = f'epoch {epoch} loss {train_l:.3f}, train acc {train_acc:.3f}, test acc {test_acc:.3f} \n'
        with open(results_file, "a") as f:
            f.write(summary)
        print(summary)
    # Call xlabel; assigning to it would replace the pyplot function and
    # leave the axis unlabeled.
    plt.xlabel('epoch')
    xlim = range(0, num_epochs)
    legend = ['train loss', 'train acc', 'test acc']
    plt.plot(xlim, trl, linewidth=1, color='purple')
    plt.plot(xlim, tra, linewidth=1, color='green', linestyle='--')
    plt.plot(xlim, tea, linewidth=1, color='blue', linestyle='--')
    plt.legend(legend, ncol=4)
    plt.grid(axis='y', linewidth=0.3)
    plt.xticks(range(0, num_epochs, 2))
    plt.show()
2.4 程序入口
以LeNet为例,定义好批大小batch_size、 学习率lr、迭代次数num_epochs、调用load_data_mnist函数加载数据集得到train_iter, test_iter,再把这些参数传入train函数即可,代码如下,其中try_gpu函数用于检验cuda是否可用:
if __name__ == '__main__':
    def try_gpu():
        """Return the first CUDA device when at least one exists, else the CPU."""
        has_cuda = torch.cuda.device_count() >= 1
        return torch.device('cuda:0') if has_cuda else torch.device('cpu')

    # Example run with LeNet-5: pick the model, the optimisation settings
    # and the data loaders, then hand everything to train().
    net = LeNet5
    batch_size = 256
    lr = 0.9
    num_epochs = 10
    train_iter, test_iter = load_data_mnist(batch_size=batch_size)
    train(net, train_iter, test_iter, num_epochs, lr, try_gpu())
3 模型实现
3.1 LeNet
搭建LeNet模型的代码如下:
import torch
from torch import nn
from torch.nn import functional as F
# LeNet-5: two convolution/average-pool stages followed by three linear layers.
# The 16 * 5 * 5 flatten size corresponds to a 1x28x28 input.
LeNet5 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)
3.2 AlexNet
搭建AlexNet模型的代码如下:
# AlexNet adapted to single-channel input and 10 output classes.
# The 6400-feature flatten size corresponds to a 1x224x224 input.
AlexNet = nn.Sequential(
    # Feature extractor.
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # Classifier with dropout after each hidden linear layer.
    nn.Linear(in_features=6400, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=10),
)
3.3 ResNet
搭建ResNet模型的代码如下:
class Residual(nn.Module):
    """Residual block: two 3x3 convolutions with batch norm plus a shortcut.

    Args:
        input_channels: Channel count of the incoming tensor.
        num_channels: Channel count produced by the block.
        use_1x1conv: When true, the shortcut goes through a 1x1 convolution
            (with the same stride as the first convolution) so its shape
            matches the main branch.
        strides: Stride of the first 3x3 convolution and of the 1x1 shortcut.
    """

    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ``relu(main_branch(X) + shortcut(X))``."""
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Test for None explicitly rather than relying on module truthiness.
        if self.conv3 is not None:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)
# Network stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3
# stride-2 max pool.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Return a list of ``num_residuals`` ``Residual`` units forming one stage.

    Unless ``first_block`` is true, the first unit maps ``input_channels`` to
    ``num_channels`` with a stride of 2 and a 1x1 shortcut convolution; every
    other unit keeps ``num_channels`` channels with the default stride.
    """
    units = []
    for index in range(num_residuals):
        downsample = index == 0 and not first_block
        if downsample:
            unit = Residual(input_channels, num_channels, use_1x1conv=True, strides=2)
        else:
            unit = Residual(num_channels, num_channels)
        units.append(unit)
    return units
# Four residual stages of two units each; only the first stage keeps the
# channel count and skips the stride-2 unit.
b2 = nn.Sequential(*resnet_block(input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*resnet_block(input_channels=256, num_channels=512, num_residuals=2))
# Stem + stages + global average pooling + 10-way linear classifier.
ResNet18 = nn.Sequential(
    b1,
    b2,
    b3,
    b4,
    b5,
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10),
)
3.4 简要说明PyTorch框架与作业02的不同
我认为PyTorch框架与自己手写的精简版深度学习框架最大的区别是以下5点:
1) PyTorch能够调用GPU进行矩阵的快速运算,使得神经网络训练迅速;
2) PyTorch功能齐全,激活函数,优化器,损失函数,初始化方法等很齐全,配套的其他库也很多;
3) PyTorch生态好,使用者众多,能在网络中快速找到问题的解决办法;
4) PyTorch能够自动计算梯度,模块化设计得很好,搭建模型十分方便;
5) PyTorch一直在更新,能够不断适应最新的深度学习模型,如图神经网络等。
4 模型训练及结果
4.1 训练LeNet
4.1.1 使用原始的LeNet5的网络结构进行训练
训练的一些参数如下:
# Settings for the baseline LeNet-5 run.
net = LeNet5
batch_size = 256
# No resize argument is passed to the loader for this run.
train_iter, test_iter = load_data_mnist(batch_size=batch_size)
lr, num_epochs = 0.6, 20
# SGD configured with the learning rate only, and cross-entropy loss.
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
训练最终结果:[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jkIBN831-1682870518342)(E:/typora-image/image-20230430210026789.png)]
训练过程可视化如下:
4.1.2 调整LeNet5的网络结构进行训练
本小节模型训练所需参数与4.1.1节中的保持不变,尝试改变卷积核大小、输出通道数、激活函数、卷积层或全连接层数量进行训练。
4.1.2.1 调整卷积核大小
将LeNet5中卷积非池化层的卷积核大小改为3。
kernel_size=3  # used in place of kernel_size=5 in the Conv2d layers
训练最终结果:[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-X4SnO5KM-1682870518344)(E:/typora-image/image-20230430230814355.png)]
训练过程可视化如下:
4.1.2.2 调整输出通道数
将LeNet5中卷积层的输出通道数改为原来的一半。
nn.Conv2d(1, 6//2, kernel_size=5, padding=2)  # convolution layer 1
nn.Conv2d(6//2, 16//2, kernel_size=5)  # convolution layer 2
训练最终结果:
![在这里插入图片描述](https://img-blog.csdnimg.cn/0b93b74f16534f2ea033dffe2dba8eb3.png#pic_center)
训练过程可视化如下:
4.1.2.3 调整激活函数
将LeNet5中的sigmoid激活函数换成ReLU。
训练最终结果:[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Y2EdJg6A-1682870518346)(E:/typora-image/image-20230430235456762.png)]
训练过程可视化如下:
4.1.2.4 调整卷积层或全连接层数量
减少卷积层与全连接层数量,均减少为1。
# Reduced LeNet variant: a single convolution stage and a single linear layer.
# The 3136-feature flatten size corresponds to a 1x28x28 input.
LeNet5_04 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=3136, out_features=10),
)
训练最终结果:![在这里插入图片描述](https://img-blog.csdnimg.cn/d767d779c8a247a885227f90038d7fbc.png#pic_center)
训练过程可视化如下:
4.2 训练AlexNet
训练的一些参数如下:
# Settings for the AlexNet run.
net = AlexNet
batch_size = 128
# Images are resized to 224 by the loader for this run.
train_iter, test_iter = load_data_mnist(batch_size=batch_size, resize=224)
lr, num_epochs = 0.01, 10
# SGD configured with the learning rate only, and cross-entropy loss.
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
训练最终结果:[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RLA2zfeb-1682870518347)(E:/typora-image/image-20230430220326417.png)]
训练过程可视化如下:
4.3 训练ResNet
训练的一些参数如下:
# Settings for the ResNet-18 run.
net = ResNet18
batch_size = 256
# Images are resized to 96 by the loader for this run.
train_iter, test_iter = load_data_mnist(batch_size=batch_size, resize=96)
lr, num_epochs = 0.05, 10
# SGD configured with the learning rate only, and cross-entropy loss.
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
训练最终结果:[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-psJMwZGW-1682870518348)(E:/typora-image/image-20230430222303506.png)]
训练过程可视化如下:
5 小结
从第4节的结果可以看出,不同的网络结构会影响模型的训练效果;其次,在本实验中,模型越复杂,其收敛速度越快,精度越高。最后:PyTorch很好用。
附录
完整代码
model.py
import torch
from torch import nn
from torch.nn import functional as F
# LeNet-5: two convolution/average-pool stages followed by three linear layers.
# The 16 * 5 * 5 flatten size corresponds to a 1x28x28 input.
LeNet5 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)
# LeNet variant with 3x3 convolution kernels.
# The 576-feature flatten size corresponds to a 1x28x28 input.
LeNet5_01 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=576, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)
# LeNet variant whose convolution layers output half as many channels
# (6 // 2 and 16 // 2).
LeNet5_02 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6 // 2, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6 // 2, out_channels=16 // 2, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=16 // 2 * 5 * 5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)
# LeNet variant that uses ReLU in place of every sigmoid activation.
LeNet5_03 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.ReLU(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=84),
    nn.ReLU(),
    nn.Linear(in_features=84, out_features=10),
)
# Reduced LeNet variant: a single convolution stage and a single linear layer.
# The 3136-feature flatten size corresponds to a 1x28x28 input.
LeNet5_04 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(in_features=3136, out_features=10),
)
# AlexNet adapted to single-channel input and 10 output classes.
# The 6400-feature flatten size corresponds to a 1x224x224 input.
AlexNet = nn.Sequential(
    # Feature extractor.
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # Classifier with dropout after each hidden linear layer.
    nn.Linear(in_features=6400, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=10),
)
class Residual(nn.Module):
    """Residual block: two 3x3 convolutions with batch norm plus a shortcut.

    Args:
        input_channels: Channel count of the incoming tensor.
        num_channels: Channel count produced by the block.
        use_1x1conv: When true, the shortcut goes through a 1x1 convolution
            (with the same stride as the first convolution) so its shape
            matches the main branch.
        strides: Stride of the first 3x3 convolution and of the 1x1 shortcut.
    """

    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ``relu(main_branch(X) + shortcut(X))``."""
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Test for None explicitly rather than relying on module truthiness.
        if self.conv3 is not None:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)
# Network stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3
# stride-2 max pool.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Return a list of ``num_residuals`` ``Residual`` units forming one stage.

    Unless ``first_block`` is true, the first unit maps ``input_channels`` to
    ``num_channels`` with a stride of 2 and a 1x1 shortcut convolution; every
    other unit keeps ``num_channels`` channels with the default stride.
    """
    units = []
    for index in range(num_residuals):
        downsample = index == 0 and not first_block
        if downsample:
            unit = Residual(input_channels, num_channels, use_1x1conv=True, strides=2)
        else:
            unit = Residual(num_channels, num_channels)
        units.append(unit)
    return units
# Four residual stages of two units each; only the first stage keeps the
# channel count and skips the stride-2 unit.
b2 = nn.Sequential(*resnet_block(input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*resnet_block(input_channels=256, num_channels=512, num_residuals=2))
# Stem + stages + global average pooling + 10-way linear classifier.
ResNet18 = nn.Sequential(
    b1,
    b2,
    b3,
    b4,
    b5,
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10),
)
main.py
import datetime
import matplotlib.pylab as plt
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
from model import LeNet5, AlexNet, ResNet18, LeNet5_01, LeNet5_02, LeNet5_03, LeNet5_04
def load_data_mnist(batch_size, resize=None, num_workers=4, root="../data"):
    """Build the MNIST training and test data loaders.

    Args:
        batch_size: Number of samples per mini-batch.
        resize: Optional size handed to ``transforms.Resize``; a falsy value
            skips resizing.
        num_workers: Number of worker processes used by each ``DataLoader``.
        root: Directory passed to ``torchvision.datasets.MNIST`` as the
            dataset location (``download=True`` is always set).

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    steps = [transforms.ToTensor()]
    if resize:
        # The resize step is placed ahead of the tensor conversion.
        steps.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(steps)
    mnist_train = torchvision.datasets.MNIST(root=root, train=True, transform=trans, download=True)
    mnist_test = torchvision.datasets.MNIST(root=root, train=False, transform=trans, download=True)
    train_loader = data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers)
    test_loader = data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers)
    return train_loader, test_loader
def evaluate_accuracy(net, data_iter, device=None):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        net: Model to evaluate; it is put into evaluation mode.
        data_iter: Iterable of ``(X, y)`` mini-batches.
        device: Device the batches are moved to. When ``None``, the device of
            the model's first parameter is used.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    net.eval()
    # Compare against None explicitly so a falsy but valid device
    # specification (for example the index 0) is not silently replaced.
    if device is None:
        device = next(iter(net.parameters())).device
    correct, seen = 0.0, 0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            predicted = torch.argmax(net(X), dim=1)
            correct += torch.sum(predicted == y).item()
            seen += len(y)
    return correct / seen
def train_one_epoch(net, train_iter, device, optimizer, loss):
    """Run one optimisation pass over every batch of ``train_iter``.

    Args:
        net: Model to train; it is switched to training mode once up front.
        train_iter: Iterable of ``(X, y)`` mini-batches that supports ``len``.
        device: Device each batch is moved to before the forward pass.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.

    Returns:
        ``(mean_batch_loss, accuracy)``: the per-batch loss averaged over the
        number of batches, and the fraction of samples whose arg-max
        prediction equals the label.

    Raises:
        ZeroDivisionError: If ``train_iter`` contains no batches.
    """
    # The mode does not change inside the loop, so set it once instead of
    # once per batch.
    net.train()
    total_loss = 0.0
    correct = 0.0
    seen = 0
    for X, y in train_iter:
        optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        y_hat = net(X)
        batch_loss = loss(y_hat, y)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
        correct += (torch.argmax(y_hat, dim=1) == y).sum().item()
        seen += len(y)
    return total_loss / len(train_iter), correct / seen
def train(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD, log per-epoch metrics and plot the curves.

    Args:
        net: Model to train; its linear and convolutional weights are
            re-initialised with Xavier-uniform values before training.
        train_iter: Training mini-batch iterable.
        test_iter: Evaluation mini-batch iterable.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for ``torch.optim.SGD``.
        device: Device the model and batches are moved to.

    Side effects:
        Appends one line per epoch to a timestamped ``results*.txt`` file in
        the working directory, prints the same line, and shows a matplotlib
        figure with training loss, training accuracy and test accuracy.
    """
    def init_weights(m):
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    # File that stores the training and evaluation metrics of this run.
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    trl, tra, tea = [], [], []
    for epoch in range(num_epochs):
        train_l, train_acc = train_one_epoch(net, train_iter, device, optimizer, loss)
        test_acc = evaluate_accuracy(net, test_iter, device)
        trl.append(train_l)
        tra.append(train_acc)
        tea.append(test_acc)
        summary = f'epoch {epoch} loss {train_l:.3f}, train acc {train_acc:.3f}, test acc {test_acc:.3f} \n'
        with open(results_file, "a") as f:
            f.write(summary)
        print(summary)
    # Call xlabel; assigning to it would replace the pyplot function and
    # leave the axis unlabeled.
    plt.xlabel('epoch')
    xlim = range(0, num_epochs)
    legend = ['train loss', 'train acc', 'test acc']
    plt.plot(xlim, trl, linewidth=1, color='purple')
    plt.plot(xlim, tra, linewidth=1, color='green', linestyle='--')
    plt.plot(xlim, tea, linewidth=1, color='blue', linestyle='--')
    plt.legend(legend, ncol=4)
    plt.grid(axis='y', linewidth=0.3)
    plt.xticks(range(0, num_epochs, 2))
    plt.show()
if __name__ == '__main__':
    def try_gpu():
        """Return the first CUDA device when at least one exists, else the CPU."""
        has_cuda = torch.cuda.device_count() >= 1
        return torch.device('cuda:0') if has_cuda else torch.device('cpu')

    # Active configuration: baseline LeNet-5.
    net = LeNet5
    batch_size = 256
    lr = 0.6
    num_epochs = 20
    train_iter, test_iter = load_data_mnist(batch_size=batch_size)
    train(net, train_iter, test_iter, num_epochs, lr, try_gpu())

    # Alternative configuration: AlexNet with inputs resized to 224.
    # net = AlexNet
    # batch_size = 128
    # train_iter, test_iter = load_data_mnist(batch_size=batch_size, resize=224)
    # lr, num_epochs = 0.01, 10
    # train(net, train_iter, test_iter, num_epochs, lr, try_gpu())

    # Alternative configuration: ResNet-18 with inputs resized to 96.
    # net = ResNet18
    # batch_size = 256
    # train_iter, test_iter = load_data_mnist(batch_size=batch_size, resize=96)
    # lr, num_epochs = 0.05, 10
    # train(net, train_iter, test_iter, num_epochs, lr, try_gpu())
参考文献
[1] 胡晓武等.智能之门:神经网络与深度学习入门:基于Python[M].北京:高等教育出版社,2020.12.