PyTorch基于卷积神经网络的手写数字识别
卷积神经网络是深度学习中处理图像相关问题时常用的一种神经网络模型。本文演示了搭建一个4层卷积网络来识别MNIST数据集中的手写数字。经过2000步梯度下降,神经网络的识别错误率能下降到3%左右。
本文的Jupyter Notebook代码已经放到了我的gitee项目“deeplbox”上,大家可以克隆下来调试。“deeplbox”收集了一些深度学习领域的小例程,欢迎大家也把自己在学习过程中写的一些神经网络模型放到上面。
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
USEGPU = True  # set to False to force CPU training
# Fall back to CPU when CUDA is requested but not available, instead of
# crashing later when tensors are moved with .to(device).
device = "cuda" if USEGPU and torch.cuda.is_available() else "cpu"
从MNIST文件中加载数据
我们先把MNIST数据集从官网上下载下来,一共有四个文件:
train-images-idx3-ubyte.gz: 训练集图片 (9912422 bytes)
train-labels-idx1-ubyte.gz: 训练集标签 (28881 bytes)
t10k-images-idx3-ubyte.gz: 测试集图片 (1648877 bytes)
t10k-labels-idx1-ubyte.gz: 测试集标签 (4542 bytes)
我们按照官网上的说明把这4个文件解压,并且重命名成好认识的形式
def read_mnist_data(lab_path, img_path):
    """Load one unzipped MNIST label/image file pair.

    Parameters
    ----------
    lab_path : str -- path to the idx1-ubyte label file
    img_path : str -- path to the idx3-ubyte image file

    Returns
    -------
    (labels, images) : labels is a uint8 array of shape (N,),
    images is a uint8 array of shape (N, 784), one flattened
    28x28 picture per row.
    """
    # `with` guarantees the files are closed even if np.fromfile raises.
    with open(lab_path, 'rb') as lab_file:
        # label payload starts at byte 8 (after magic number + item count)
        labels = np.fromfile(lab_file, offset=8, dtype=np.uint8)
    with open(img_path, 'rb') as img_file:
        # image payload starts at byte 16 (magic, count, rows, cols)
        images = np.fromfile(img_file, offset=16, dtype=np.uint8).reshape((len(labels), 784))
    return labels, images
train_labs, train_imgs = read_mnist_data("./train_labs", "./train_imgs") # load the training set
test_labs, test_imgs = read_mnist_data("./test_labs", "./test_imgs") # load the test set
print("train_img num: ", len(train_labs)) # number of images in the training set
print("test_img num: ", len(test_labs))
# show one image to verify the data loaded correctly
plt.imshow(train_imgs[0:1].reshape(28, 28), cmap='gray') # cmap='gray': render as a grayscale image
print("lab=",train_labs[0])
train_img num: 60000
test_img num: 10000
lab= 5
格式化数据
# The first conv layer expects NCHW input (N, 1, 28, 28), not flat rows of 784.
# Use -1 so the batch dimension is inferred from the data instead of
# hard-coding 60000/10000 — this keeps working if the dataset size changes.
train_imgs = train_imgs.reshape(-1, 1, 28, 28)
test_imgs = test_imgs.reshape(-1, 1, 28, 28)
# plt.imshow(train_imgs[0].reshape(28,28)) # sanity-check the reshape
# Convert the numpy arrays to torch tensors: float inputs for the network,
# long (int64) labels as required by CrossEntropyLoss.
X = torch.from_numpy(train_imgs).type(torch.FloatTensor)
Y = torch.from_numpy(train_labs).type(torch.LongTensor)
X_test = torch.from_numpy(test_imgs).type(torch.FloatTensor)
Y_test = torch.from_numpy(test_labs).type(torch.LongTensor)
# Move all four tensors to the target device (GPU if selected).
X = X.to(device)
Y = Y.to(device)
X_test = X_test.to(device)
Y_test = Y_test.to(device)
print("X.device:", X.device)
print("Y.device:", Y.device)
X.device: cuda:0
Y.device: cuda:0
构建神经网络
我们要构建的神经网络结构如下,这里要注意的是卷积运算的输入大小和输出大小之间的关系:
$W_{out} = (W_{in} - K + 2P)/S + 1$
这里:
$W_{out}$是输出图片的大小,$W_{in}$是输入图片的大小(假设图片是正方形),$K$是卷积核的大小,$P$是对输入图片补0的数量,$S$是卷积核的滑动步长
class neural_net(torch.nn.Module):
    """4-layer CNN classifying 28x28 single-channel MNIST digits into 10 classes."""

    def __init__(self):
        super(neural_net, self).__init__()
        # Convolutional feature extractor. Spatial sizes follow
        # W_out = (W_in - K + 2P)/S + 1:
        # 28 -> 26 -> 24 -> pool 12 -> 10 -> 8 -> pool 4
        self.conv = torch.nn.Sequential(
            # in_channels=1 (grayscale), 16 filters, 3x3 kernel,
            # stride 1 and zero padding (both are the defaults)
            torch.nn.Conv2d(1, 16, (3, 3), stride=1, padding=0),  # -> 16 x 26 x 26
            torch.nn.ReLU(),
            torch.nn.Conv2d(16, 32, 3),                           # -> 32 x 24 x 24
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2),                    # -> 32 x 12 x 12
            torch.nn.Conv2d(32, 64, 3),                           # -> 64 x 10 x 10
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 128, 3),                          # -> 128 x 8 x 8
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2),                    # -> 128 x 4 x 4
        )
        # Fully connected classifier head: 2048 features -> 20 -> 10 logits.
        self.lo = torch.nn.Sequential(
            torch.nn.Linear(128 * 4 * 4, 20),
            torch.nn.ReLU(),
            torch.nn.Linear(20, 10),
        )

    def forward(self, input):
        """Map a (N, 1, 28, 28) batch to (N, 10) unnormalized class scores."""
        features = self.conv(input)
        # flatten to (N, 2048) so the linear head can consume it
        flat = features.view(-1, 128 * 4 * 4)
        return self.lo(flat)
net = neural_net().to(device)
print(net)
neural_net(
(conv): Sequential(
(0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
(1): ReLU()
(2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
(3): ReLU()
(4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
(6): ReLU()
(7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
(8): ReLU()
(9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(lo): Sequential(
(0): Linear(in_features=2048, out_features=20, bias=True)
(1): ReLU()
(2): Linear(in_features=20, out_features=10, bias=True)
)
)
开始训练
def get_max(a):
    """Return the index of the largest element of the 1-D tensor/array *a*.

    argmax returns the first occurrence on ties — the same result the
    original linear scan produced — and avoids shadowing the builtin
    `max`, the dead `max_i` variable, and the implicit-None return path.
    """
    return int(a.argmax())
optimizer = torch.optim.SGD(net.parameters(), lr = 0.001)
loss_func = torch.nn.CrossEntropyLoss()
import time
import random
t0 = time.time_ns()
loss = 0
BATCHSIZE = 100  # mini-batch size: 100 images per gradient step
EPOCH = 500      # logging interval in steps (name kept for compatibility)
for i in range(5000):
    # draw a random contiguous mini-batch from the training set;
    # len(X) instead of a hard-coded 60000 keeps this robust to dataset size
    offset = random.randint(0, len(X) - BATCHSIZE)
    X_batch = X[offset:offset + BATCHSIZE]
    Y_batch = Y[offset:offset + BATCHSIZE]
    Y_hat = net(X_batch)
    loss = loss_func(Y_hat, Y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # periodically report train/test loss and the test-set error rate
    if i % EPOCH == 0:
        offset = random.randint(0, len(X_test) - BATCHSIZE)
        X_test_b = X_test[offset:offset + BATCHSIZE]
        Y_test_b = Y_test[offset:offset + BATCHSIZE]
        # evaluation needs no autograd graph; no_grad saves memory and time
        with torch.no_grad():
            # call the module itself, not .forward(), so hooks/overrides run
            Y_test_hat = net(X_test_b)
            test_loss = loss_func(Y_test_hat, Y_test_b)
            # vectorized error count: predicted class is the argmax of each
            # row of logits; compare on-device against the label batch
            error = (Y_test_hat.argmax(dim=1) != Y_test_b).sum().item()
        print("step=", i, "train_loss=", loss.item(), "test_loss=", test_loss.item(), "error_rate=", error/BATCHSIZE)
t1 = time.time_ns()
print("in the end loss=", loss.item())
print("running time=%ds" %((t1-t0)/1000000000))
step= 0 train_loss= 2.4394755363464355 test_loss= 2.420698881149292 error_rate= 0.89
step= 500 train_loss= 0.10490211844444275 test_loss= 0.4802166223526001 error_rate= 0.14
step= 1000 train_loss= 0.1551591157913208 test_loss= 0.15561605989933014 error_rate= 0.05
step= 1500 train_loss= 0.06410376727581024 test_loss= 0.07621936500072479 error_rate= 0.03
step= 2000 train_loss= 0.18166059255599976 test_loss= 0.06346110999584198 error_rate= 0.03
step= 2500 train_loss= 0.05365173891186714 test_loss= 0.17552213370800018 error_rate= 0.05
step= 3000 train_loss= 0.06641805917024612 test_loss= 0.055969901382923126 error_rate= 0.03
step= 3500 train_loss= 0.025603635236620903 test_loss= 0.040669035166502 error_rate= 0.01
step= 4000 train_loss= 0.012363594025373459 test_loss= 0.04872165620326996 error_rate= 0.01
step= 4500 train_loss= 0.03253624960780144 test_loss= 0.0683927983045578 error_rate= 0.04
in the end loss= 0.02621467038989067
running time=334s