动手学pytorch（2）实现手写数字识别

最新推荐文章于 2025-03-26 18:09:48 发布

好好607

最新推荐文章于 2025-03-26 18:09:48 发布

阅读量259

点赞数

分类专栏： pytorch 文章标签： pytorch python 深度学习

本文链接：https://blog.csdn.net/JavaLihua/article/details/129883359

版权

pytorch 专栏收录该内容

4 篇文章

订阅专栏

本文介绍了使用PyTorch实现手写数字识别的过程，包括数据预处理、构建神经网络模型、训练模型以及模型的预测。基于MNIST数据集，利用全连接（BP）神经网络对手写数字图像进行分类；在预测阶段，对输入图片进行灰度化、高斯滤波和边缘检测等预处理，以便裁剪出数字区域并缩放为28×28的输入。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

前言

提示：预测部分没有做好，需要继续改进
参考链接：训练模型实现手写数字识别，https://blog.csdn.net/zhaohongfei_358/article/details/122800647
使用模型预测结果，重点包括预处理的方法，https://blog.csdn.net/Qyun_lucky_star/article/details/119606015

提示：工具jupyter notebook

代码

训练部分：

#导入相关包
import os
import numpy as np
import torch 
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets,transforms
from torch import nn,optim

# Preprocessing applied to every MNIST image before it reaches the network.
transform = transforms.Compose([
    transforms.ToTensor(),                 # convert to a tensor with values scaled to [0, 1]
    transforms.Normalize((0.5,), (0.5,)),  # (x - mean) / std with mean=std=0.5 -> range [-1, 1]
])

# Training split. With download=False the files must already exist under the
# root directory; set download=True on the first run to fetch them.
train_set = datasets.MNIST(
    './dataset/MNIST',    # dataset root directory
    download=False,       # do not fetch the files
    train=True,           # training split
    transform=transform,  # per-image preprocessing
)
train_set  # notebook cell output: dataset summary

# Test split, loaded from the same root with the same preprocessing.
test_set = datasets.MNIST(
    './dataset/MNIST',    # dataset root directory
    download=False,       # do not fetch the files
    train=False,          # test split
    transform=transform,  # per-image preprocessing
)
test_set  # notebook cell output: dataset summary

# Number of images per mini-batch.
batch_size = 64

# Wrap the datasets in loaders; only the training data is shuffled.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

# Pull a single batch for inspection. The built-in next() is used because
# DataLoader iterators no longer expose a .next() method in current PyTorch.
dataiter = iter(train_loader)
images, labels = next(dataiter)

# Notebook cell output: expected layout is (batch_size, channel=1, h, w).
images.shape, labels.shape, len(labels)

# Show the first image of the batch; squeeze() drops the size-1 channel axis
# and 'gray_r' renders it with a reversed grayscale colour map.
plt.imshow(images[0].numpy().squeeze(), cmap='gray_r')
# Fully connected (BP) network used for MNIST digit classification.
class NerualNetwork(nn.Module):
    """Multi-layer perceptron: 784 -> 128 -> 64 -> 10 with a log-softmax output.

    The network is declared twice: as individual layer attributes, which
    ``forward`` uses, and as an equivalent ``nn.Sequential`` stored in
    ``self.model``, which ``forward`` does not call.
    """

    def __init__(self):
        super().__init__()

        # Individual layers, listed in the order the data flows through them.
        self.linear1 = nn.Linear(28 * 28, 128)  # flattened 28x28 image -> 128
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 64)       # 128 -> 64
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(64, 10)        # 64 -> one output per digit
        self.softmax = nn.LogSoftmax(dim=1)     # log-probabilities over dim 1

        # The same architecture written as a single Sequential container.
        stacked_layers = [
            nn.Linear(28 * 28, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
            nn.LogSoftmax(dim=1),
        ]
        self.model = nn.Sequential(*stacked_layers)

    def forward(self, x):
        """Return log-probabilities for ``x``.

        ``x`` is reshaped to ``(x.shape[0], -1)`` first, so a batch shaped
        (64, 1, 28, 28) becomes (64, 784) before the first linear layer.
        """
        hidden = x.view(x.shape[0], -1)

        # Apply the individually declared layers one after another; this is
        # the chain that ``self.model(hidden)`` would also describe.
        pipeline = (
            self.linear1,
            self.relu1,
            self.linear2,
            self.relu2,
            self.linear3,
            self.softmax,
        )
        for layer in pipeline:
            hidden = layer(hidden)

        return hidden

# Create the network that will be trained.
model = NerualNetwork()

# Negative log-likelihood loss, applied to the log-probabilities the network outputs.
criterion = nn.NLLLoss()

# Stochastic gradient descent over all model parameters, with learning rate
# 0.003 and a momentum term of 0.9.
optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)

# Reference point for the total training time reported at the end.
start_time = time()

# Number of full passes over the training data.
epochs = 15

for epoch_index in range(epochs):
    # Sum of the per-batch losses seen in this epoch.
    epoch_loss = 0
    for batch_images, batch_labels in train_loader:
        # Forward pass followed by the loss for this batch.
        log_probs = model(batch_images)
        batch_loss = criterion(log_probs, batch_labels)

        # Back-propagate, apply the update, then clear the gradients so they
        # do not accumulate into the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += batch_loss.item()

    # Report the mean batch loss once the epoch is complete.
    mean_loss = epoch_loss / len(train_loader)
    print("epoch{}-Trainning loss:{}".format(epoch_index, mean_loss))

# Report the total elapsed time in minutes.
elapsed_minutes = (time() - start_time) / 60
print("\nTraining time:{}.minutes".format(elapsed_minutes))

# Evaluate the trained model on the test set.
correct_count, all_count = 0, 0
model.eval()  # switch the model to evaluation mode

# Gradients are not needed for evaluation, so autograd is disabled. Each batch
# is scored in a single forward pass instead of one pass per image.
with torch.no_grad():
    for images, labels in test_loader:
        logps = model(images)              # one row of 10 log-probabilities per image
        pred_labels = logps.argmax(dim=1)  # index of the largest log-probability = predicted digit
        correct_count += (pred_labels == labels).sum().item()
        all_count += labels.size(0)
print("测试次数：", all_count)
print("\n模型准确度：", correct_count / all_count)

# To persist the trained network, save the instance's weights rather than the
# class object itself, e.g.:
# torch.save(model.state_dict(), './pth/NerualNetwork_mnist.pkl')

预测部分：

#对预测图片预处理
import cv2
import numpy as np

def _find_extent(profile):
    """Return (low, high): the first zero entries on either side of the peak of *profile*.

    ``low`` is -1 and ``high`` is ``len(profile)`` when the non-zero run
    reaches the corresponding end of the profile.
    """
    peak = int(np.argmax(profile))

    low = peak
    while low >= 0 and profile[low] != 0:
        low -= 1

    high = peak
    while high < len(profile) and profile[high] != 0:
        high += 1

    return low, high


def image_preprocessing(image_path=r'D:\6.jpeg', output_size=28,
                        background_threshold=100, margin=5):
    """Locate the digit in a photo and return it as a small square grayscale array.

    Steps: grayscale -> Gaussian blur -> Canny edges, then the run of rows and
    columns around the strongest edge response is taken as the digit's
    bounding box, padded to a square, cropped, resized and thresholded.

    Args:
        image_path: Path of the image to read. Defaults to the path that was
            previously hard-coded.
        output_size: Side length in pixels of the returned square image.
        background_threshold: Gray values above this are treated as
            background and set to 0; other values ``v`` become ``255 - v``.
        margin: Extra pixels kept around the detected bounding box.

    Returns:
        A ``(output_size, output_size)`` uint8 array with a dark background
        and a bright digit.

    Raises:
        FileNotFoundError: If the image cannot be read.
        ValueError: If edge detection finds nothing in the image.
    """
    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(image_path))

    # ----- Edge map of the full image -----
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gauss_img = cv2.GaussianBlur(gray_img, (5, 5), 0, 0, cv2.BORDER_DEFAULT)
    img_edge1 = cv2.Canny(gauss_img, 100, 200)

    # ----- Bounding box of the digit -----
    # Edge intensity summed per row and per column; dtype=int avoids uint8 overflow.
    row_profile = img_edge1.sum(axis=1, dtype=int)
    col_profile = img_edge1.sum(axis=0, dtype=int)
    if not row_profile.any():
        raise ValueError("No edges detected in image: {}".format(image_path))

    top, bottom = _find_extent(row_profile)
    left, right = _find_extent(col_profile)

    # Difference between box width and height; the shorter side is extended
    # by this amount so the crop is square before resizing.
    poor = (right - left) - (bottom - top)
    if poor > 0:
        row_start = top - poor // 2 - margin
        row_stop = bottom + poor - poor // 2 + margin
        col_start = left - margin
        col_stop = right + margin
    else:
        row_start = top - margin
        row_stop = bottom + margin
        col_start = left + poor // 2 - margin
        col_stop = right - poor + poor // 2 + margin

    # A negative start would wrap around to the other end of the array and
    # produce an empty crop, so clamp it; stops past the edge are already
    # truncated by slicing.
    tailor_image = img[max(row_start, 0):row_stop, max(col_start, 0):col_stop]

    # ----- Post-processing of the cropped digit -----
    gray_crop = cv2.cvtColor(tailor_image, cv2.COLOR_BGR2GRAY)
    blurred_crop = cv2.GaussianBlur(gray_crop, (5, 5), 0, 0, cv2.BORDER_DEFAULT)
    zoom_image = cv2.resize(blurred_crop, (output_size, output_size))

    # Bright pixels are background -> 0; darker pixels are inverted so the
    # digit ends up bright on a dark background.
    zoom_image = np.where(
        zoom_image > background_threshold, 0, 255 - zoom_image
    ).astype(np.uint8)

    return zoom_image

如果想查看其中图片的效果，可以使用如下代码：

import cv2
# Read the image; cv2.imread returns None when the file cannot be read.
img = cv2.imread(r'D:\6.jpeg')
if img is None:
    raise FileNotFoundError(r'Could not read image: D:\6.jpeg')
# Display the image in a window titled 'img'.
cv2.imshow('img', img)
# Wait for a key press; the original note suggests the window may otherwise hang.
cv2.waitKey(0)
# Close the window once a key has been pressed.
cv2.destroyAllWindows()

##对图片进行预测
import cv2
import torch

# Preprocess the photo into a 28x28 numpy array.
img = image_preprocessing()

# Convert to a float tensor.
inputs = torch.from_numpy(img).float()

# Apply the same scaling the training data went through: ToTensor brings
# pixel values to [0, 1] and Normalize((0.5,), (0.5,)) then maps them to
# [-1, 1]. Feeding raw 0-255 values puts the input far outside the range the
# network was trained on.
inputs = (inputs / 255.0 - 0.5) / 0.5

# Add a leading dimension: (28, 28) -> (1, 28, 28).
inputs = inputs.unsqueeze(dim=0)

# Run the network in evaluation mode without tracking gradients.
model.eval()
with torch.no_grad():
    predict = model(inputs)

# Report the index with the highest score as the predicted digit.
print("最有可能是：{}".format(torch.argmax(predict).item()))