Pytorch基于神经网络实现手写数字识别

本文基于PyTorch框架实现了手写数字识别任务,我使用Resnet18作为特征提取骨干,通过opencv框架实现了对手写数字的识别和分割,再将图像转为MNIST数据集格式,输入到神经网络中进行识别分类,最后输出内容。

下面我将详细讲解步骤。

一、对手写数字的分割框选。

首先对图像做反相二值化处理:先将灰度图反相(黑白颠倒),再对笔画边缘进行膨胀操作,最后通过阈值处理得到二值化图像。


# 反相灰度图,将黑白阈值颠倒
# Invert a grayscale image (255 - pixel), flipping black and white.
def accessPiexl(img):
    """Invert every pixel of a grayscale image in place.

    Replaces the original per-pixel Python double loop, which is very slow
    for interpreted code, with a single vectorized NumPy operation.

    Args:
        img: 2-D uint8 grayscale image; modified in place.

    Returns:
        The same array object, inverted.
    """
    # out=img keeps the original in-place mutation semantics.
    np.subtract(255, img, out=img)
    return img

# 反相二值化图像
# Invert the grayscale image, dilate strokes, then threshold-to-zero.
def accessBinary(img, threshold=128):
    """Invert `img`, thicken strokes with a 3x3 dilation, and zero out
    pixels at or below `threshold` (THRESH_TOZERO keeps brighter values
    unchanged rather than producing a strict 0/255 binary image)."""
    inverted = accessPiexl(img)
    dilated = cv2.dilate(inverted, np.ones((3, 3), np.uint8), iterations=1)
    _, result = cv2.threshold(dilated, threshold, 0, cv2.THRESH_TOZERO)
    return result

然后要找到图像中的边缘,并返回每个边框的左上角和右下角坐标,使用到opencv的findContours函数提取图像中的轮廓。

# # 寻找边缘,返回边框的左上角和右下角(利用cv2.findContours)
def findBorderContours(path, maxArea=50):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    borders = []
    for contour in contours:
        # 将边缘拟合成一个边框
        x, y, w, h = cv2.boundingRect(contour)
        if w * h > maxArea:
            border = [(x, y), (x + w, y + h)]
            borders.append(border)
    return borders

然后是显示结果和框选图像的数字。

# 显示结果及边框
# Draw detected boxes (and optional recognition results) on the image.
def showResults(path, borders, results=None):
    """Display the image at `path` with red bounding boxes; when `results`
    is given, draw the predicted label in green at each box's corner."""
    img = cv2.imread(path)
    print(img.shape)
    for idx, (top_left, bottom_right) in enumerate(borders):
        cv2.rectangle(img, top_left, bottom_right, (0, 0, 255))
        if results:
            cv2.putText(img, str(results[idx]), top_left,
                        cv2.FONT_HERSHEY_COMPLEX, 0.8, (0, 255, 0), 1)
    cv2.imshow('test', img)
    cv2.waitKey(0)
# 分割数字图像
# Crop each bordered region out of the source image.
def cropImages(path, borders):
    """Return a list of sub-images, one per [(x1, y1), (x2, y2)] border."""
    img = cv2.imread(path)
    return [img[top[1]:bottom[1], top[0]:bottom[0]] for top, bottom in borders]
#

具体效果是这样的:输入:

path = 'test2.jpg'
borders = findBorderContours(path)
showResults(path, borders)

输出:

二、训练神经网络

训练识别数字可以不需要准备自己的数据集,我使用了MNIST数据集训练我的Resnet18网络,步骤如下:

import torch
from torch import nn as nn
from torchvision.models import resnet18
import torch.utils.data
import torch.utils.data.distributed
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
lr = 1e-4  # learning rate
batch_size = 16  # mini-batch size
epochs = 30  # number of training epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classes = 10  # number of output classes (digits 0-9)
# Preprocessing: resize 28x28 MNIST images to 224x224 for ResNet-18,
# convert to tensor, normalize the single channel to roughly [-1, 1].
transform = transforms.Compose(
    [transforms.Resize((224,224)),
     transforms.ToTensor(),
     transforms.Normalize((0.5,), (0.5,))]
)
transform_test = transforms.Compose(
    [transforms.Resize((224,224)),
     transforms.ToTensor(),
     transforms.Normalize((0.5,), (0.5,))]
)
# Download the MNIST training and test sets (cached under ./data).
datasets_train = datasets.MNIST(root='./data',train=True,download=True,transform=transform)
datasets_test  = datasets.MNIST(root='./data',train=False,download=True,transform=transform_test)

# Wrap the datasets in DataLoaders (shuffle only the training set).
train_loader = torch.utils.data.DataLoader(datasets_train,batch_size= batch_size,shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets_test,batch_size=batch_size,shuffle=False)

# Instantiate the loss and the model.
criterion = nn.CrossEntropyLoss()
model = resnet18()

# Replace the stem conv (3 -> 1 input channels; MNIST is grayscale) and the
# final fully-connected layer (1000 -> 10 classes).
model.conv1 =nn.Conv2d(1,64,kernel_size=(7,7),stride=(2,2),padding=(3,3),bias=False)
model.fc = nn.Linear(in_features=512,out_features=classes,bias=True)
# print(model)
model.to(device)
# Adam optimizer with the configured learning rate.
optimizer = optim.Adam(model.parameters(),lr=lr)

def adjust_learning_rate(optimizer, epoch, base_lr=None):
    """Apply step decay: lr = base_lr * 0.1 ** (epoch // 50).

    Args:
        optimizer: optimizer whose param groups are updated in place.
        epoch: current epoch index.
        base_lr: starting learning rate; defaults to the module-level `lr`
            (added parameter keeps the original call signature working).
    """
    if base_lr is None:
        base_lr = lr  # fall back to the module-level constant
    modellrnew = base_lr * (0.1 ** (epoch // 50))
    print("lr:", modellrnew)
    for param_group in optimizer.param_groups:  # was mis-named program_group
        param_group['lr'] = modellrnew

def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and print running progress.

    Relies on the module-level `criterion` (CrossEntropyLoss).

    Args:
        model: network to train (set to train mode here).
        device: target device for data and model.
        train_loader: DataLoader yielding (data, target) batches.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only for logging.
    """
    model.train()
    sum_loss = 0
    total_num = len(train_loader.dataset)
    print(total_num, len(train_loader))
    for batch_idx, (data, target) in enumerate(train_loader):
        # torch.autograd.Variable is a deprecated no-op since PyTorch 0.4;
        # move the tensors to the device directly.
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print_loss = loss.item()  # .data.item() is deprecated; .item() suffices
        sum_loss += print_loss
        if (batch_idx + 1) % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
                       100. * (batch_idx + 1) / len(train_loader), loss.item()))
    ave_loss = sum_loss / len(train_loader)
    print('epoch:{},loss:{}'.format(epoch, ave_loss))

# 验证过程
# Validation pass over the test set.
def val(model, device, test_loader):
    """Evaluate `model` on `test_loader`; print average loss and accuracy.

    Relies on the module-level `criterion` (CrossEntropyLoss).
    """
    model.eval()
    test_loss = 0
    correct = 0
    total_num = len(test_loader.dataset)
    print(total_num, len(test_loader))
    with torch.no_grad():
        for data, target in test_loader:
            # Variable wrapper removed: deprecated no-op since PyTorch 0.4.
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            _, pred = torch.max(output, 1)
            correct += torch.sum(pred == target)
            test_loss += loss.item()  # .data.item() is deprecated
        correct = correct.item()
        acc = correct / total_num
        avgloss = test_loss / len(test_loader)
        print('\nVal set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            avgloss, correct, len(test_loader.dataset), 100 * acc))
#
# Training loop.
# NOTE(review): range(epochs+1) runs epochs+1 iterations (31 when epochs=30);
# confirm whether the extra epoch is intended.
for epoch in range(epochs+1):
    adjust_learning_rate(optimizer,epoch)
    train(model,device,train_loader,optimizer,epoch)
    val(model,device,test_loader)
# Saves the entire pickled model (not just state_dict); the inference code
# later relies on torch.load returning the full model object.
torch.save(model,'model.pth')



 大家可以根据自己的需求更换网络。总之最后得到了一个模型权重:model.pth。

三、使用模型进行分类

可以发现MNIST数据集是灰度图,所以我们要把输入的图片转为MNIST格式,具体操作如下:

# 根据边框转换为MNIST格式
def transMNIST(path, borders, size=(224, 224)):
    imgData = np.zeros((len(borders), size[0], size[0], 1), dtype='uint8')
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    for i, border in enumerate(borders):
        borderImg = img[border[0][1]:border[1][1], border[0][0]:border[1][0]]
        # 根据最大边缘拓展像素
        extendPiexl = (max(borderImg.shape) - min(borderImg.shape)) // 2
        targetImg = cv2.copyMakeBorder(borderImg, 7, 7, extendPiexl + 7, extendPiexl + 7, cv2.BORDER_CONSTANT)
        targetImg = cv2.resize(targetImg, size)
        targetImg = np.expand_dims(targetImg, axis=-1)
        imgData[i] = targetImg
    return imgData

这样我们会得到类似于MNIST数据集的图片:

 最后输入到神经网络里预测并显示在图上:

path = 'test.jpg'
borders = findBorderContours(path)  # bounding boxes [(x1, y1), (x2, y2)] per digit
imgData = transMNIST(path, borders) # batch of crops; each sample is (224, 224, 1) uint8
# Load the trained model (saved earlier as a full pickled model).
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True,
# which cannot unpickle a full model object — may need weights_only=False; verify.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.load('model.pth', map_location=DEVICE)
model.eval()
model.to(DEVICE)
# Preprocessing: no Resize needed because transMNIST already outputs 224x224;
# normalization matches the training transform.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
result_number = []
for i in range(len(imgData)):
    # (H, W, 1) uint8 -> (1, 1, H, W) float tensor on the target device
    image = transform(imgData[i]).unsqueeze(0).to(DEVICE)
    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(image)

    # The class with the highest logit is the predicted digit.
    _, predicted = torch.max(output.data, 1)
    prediction = predicted.item()
    result_number.append(prediction)
print(result_number)
showResults(path,borders,result_number)

得到的结果:

 

最后送上全部代码:

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms

# 反相灰度图,将黑白阈值颠倒
# Invert a grayscale image (255 - pixel), flipping black and white.
def accessPiexl(img):
    """Invert every pixel of a grayscale image in place.

    Replaces the original per-pixel Python double loop, which is very slow
    for interpreted code, with a single vectorized NumPy operation.

    Args:
        img: 2-D uint8 grayscale image; modified in place.

    Returns:
        The same array object, inverted.
    """
    # out=img keeps the original in-place mutation semantics.
    np.subtract(255, img, out=img)
    return img

# 反相二值化图像
# Invert the grayscale image, dilate strokes, then threshold-to-zero.
def accessBinary(img, threshold=128):
    """Invert `img`, thicken strokes with a 3x3 dilation, and zero out
    pixels at or below `threshold` (THRESH_TOZERO keeps brighter values
    unchanged rather than producing a strict 0/255 binary image)."""
    inverted = accessPiexl(img)
    dilated = cv2.dilate(inverted, np.ones((3, 3), np.uint8), iterations=1)
    _, result = cv2.threshold(dilated, threshold, 0, cv2.THRESH_TOZERO)
    return result
# # 寻找边缘,返回边框的左上角和右下角(利用cv2.findContours)
# Find external contours and return each bounding box's corners.
def findBorderContours(path, maxArea=50):
    """Detect bounding boxes of digit-like blobs in the image at `path`.

    Args:
        path: image file path (read as grayscale).
        maxArea: despite the name, this is the MINIMUM kept box area;
            boxes with w*h <= maxArea are discarded as noise. Name kept
            for backward compatibility.

    Returns:
        List of boxes, each as [(x1, y1), (x2, y2)] (top-left, bottom-right).
    """
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); indexing [-2] yields the contours in both.
    contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
    borders = []
    for contour in contours:
        # Fit an axis-aligned bounding box around the contour.
        x, y, w, h = cv2.boundingRect(contour)
        if w * h > maxArea:  # drop tiny noise blobs
            borders.append([(x, y), (x + w, y + h)])
    return borders
# 显示结果及边框
def showResults(path, borders, results=None):
    img = cv2.imread(path)
    # 绘制
    print(img.shape)
    for i, border in enumerate(borders):
        cv2.rectangle(img, border[0], border[1], (0, 0, 255))
        if results:
            cv2.putText(img, str(results[i]), border[0], cv2.FONT_HERSHEY_COMPLEX, 0.8, (0, 255, 0), 1)
        #cv2.circle(img, border[0], 1, (0, 255, 0), 0)
    cv2.imshow('test', img)
    cv2.waitKey(0)
# 分割数字图像
def cropImages(path, borders):
    img = cv2.imread(path)
    cropped_images = []
    for border in borders:
        x1, y1 = border[0]
        x2, y2 = border[1]
        cropped_img = img[y1:y2, x1:x2]
        cropped_images.append(cropped_img)
    return cropped_images
#
# 根据边框转换为MNIST格式
# Convert each detected border region into an MNIST-style image batch.
def transMNIST(path, borders, size=(224, 224)):
    """Crop each bordered digit, pad it to a square, and resize to `size`.

    Args:
        path: image file path (read as grayscale).
        borders: list of [(x1, y1), (x2, y2)] boxes from findBorderContours.
        size: target (width, height) tuple passed to cv2.resize.

    Returns:
        uint8 array of shape (len(borders), size[1], size[0], 1).
    """
    # cv2.resize(img, size) outputs (height=size[1], width=size[0]); the
    # original allocated (size[0], size[0], 1), which only matched square sizes.
    imgData = np.zeros((len(borders), size[1], size[0], 1), dtype='uint8')
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    for i, border in enumerate(borders):
        (x1, y1), (x2, y2) = border
        borderImg = img[y1:y2, x1:x2]
        # Pad the SHORTER side so the crop becomes square (plus a 7px margin
        # all around). The original always padded left/right, which distorted
        # digits that are wider than they are tall.
        extendPiexl = (max(borderImg.shape) - min(borderImg.shape)) // 2
        h, w = borderImg.shape
        if h >= w:
            top, bottom, left, right = 7, 7, extendPiexl + 7, extendPiexl + 7
        else:
            top, bottom, left, right = extendPiexl + 7, extendPiexl + 7, 7, 7
        targetImg = cv2.copyMakeBorder(borderImg, top, bottom, left, right, cv2.BORDER_CONSTANT)
        targetImg = cv2.resize(targetImg, size)
        imgData[i] = np.expand_dims(targetImg, axis=-1)
    return imgData

path = 'test.jpg'
borders = findBorderContours(path)  # bounding boxes [(x1, y1), (x2, y2)] per digit
imgData = transMNIST(path, borders) # batch of crops; each sample is (224, 224, 1) uint8
# Load the trained model (saved earlier as a full pickled model).
# NOTE(review): on PyTorch >= 2.6 torch.load defaults to weights_only=True,
# which cannot unpickle a full model object — may need weights_only=False; verify.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.load('model.pth', map_location=DEVICE)
model.eval()
model.to(DEVICE)
# Preprocessing: no Resize needed because transMNIST already outputs 224x224;
# normalization matches the training transform.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
result_number = []
for i in range(len(imgData)):
    # (H, W, 1) uint8 -> (1, 1, H, W) float tensor on the target device
    image = transform(imgData[i]).unsqueeze(0).to(DEVICE)
    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(image)
    # The class with the highest logit is the predicted digit.
    _, predicted = torch.max(output.data, 1)
    prediction = predicted.item()
    result_number.append(prediction)
print(result_number)
showResults(path,borders,result_number)

 以上是全部内容!

  • 7
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
好的,我可以帮你讲解一下利用PyTorch平台实现数字图像识别任务的步骤。 1. 获取数字图像数据集 首先,我们需要获取一个适合数字图像识别任务的数据集。常见的数字图像数据集包括MNIST、CIFAR-10、Fashion-MNIST等。其中,MNIST数据集是最简单的数据集之一,它包含了手写数字0到9的灰度图像,每张图像的大小为28x28像素。你可以通过下载官方提供的数据集或使用PyTorch内置的数据集来获取数据。 2. 搭建深度卷积神经网络 接下来,我们需要搭建一个深度卷积神经网络来进行数字图像识别。在PyTorch中,我们可以通过继承`nn.Module`类来构建网络。具体而言,我们需要定义网络的结构和前向传播函数。一个简单的卷积神经网络结构如下: ``` class Net(nn.Module): def __init__(self): super(Net, self).__init__() # 定义卷积层和池化层 self.conv1 = nn.Conv2d(1, 32, 3, 1) self.conv2 = nn.Conv2d(32, 64, 3, 1) self.dropout1 = nn.Dropout2d(0.25) self.dropout2 = nn.Dropout2d(0.5) self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) def forward(self, x): x = F.relu(self.conv1(x)) x = F.relu(self.conv2(x)) x = F.max_pool2d(x, 2) x = self.dropout1(x) x = torch.flatten(x, 1) x = F.relu(self.fc1(x)) x = self.dropout2(x) x = self.fc2(x) output = F.log_softmax(x, dim=1) return output ``` 这个网络包含两个卷积层、两个池化层、两个dropout层和两个全连接层。其中,第一个卷积层的输入通道数为1,输出通道数为32,卷积核大小为3x3,步长为1,并使用ReLU激活函数;第二个卷积层的输入通道数为32,输出通道数为64,其余与第一个卷积层相同。接下来是两个池化层,每个池化层的大小为2x2。之后是两个dropout层,分别设置为0.25和0.5,用于防止过拟合。最后是两个全连接层,分别将输出大小降至128和10(在MNIST数据集中,一共有10个类别)。 3. 定义各种超参数、损失函数和优化算法 在训练神经网络时,我们需要为其指定各种超参数,包括学习率、训练轮数、批次大小等。同时,我们需要定义一个损失函数来衡量网络输出与真实标签之间的差异,常见的损失函数包括交叉熵损失函数、均方误差损失函数等。最后,我们还需要选择一个优化算法来更新网络参数,常见的优化算法包括随机梯度下降(SGD)、Adam等。 ``` # 定义超参数 learning_rate = 0.001 num_epochs = 5 batch_size = 64 # 定义损失函数和优化器 criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(net.parameters(), lr=learning_rate) ``` 4. 
网络训练 网络训练的过程包括多次前向传播、计算损失、反向传播和权重更新。在PyTorch中,可以通过`torch.utils.data.DataLoader`和`torch.utils.data.Dataset`来构建数据集和数据加载器。具体而言,我们需要将数据集划分为训练集和测试集,并将其转换为`Tensor`类型。接下来,我们需要循环多次训练集,并对每个小批次进行反向传播和权重更新。最后,我们可以在测试集上评估网络的性能。 ``` # 划分训练集和测试集 train_set = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor()) test_set = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor()) # 创建数据加载器 train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False) # 训练网络 for epoch in range(num_epochs): for i, (images, labels) in enumerate(train_loader): # 前向传播 outputs = net(images) loss = criterion(outputs, labels) # 反向传播和优化 optimizer.zero_grad() loss.backward() optimizer.step() # 每训练100个批次,输出一次损失和准确率 if (i+1) % 100 == 0: print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item())) # 在测试集上评估网络 with torch.no_grad(): correct = 0 total = 0 for images, labels in test_loader: outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) ``` 5. 性能测试 在训练完成后,我们可以在测试集上评估网络的性能。具体而言,我们可以计算出网络在测试集上的准确率。 ``` # 在测试集上评估网络 with torch.no_grad(): correct = 0 total = 0 for images, labels in test_loader: outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) ``` 以上就是利用PyTorch平台实现数字图像识别任务的主要步骤。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值