本文基于PyTorch框架实现了手写数字识别任务,我使用Resnet18作为特征提取骨干,通过opencv框架实现了对手写数字的识别和分割,再将图像转为MNIST数据集格式,输入到神经网络中进行识别分类,最后输出内容。
下面我将详细讲解步骤。
一、对手写数字的分割框选。
首先对图像做反相二值化处理:先将灰度图反相(黑白颠倒),再对笔画做边缘膨胀,最后通过阈值将图像二值化。
# 反相灰度图,将黑白阈值颠倒
def accessPiexl(img):
    """Invert a grayscale image in place (each pixel becomes 255 - pixel).

    MNIST digits are white-on-black, while scanned/photographed digits are
    usually black-on-white, so the input is inverted to match the training
    distribution.

    Args:
        img: numpy uint8 image array (modified in place).

    Returns:
        The same array, inverted.
    """
    # Vectorized inversion replaces the original per-pixel Python double
    # loop, which did O(h*w) interpreter-level work for the same result.
    img[:] = 255 - img
    return img
# 反相二值化图像
def accessBinary(img, threshold=128):
    """Invert the grayscale image, dilate the strokes, then zero out all
    pixels at or below *threshold* (THRESH_TOZERO keeps values above it)."""
    inverted = accessPiexl(img)
    # A 3x3 dilation slightly thickens the digit strokes; the pipeline
    # also works without this step.
    dilated = cv2.dilate(inverted, np.ones((3, 3), np.uint8), iterations=1)
    _, binary = cv2.threshold(dilated, threshold, 0, cv2.THRESH_TOZERO)
    return binary
然后要找到图像中的边缘,并返回每个边框的左上角和右下角坐标,使用到opencv的findContours函数提取图像中的轮廓。
# # 寻找边缘,返回边框的左上角和右下角(利用cv2.findContours)
def findBorderContours(path, maxArea=50):
    """Locate digit bounding boxes in the image at *path*.

    Returns a list of [(x1, y1), (x2, y2)] top-left / bottom-right corner
    pairs, one per external contour whose bounding rectangle area exceeds
    *maxArea* pixels (small specks are discarded).
    """
    binary = accessBinary(cv2.imread(path, cv2.IMREAD_GRAYSCALE))
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # Fit each contour to an axis-aligned rectangle, keep the big ones.
    rects = (cv2.boundingRect(c) for c in contours)
    return [[(x, y), (x + w, y + h)] for x, y, w, h in rects if w * h > maxArea]
然后是显示结果和框选图像的数字。
# 显示结果及边框
def showResults(path, borders, results=None):
    """Draw each bounding box (and, when *results* is given, the predicted
    digit above it) on the image at *path*, then display the image until a
    key is pressed."""
    canvas = cv2.imread(path)
    print(canvas.shape)
    for idx, (top_left, bottom_right) in enumerate(borders):
        cv2.rectangle(canvas, top_left, bottom_right, (0, 0, 255))
        if results:
            cv2.putText(canvas, str(results[idx]), top_left,
                        cv2.FONT_HERSHEY_COMPLEX, 0.8, (0, 255, 0), 1)
    cv2.imshow('test', canvas)
    cv2.waitKey(0)
# 分割数字图像
def cropImages(path, borders):
    """Cut out each bordered region from the image at *path* and return
    the crops in the same order as *borders*."""
    img = cv2.imread(path)
    # Each border is [(x1, y1), (x2, y2)]; numpy slicing is rows (y) first.
    return [img[y1:y2, x1:x2] for (x1, y1), (x2, y2) in borders]
#
具体效果是这样的:输入:
# Demo: detect digit bounding boxes in test2.jpg and display them.
path = 'test2.jpg'
borders = findBorderContours(path)
showResults(path, borders)
输出:
二、训练神经网络
训练数字识别模型不需要自己准备数据集,我直接使用公开的MNIST数据集来训练Resnet18网络,步骤如下:
import torch
from torch import nn as nn
from torchvision.models import resnet18
import torch.utils.data
import torch.utils.data.distributed
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
lr = 1e-4 # learning rate
batch_size = 16 # mini-batch size
epochs = 30 # number of training epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classes = 10 # number of output classes (digits 0-9)
# Training-set preprocessing: resize to the 224x224 input ResNet expects,
# convert to tensor, normalize the single gray channel to roughly [-1, 1].
transform = transforms.Compose(
[transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))]
)
# Test-set preprocessing (identical to the training pipeline).
transform_test = transforms.Compose(
[transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))]
)
# Download the MNIST training and test sets into ./data.
datasets_train = datasets.MNIST(root='./data',train=True,download=True,transform=transform)
datasets_test = datasets.MNIST(root='./data',train=False,download=True,transform=transform_test)
# Wrap the datasets in loaders (shuffle only during training).
train_loader = torch.utils.data.DataLoader(datasets_train,batch_size= batch_size,shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets_test,batch_size=batch_size,shuffle=False)
# Loss and model: ResNet-18 with the first conv adapted to 1-channel
# (grayscale) input and the classifier head resized to 10 classes.
criterion = nn.CrossEntropyLoss()
model = resnet18()
model.conv1 =nn.Conv2d(1,64,kernel_size=(7,7),stride=(2,2),padding=(3,3),bias=False)
model.fc = nn.Linear(in_features=512,out_features=classes,bias=True)
# print(model)
model.to(device)
# Adam optimizer with the learning rate configured above.
optimizer = optim.Adam(model.parameters(),lr=lr)
def adjust_learning_rate(optimizer, epoch, base_lr=None):
    """Apply a step-decay schedule: base_lr * 0.1 ** (epoch // 50).

    NOTE(review): with only 30 training epochs the //50 step never fires,
    so the rate stays constant; kept for compatibility with the original
    script.

    Args:
        optimizer: optimizer whose param groups are updated in place.
        epoch: current epoch index.
        base_lr: starting learning rate; defaults to the module-level
            ``lr`` so existing callers behave exactly as before.
    """
    if base_lr is None:
        base_lr = lr  # fall back to the script-level constant
    new_lr = base_lr * (0.1 ** (epoch // 50))
    print("lr:", new_lr)
    # `param_group` (was misnamed `program_group`) is the PyTorch term.
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch; print progress every 10 batches and the
    epoch's average loss at the end.

    Uses the module-level ``criterion`` (cross-entropy) as the loss.
    """
    model.train()
    sum_loss = 0
    total_num = len(train_loader.dataset)
    print(total_num, len(train_loader))
    for batch_idx, (data, target) in enumerate(train_loader):
        # torch.autograd.Variable is deprecated since PyTorch 0.4;
        # tensors track gradients directly.
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        sum_loss += loss.item()
        if (batch_idx + 1) % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item()))
    ave_loss = sum_loss / len(train_loader)
    print('epoch:{},loss:{}'.format(epoch, ave_loss))
# 验证过程
def val(model, device, test_loader):
    """Evaluate the model on the test set; print average loss and accuracy.

    Uses the module-level ``criterion`` (cross-entropy) as the loss.
    """
    model.eval()
    test_loss = 0
    correct = 0
    total_num = len(test_loader.dataset)
    print(total_num, len(test_loader))
    with torch.no_grad():
        for data, target in test_loader:
            # Deprecated Variable wrapper removed; plain tensors suffice.
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            _, pred = torch.max(output, 1)
            correct += torch.sum(pred == target)
            test_loss += loss.item()
    correct = correct.item()
    acc = correct / total_num
    avgloss = test_loss / len(test_loader)
    print('\nVal set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avgloss, correct, len(test_loader.dataset), 100 * acc))
#
#训练
# Train for exactly `epochs` epochs (1-based numbering for readable logs).
# The original `range(epochs + 1)` ran one extra epoch (31 for epochs=30).
for epoch in range(1, epochs + 1):
    adjust_learning_rate(optimizer, epoch)
    train(model, device, train_loader, optimizer, epoch)
    val(model, device, test_loader)
# Save the full model object (not just state_dict) because the inference
# script reloads it with a plain torch.load('model.pth').
torch.save(model, 'model.pth')
大家可以根据自己的需求更换网络。总之最后得到了一个模型权重:model.pth。
三、使用模型进行分类
可以发现MNIST数据集是灰度图,所以我们要把输入的图片转为MNIST格式,具体操作如下:
# 根据边框转换为MNIST格式
def transMNIST(path, borders, size=(224, 224)):
    """Convert each bordered digit region into an MNIST-style image stack.

    Each crop is padded toward square with a black border plus a 7-pixel
    margin, resized to *size*, and stacked into one uint8 array.

    Args:
        path: image file to read (processed as inverted binary).
        borders: [(x1, y1), (x2, y2)] boxes from findBorderContours.
        size: (width, height) target size, as expected by cv2.resize.

    Returns:
        uint8 array of shape (len(borders), size[1], size[0], 1).
    """
    # BUG FIX: the original allocated (size[0], size[0]), which breaks for
    # non-square sizes — cv2.resize(img, (w, h)) yields an (h, w) array.
    img_data = np.zeros((len(borders), size[1], size[0], 1), dtype='uint8')
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    for i, ((x1, y1), (x2, y2)) in enumerate(borders):
        crop = img[y1:y2, x1:x2]
        # Pad the narrow sides so the digit is roughly centered in a
        # square, with a 7-pixel margin all around (MNIST-like framing).
        extend_px = (max(crop.shape) - min(crop.shape)) // 2
        target = cv2.copyMakeBorder(crop, 7, 7, extend_px + 7, extend_px + 7,
                                    cv2.BORDER_CONSTANT)
        target = cv2.resize(target, size)
        img_data[i] = np.expand_dims(target, axis=-1)
    return img_data
这样我们会得到类似于MNIST数据集的图片:
最后输入到神经网络里预测并显示在图上:
# Inference demo: detect digits in test.jpg, convert them to MNIST-style
# inputs, classify each with the trained network, and display the result.
path = 'test.jpg'
borders = findBorderContours(path)  # top-left/bottom-right box per digit
imgData = transMNIST(path, borders)  # stacked (224, 224, 1) digit images
# Load the trained model (saved as a full model object, so torch.load
# returns the network directly). NOTE(review): torch.load unpickles
# arbitrary code — only load model.pth from a trusted source.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.load('model.pth', map_location=DEVICE)
model.eval()
model.to(DEVICE)
# Same normalization as used during training (inputs are already 224x224).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
result_number = []
for i in range(len(imgData)):
    image = transform(imgData[i]).unsqueeze(0).to(DEVICE)
    # Predict without tracking gradients.
    with torch.no_grad():
        output = model(image)
    # argmax over the class logits gives the predicted digit.
    _, predicted = torch.max(output.data, 1)
    prediction = predicted.item()
    result_number.append(prediction)
print(result_number)
showResults(path, borders, result_number)
得到的结果:
最后送上全部代码:
import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
# 反相灰度图,将黑白阈值颠倒
def accessPiexl(img):
    """Invert a grayscale image in place (each pixel becomes 255 - pixel).

    MNIST digits are white-on-black, while scanned/photographed digits are
    usually black-on-white, so the input is inverted to match the training
    distribution.

    Args:
        img: numpy uint8 image array (modified in place).

    Returns:
        The same array, inverted.
    """
    # Vectorized inversion replaces the original per-pixel Python double
    # loop, which did O(h*w) interpreter-level work for the same result.
    img[:] = 255 - img
    return img
# 反相二值化图像
def accessBinary(img, threshold=128):
    """Invert the grayscale image, dilate the strokes, then zero out all
    pixels at or below *threshold* (THRESH_TOZERO keeps values above it)."""
    inverted = accessPiexl(img)
    # A 3x3 dilation slightly thickens the digit strokes; the pipeline
    # also works without this step.
    dilated = cv2.dilate(inverted, np.ones((3, 3), np.uint8), iterations=1)
    _, binary = cv2.threshold(dilated, threshold, 0, cv2.THRESH_TOZERO)
    return binary
# # 寻找边缘,返回边框的左上角和右下角(利用cv2.findContours)
def findBorderContours(path, maxArea=50):
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
img = accessBinary(img)
contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
borders = []
for contour in contours:
# 将边缘拟合成一个边框
x, y, w, h = cv2.boundingRect(contour)
if w * h > maxArea:
border = [(x, y), (x + w, y + h)]
borders.append(border)
return borders
# 显示结果及边框
def showResults(path, borders, results=None):
img = cv2.imread(path)
# 绘制
print(img.shape)
for i, border in enumerate(borders):
cv2.rectangle(img, border[0], border[1], (0, 0, 255))
if results:
cv2.putText(img, str(results[i]), border[0], cv2.FONT_HERSHEY_COMPLEX, 0.8, (0, 255, 0), 1)
#cv2.circle(img, border[0], 1, (0, 255, 0), 0)
cv2.imshow('test', img)
cv2.waitKey(0)
# 分割数字图像
def cropImages(path, borders):
img = cv2.imread(path)
cropped_images = []
for border in borders:
x1, y1 = border[0]
x2, y2 = border[1]
cropped_img = img[y1:y2, x1:x2]
cropped_images.append(cropped_img)
return cropped_images
#
# 根据边框转换为MNIST格式
def transMNIST(path, borders, size=(224, 224)):
    """Convert each bordered digit region into an MNIST-style image stack.

    Each crop is padded toward square with a black border plus a 7-pixel
    margin, resized to *size*, and stacked into one uint8 array.

    Args:
        path: image file to read (processed as inverted binary).
        borders: [(x1, y1), (x2, y2)] boxes from findBorderContours.
        size: (width, height) target size, as expected by cv2.resize.

    Returns:
        uint8 array of shape (len(borders), size[1], size[0], 1).
    """
    # BUG FIX: the original allocated (size[0], size[0]), which breaks for
    # non-square sizes — cv2.resize(img, (w, h)) yields an (h, w) array.
    img_data = np.zeros((len(borders), size[1], size[0], 1), dtype='uint8')
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = accessBinary(img)
    for i, ((x1, y1), (x2, y2)) in enumerate(borders):
        crop = img[y1:y2, x1:x2]
        # Pad the narrow sides so the digit is roughly centered in a
        # square, with a 7-pixel margin all around (MNIST-like framing).
        extend_px = (max(crop.shape) - min(crop.shape)) // 2
        target = cv2.copyMakeBorder(crop, 7, 7, extend_px + 7, extend_px + 7,
                                    cv2.BORDER_CONSTANT)
        target = cv2.resize(target, size)
        img_data[i] = np.expand_dims(target, axis=-1)
    return img_data
# Inference demo: detect digits in test.jpg, convert them to MNIST-style
# inputs, classify each with the trained network, and display the result.
path = 'test.jpg'
borders = findBorderContours(path)  # top-left/bottom-right box per digit
imgData = transMNIST(path, borders)  # stacked (224, 224, 1) digit images
# Load the trained model (saved as a full model object, so torch.load
# returns the network directly). NOTE(review): torch.load unpickles
# arbitrary code — only load model.pth from a trusted source.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.load('model.pth', map_location=DEVICE)
model.eval()
model.to(DEVICE)
# Same normalization as used during training (inputs are already 224x224).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
result_number = []
for i in range(len(imgData)):
    image = transform(imgData[i]).unsqueeze(0).to(DEVICE)
    # Predict without tracking gradients.
    with torch.no_grad():
        output = model(image)
    # argmax over the class logits gives the predicted digit.
    _, predicted = torch.max(output.data, 1)
    prediction = predicted.item()
    result_number.append(prediction)
print(result_number)
showResults(path, borders, result_number)
以上是全部内容!