1. 实验内容
自建数据集,并应用该数据集训练神经网络,用于预测手写小写字母。
2. 实验过程
1.构建数据集
-
通过绘画软件,写出26个手写小写字母。
-
通过数据增广操作增加数据样本。
import torch
import torchvision
from torchvision import transforms
from PIL import Image
import os
# Data augmentation pipeline: grayscale each source letter image, then
# apply random color jitter, flips, and rotation to multiply the samples.
change_ways = transforms.Compose([
    transforms.Grayscale(1),
    transforms.ColorJitter(0.6, 0.4, 0.6, 0.4),
    transforms.RandomVerticalFlip(0.4),
    transforms.RandomHorizontalFlip(p=0.4),  # random horizontal flip
    transforms.RandomRotation(degrees=360),  # random rotation range
])
# Image locations
ini_img = 'C:\\Users\\X15\\Desktop\\abc'        # original hand-drawn images
save_img = 'C:\\Users\\X15\\Desktop\\save_abc'  # augmented output root
# NOTE(review): class labels are assigned from os.listdir() order, which is
# platform-dependent -- this assumes the files enumerate a..z in order.
# Verify, or derive the label from the filename instead.
value = 96  # chr(97) == 'a'
for file in os.listdir(ini_img):
    value += 1
    # Build the per-letter output directory from save_img instead of
    # re-hardcoding the absolute path.
    path1 = os.path.join(save_img, 'test', chr(value))
    os.makedirs(path1, exist_ok=True)
    # Context manager closes the image file (the original leaked the handle).
    with Image.open(os.path.join(ini_img, file)) as src:
        img = torchvision.transforms.ToTensor()(src)  # PIL image -> tensor
    stem = os.path.splitext(file)[0]  # filename without its extension
    for i in range(40):
        augmented_img = change_ways(img)
        augmented_img = torchvision.transforms.ToPILImage()(augmented_img)
        augmented_img.save(os.path.join(path1, f"{stem}_{i}.jpg"))
通过这样的方式得到了a-z这26个字母的训练数据
2.加载数据集
标签在文件夹上类型的数据可以调用ImageFolder来载入,具体应用代码如下:
import torch
import torchvision
from torchvision import transforms
# Dataset roots (ImageFolder expects one sub-directory per class label).
train_root = "C:\\Users\\X15\\Desktop\\abc_recog\\save_abc\\train"
test_root = "C:\\Users\\X15\\Desktop\\abc_recog\\save_abc\\test"
# Preprocessing. ImageFolder's default loader converts every image to RGB,
# but the CNN defined below takes a single-channel input, so force
# grayscale here. Resize(40) alone only fixes the shorter side and can
# yield non-square images; Resize((40, 40)) guarantees the exact 40x40
# input the fully-connected layer (64*5*5) requires.
transform = transforms.Compose([
    transforms.Grayscale(1),
    transforms.Resize((40, 40)),
    transforms.ToTensor(),                       # PIL image -> float tensor in [0, 1]
    transforms.Normalize(mean=[0.5], std=[0.5])  # rescale to [-1, 1]
])
# Training set + loader.
train_data = torchvision.datasets.ImageFolder(train_root, transform=transform)
train_iter = torch.utils.data.DataLoader(train_data, batch_size=20, shuffle=True, num_workers=0)
3. 定义网络
通过查看训练数据的具体属性,可以知道经预处理后我们的数据是1通道的40 * 40灰度图像,定义网络如下:
# Network definition
class cnn(nn.Module):
    """Small CNN mapping a 1x40x40 grayscale letter image to 26 class logits."""

    def __init__(self):
        super().__init__()
        # Three conv stages, each halving the spatial size: 40 -> 20 -> 10 -> 5.
        self.conv1 = self._stage(1, 16)
        self.conv2 = self._stage(16, 32)
        self.conv3 = self._stage(32, 64)
        self.fc = nn.Linear(64 * 5 * 5, 50)
        self.out = nn.Linear(50, 26)

    @staticmethod
    def _stage(c_in, c_out):
        """Two 3x3 convs (stride 1, padding 1) with ReLU, then a 2x2 max-pool."""
        return nn.Sequential(
            nn.Conv2d(c_in, c_out, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(c_out, c_out, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def forward(self, x):
        for stage in (self.conv1, self.conv2, self.conv3):
            x = stage(x)
        flat = x.view(x.size(0), -1)  # flatten to (batch, 64*5*5)
        return self.out(self.fc(flat))
4.实例化网络并训练
# Instantiate the network.
# NOTE(review): this rebinds the name `cnn` from the class to the model
# instance; later code (training and the mouse callback) relies on `cnn`
# being the instance, so the shadowing cannot be renamed safely.
cnn = cnn()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
# Training
import torch.optim as optim
import matplotlib.pyplot as plt
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    """Train `model` on `train_loader` and plot the loss curve.

    Args:
        model: network to optimise (set to train mode each epoch).
        criterion: loss function, e.g. nn.CrossEntropyLoss().
        optimizer: optimiser constructed over model.parameters().
        train_loader: iterable of (inputs, labels) mini-batches.
        num_epochs: number of full passes over the data.

    Returns:
        (losses, accuracies): per-epoch mean loss and training accuracy.
        (The original computed these lists but discarded them.)
    """
    losses = []
    accuracies = []
    for epoch in range(num_epochs):
        model.train()  # training mode
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # Accuracy bookkeeping: predicted class = arg-max over the logits.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        epoch_loss = running_loss / len(train_loader)
        accuracy = correct / total
        losses.append(epoch_loss)
        accuracies.append(accuracy)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}, Accuracy: {accuracy}')
    # Plot the per-epoch training loss.
    plt.figure()
    plt.plot(losses, label='Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()
    plt.show()
    return losses, accuracies
# Train the model.
train_model(cnn, criterion, optimizer, train_iter, num_epochs=10)
5. 预测
通过cv库获取手写输入并预测:
import cv2
import time
import numpy as np
# Prediction helper
def number_predict(img, model):
    """Classify `img` (NumPy array of pixel values in 0-255) with `model`.

    Returns the predicted class indices (arg-max over dim 1) as a NumPy array.

    NOTE(review): this normalises to [0, 1], while the training transform
    normalised to [-1, 1] (Normalize(0.5, 0.5)) -- verify the two pipelines
    are meant to differ.
    """
    img = img / 255.0  # normalise pixel values to [0, 1]
    x = torch.from_numpy(img).type(torch.FloatTensor)  # numpy -> float tensor
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out = model(x)
    _, result = torch.max(out, 1)  # keep the arg-max index, discard the value
    return result.numpy()  # predicted class indices as a NumPy array
drawing = False  # whether the left mouse button is currently held (drawing)
last = (-1, -1)  # last recorded mouse position
color = (255, 255, 255)  # stroke colour (white)
canvas = (256, 512, 1)  # canvas shape: height, width, channels
delta = 1.0  # seconds of inactivity after which a new stroke clears the canvas
last_time = time.time() - delta  # timestamp of the last stroke
def recognition(image, model):
    """Find each drawn glyph in `image`, classify it, and annotate in place.

    Every external contour is cropped to a padded square, resized to the
    network's 40x40 input, classified via number_predict, and the predicted
    letter is drawn at the contour's top-left corner.
    """
    # findContours locates the outlines of the drawn strokes.
    contours, hierarchy = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for i in range(len(contours)):
        x, y, w, h = cv2.boundingRect(contours[i])
        # Centre of the bounding box; pad to a square with an 8px margin,
        # clamped to the image borders.
        cx = x + w // 2
        cy = y + h // 2
        pad = (max(w, h) + 8) // 2
        x1 = max(0, cx - pad)
        y1 = max(0, cy - pad)
        x2 = min(image.shape[1], cx + pad)
        y2 = min(image.shape[0], cy + pad)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 255, 255))
        number_i = (image[y1:y2, x1:x2])
        number_i = cv2.resize(number_i, (40, 40))
        number_i = np.reshape(number_i, (-1, 1, 40, 40)).astype('float')
        result = number_predict(number_i, model)
        # BUG FIX: the original `int(str(result)[1])` parsed only the first
        # digit of the printed array, so any class index >= 10 was wrong
        # (e.g. 12 -> 'b' instead of 'm'). Read the index directly.
        text = chr(int(result[0]) + 97)  # class index 0-25 -> 'a'-'z'
        cv2.putText(image, text, org=(x, y), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1.0, color=(255, 255, 255))
def mouse_event(event, x, y, flags, param):
    """OpenCV mouse callback: left button draws strokes, right-release recognises."""
    global last, drawing, img, last_time, cnn
    pos = (x, y)
    if event == cv2.EVENT_LBUTTONDOWN:
        # Begin a stroke; a long pause since the last stroke clears the canvas.
        drawing = True
        last = pos
        if time.time() - last_time > delta:
            img = np.zeros(canvas, np.uint8)
        last_time = time.time()
    elif event == cv2.EVENT_MOUSEMOVE and drawing:
        # Extend the current stroke.
        cv2.line(img, last, pos, color, 3)
        last = pos
    elif event == cv2.EVENT_LBUTTONUP:
        # Finish the stroke and remember when it ended.
        drawing = False
        cv2.line(img, last, pos, color, 3)
        last_time = time.time()
    elif event == cv2.EVENT_RBUTTONUP:
        # Right-click release: classify the drawing, then reset the canvas.
        recognition(img, cnn)
        img = np.zeros(canvas, np.uint8)
# BUG FIX: `img` was only ever assigned inside the mouse callback, so the
# first cv2.imshow(img) below raised NameError. Create the blank canvas first.
img = np.zeros(canvas, np.uint8)
cv2.namedWindow('image')
cv2.setMouseCallback('image', mouse_event)
while True:
    cv2.imshow('image', img)
    if cv2.waitKey(1) == 27:  # Esc key exits
        break
cv2.destroyAllWindows()
3. 实验结果
4. 实验收获
- os.listdir(目录):返回指定目录下所有文件和文件夹名称组成的列表。
- 可以通过 if not os.path.exists(path)对目录进行检查。
- 通过os.path.join(a,b)可以拼接得到a目录下b文件的完整路径。
- augmented_img.save(os.path.join(path1, f"{file[:-4]}_{i}.jpg")) : file[:-4]可以获得去掉扩展名的文件名。