NormalFormCaptchaOCR : 深度学习范式,以验证码识别为例

   今天我想和大家分享一些关于深度学习入门的知识。深度学习是人工智能领域的一个重要分支,目前已经被广泛应用于图像识别、自然语言处理、语音识别等领域。如果你对深度学习感兴趣,那么本文可能会对你有所帮助。
   在这篇文章中,我们将一起学习如何构建一个简单的深度学习模型,并了解如何使用常用的深度学习框架(PyTorch)来编写和运行代码。我们将从规范的代码结构入手,以便初学者可以更好地学习和使用深度学习框架。
   希望通过这篇文章,你可以更好地了解深度学习的基本概念和编程技巧,并开始实践编写自己的深度学习模型。

一、导入包以及设置随机种子

使实验结果可以复现

import numpy as np
from tqdm import tqdm
import os
import time

import torch
from torch import nn
from torchvision import models
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision  import transforms


from PIL import Image


import random
# One fixed seed shared by every random number generator used in this file,
# so repeated runs start from the same random state.
seed = 42
np.random.seed(seed)     # NumPy global RNG
torch.manual_seed(seed)  # PyTorch default generator
random.seed(seed)        # Python standard-library RNG

二、以类的方式定义超参数

统一设置超参数,方便调参

class argparse():
    """Plain container holding every hyperparameter used in this file.

    Despite its name, this class is unrelated to the standard-library
    ``argparse`` module; it only stores fixed attribute values.
    """

    def __init__(self) -> None:
        # Captcha layout: number of characters per captcha and the alphabet
        # (10 digits + 26 lowercase letters = 36 symbols) they are drawn from.
        self.captcha_size = 4
        self.captcha_array = "0123456789abcdefghijklmnopqrstuvwxyz"

        # Optimisation settings.
        self.batch_size = 512  # samples per mini-batch
        self.lr = 0.001        # learning rate
        self.epochs = 20       # full passes over the training data

        # Use the first CUDA device when one is available, otherwise the CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")


# Module-level hyperparameter instance.
args = argparse()

三、定义自己的模型

继承自 nn.Module,定义自己的模型

class Mymodel(nn.Module):
    """Four-stage CNN followed by a two-layer classifier head.

    The head emits ``args.captcha_size * len(args.captcha_array)`` values,
    i.e. one score per (character position, alphabet symbol) pair.
    """

    def __init__(self):
        super().__init__()

        # Width of the final layer: one logit per position/symbol pair.
        n_logits = args.captcha_size * len(args.captcha_array)

        # Each stage is Conv(3x3, padding=1) -> ReLU -> MaxPool(2): the conv
        # keeps the spatial size and the pool halves it (rounding down).
        # For a 1x60x160 input the feature maps shrink
        # 30x80 -> 15x40 -> 7x20 -> 3x10 across the four stages.
        widths = (1, 64, 128, 256, 512)
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.seq = nn.Sequential(*stages, nn.Flatten())

        # 15360 = 512 channels * 3 * 10, the flattened size for a 1x60x160 input.
        self.layer = nn.Sequential(
            nn.Linear(15360, 4096),
            nn.Dropout(0.2),  # zeroes 20% of the activations while training
            nn.ReLU(),
            nn.Linear(4096, n_logits),
        )

    def forward(self, x):
        """Return the classifier head's output for the image batch ``x``."""
        return self.layer(self.seq(x))

也可以直接使用 PyTorch(torchvision)自带的模型,只需修改网络的输入层和输出层以适应目标任务

class myResNet(nn.Module):
    """ResNet-50 with its first and last layers swapped for the captcha task.

    ``conv1`` is replaced by a convolution taking a single input channel, and
    ``fc`` by a linear layer producing
    ``args.captcha_size * len(args.captcha_array)`` outputs.
    """

    def __init__(self) -> None:
        super().__init__()
        # Randomly initialised backbone. The ``pretrained=False`` keyword is
        # deprecated in newer torchvision releases; omitting it selects the
        # same "no pretrained weights" default in both the old and new API.
        self.model = models.resnet50()
        # Same geometry as the stock first convolution, but 1 input channel.
        self.model.conv1 = nn.Conv2d(
            1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
        )
        # One output per (character position, alphabet symbol) pair.
        self.model.fc = nn.Linear(
            in_features=2048,
            out_features=args.captcha_size * len(args.captcha_array),
            bias=True,
        )

    def forward(self, x):
        """Run ``x`` through the modified ResNet-50 and return its output."""
        return self.model(x)

五、定义自己的数据集Dataset,DataLoader

以类的形式放置工具函数,独热编码(one-hot)实现文本转向量与向量转文本

class Tools():
    """Helpers converting captcha text to one-hot tensors and back."""

    def text2vec(self, text):
        """Encode ``text`` as a one-hot matrix.

        Args:
            text: Captcha string. Every character must occur in
                ``args.captcha_array``.

        Returns:
            A tensor of shape ``(args.captcha_size, len(args.captcha_array))``
            in which row ``i`` holds a single 1 at the alphabet index of
            ``text[i]``. Rows beyond ``len(text)`` stay all-zero.

        Raises:
            ValueError: If a character is not in ``args.captcha_array``.
            IndexError: If ``text`` has more than ``args.captcha_size``
                characters.
        """
        vec = torch.zeros((args.captcha_size, len(args.captcha_array)))
        for row, char in enumerate(text):
            vec[row, args.captcha_array.index(char)] = 1
        return vec

    def vec2text(self, vec):
        """Decode a 2-D score/one-hot matrix into text.

        Args:
            vec: Tensor with one row per character position and one column per
                alphabet symbol.

        Returns:
            The string formed by taking, for each row, the alphabet symbol at
            the column with the largest value.
        """
        indices = torch.argmax(vec, dim=1).tolist()
        return ''.join(args.captcha_array[i] for i in indices)

    def accuracy(self, y_hat, y):
        """Count the captchas whose every character is predicted correctly.

        Args:
            y_hat: Predicted scores; any shape that can be reshaped to
                ``(-1, args.captcha_size, len(args.captcha_array))``.
            y: One-hot targets laid out like ``y_hat``.

        Returns:
            The number of samples (as an ``int``) for which the arg-max symbol
            matches the target at all character positions.
        """
        n_chars = len(args.captcha_array)
        predicted = y_hat.reshape(-1, args.captcha_size, n_chars).argmax(dim=2)
        expected = y.reshape(-1, args.captcha_size, n_chars).argmax(dim=2)
        # A captcha only counts when all of its positions agree.
        return int((predicted == expected).all(dim=1).sum())
            

测试工具函数

# Module-level Tools instance; the dataset class and the prediction functions
# in this file call its text2vec / vec2text methods.
tls = Tools()
# Round-trip check: encoding 'aab1' and decoding it again should give 'aab1'.
tls.vec2text(tls.text2vec('aab1'))

Output:

    'aab1'

定义读取数据集的类,继承自 Dataset,主要实现 __init__()、__getitem__()、__len__() 三个方法

class My_datasets(Dataset):
    """Captcha dataset reading every entry of a directory as an image.

    The label of a sample is the part of its file name before the first
    ``_``; it is one-hot encoded with ``tls.text2vec`` and flattened.
    """

    def __init__(self, root_dir):
        """Collect the image paths under ``root_dir`` and build the transform.

        Args:
            root_dir: Directory whose entries are all treated as image files.
        """
        super().__init__()
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping identical from run to run.
        self.list_image_path = [
            os.path.join(root_dir, image_name)
            for image_name in sorted(os.listdir(root_dir))
        ]

        # Resize to 60x160, convert to grayscale, then convert to a tensor.
        self.transforms = transforms.Compose(
            [
                transforms.Resize((60, 160)),
                transforms.Grayscale(),
                transforms.ToTensor(),
            ]
        )

    def __getitem__(self, index):
        """Return ``(image_tensor, flat_one_hot_label)`` for sample ``index``."""
        image_path = self.list_image_path[index]
        # basename copes with whichever separator os.path.join inserted,
        # unlike splitting on a hard-coded '/'.
        image_name = os.path.basename(image_path)

        # The context manager closes the underlying file once the pixels have
        # been turned into a tensor.
        with Image.open(image_path) as img_:
            img_tensor = self.transforms(img_)

        # Label text is everything before the first underscore.
        img_label = image_name.split('_')[0]
        # Flatten the (positions, symbols) one-hot matrix into one vector.
        img_label = tls.text2vec(img_label).view(-1)

        return img_tensor, img_label

    def __len__(self):
        """Return the number of image files found in the directory."""
        return len(self.list_image_path)

六、定义训练函数

def train_original():
    """Train ``myResNet`` on ``./dataset/train/`` and checkpoint the best weights.

    Runs ``args.epochs`` epochs with Adam and ``MultiLabelSoftMarginLoss``.
    Whenever a batch loss is lower than every batch loss seen so far in the
    whole run, the model's ``state_dict`` is written to
    ``afterResNetmodel.pth``. After each epoch the lowest batch loss of that
    epoch is printed.
    """
    train_path = r"./dataset/train/"

    train_dataset = My_datasets(train_path)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
    loss_fn = nn.MultiLabelSoftMarginLoss().to(args.device)
    model = myResNet().to(args.device)
    model.train()  # make training mode explicit

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    total_step = 0   # number of times a new best loss has been reached
    save_step = 100  # interval (in total_step units) of the "save model" log
    # Tracked across ALL epochs so that a later, worse epoch can never
    # overwrite a better checkpoint.
    best_loss = float("inf")

    for epoch in range(args.epochs):
        # Lowest batch loss of the current epoch; used for logging only.
        min_loss = float("inf")
        for imgs, targets in train_dataloader:
            imgs = imgs.to(args.device)
            targets = targets.to(args.device)

            outputs = model(imgs)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Compare plain floats so no loss tensor is kept alive between
            # iterations.
            loss_value = loss.item()
            min_loss = min(min_loss, loss_value)

            if loss_value < best_loss:
                best_loss = loss_value
                total_step += 1
                if total_step % save_step == 0:
                    # Log progressively more often, down to every 10 steps.
                    save_step = max(save_step - 20, 10)
                    print("save model {}".format(total_step))
                torch.save(model.state_dict(), "afterResNetmodel.pth")

        # min_loss is already a float, so this also works when the loader
        # yielded no batches.
        print("epoch{}, loss:{}".format(epoch, min_loss))
    

# Start training as soon as this line executes; the function saves its
# checkpoint to "afterResNetmodel.pth".
train_original()

七、测试保存的最优模型的识别正确率,以及对单张验证码进行识别

定义测试函数和单张预测函数

def test_pred():
    """Evaluate the saved checkpoint on ``./dataset/test/``.

    Loads ``afterResNetmodel.pth`` into a fresh ``myResNet``, predicts every
    test image one at a time, prints each mismatch, and finally prints the
    percentage of captchas whose whole text was predicted correctly.
    """
    m = myResNet()
    # NOTE(review): strict=False lets load_state_dict ignore missing or
    # unexpected keys instead of raising; confirm this leniency is intended.
    m.load_state_dict(torch.load("afterResNetmodel.pth", map_location=args.device), strict=False)
    m.to(args.device)
    m.eval()

    test_data = My_datasets("./dataset/test/")
    test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)
    test_length = len(test_data)
    n_chars = len(args.captcha_array)
    correct = 0

    # Inference only: no_grad skips building the autograd graph.
    with torch.no_grad():
        for imgs, labels in test_dataloader:
            imgs = imgs.to(args.device)

            # Un-flatten to one row per character position before decoding.
            labels_text = tls.vec2text(labels.view(-1, n_chars))
            predict_outputs = m(imgs).view(-1, n_chars)
            predict_labels = tls.vec2text(predict_outputs)

            if predict_labels == labels_text:
                correct += 1
            else:
                print("预测失败:正确值:{},预测值:{}".format(labels_text, predict_labels))
    print("正确率{}".format(correct / test_length * 100))
def pred_pic(pic_path):
    """Predict the text of a single captcha image and print it.

    Args:
        pic_path: Path of the image file to recognise.
    """
    # Same steps in the same order as the My_datasets transform (resize, then
    # grayscale), so inference sees inputs prepared exactly like training data.
    to_tensor = transforms.Compose([
        transforms.Resize((60, 160)),
        transforms.Grayscale(),
        transforms.ToTensor(),
    ])
    # Close the image file as soon as its pixels have been converted.
    with Image.open(pic_path) as pil_img:
        img = to_tensor(pil_img)
    # Add the batch dimension the model expects: (1, 1, 60, 160).
    img = torch.reshape(img, (-1, 1, 60, 160)).to(args.device)

    m = myResNet()
    # NOTE(review): strict=False lets load_state_dict ignore missing or
    # unexpected keys instead of raising; confirm this leniency is intended.
    m.load_state_dict(torch.load("afterResNetmodel.pth", map_location=args.device), strict=False)
    m.to(args.device)
    m.eval()

    # Inference only: no_grad skips building the autograd graph.
    with torch.no_grad():
        outputs = m(img)
    # One row per character position, one column per alphabet symbol.
    outputs = outputs.view(-1, len(args.captcha_array))
    outputs_lable = tls.vec2text(outputs)
    print(outputs_lable)

测试

# Evaluate the saved checkpoint on the test set; mismatches and the final
# accuracy are printed by the function.
test_pred()

Output:

预测失败:正确值:jbgl,预测值:ibgl
预测失败:正确值:w164,预测值:wz64
预测失败:正确值:g80u,预测值:880u
预测失败:正确值:t9eo,预测值:t980
预测失败:正确值:jln3,预测值:jln9
预测失败:正确值:k0g5,预测值:kog5
预测失败:正确值:etni,预测值:etnd
预测失败:正确值:oqhx,预测值:oqhh
预测失败:正确值:q045,预测值:qo45
预测失败:正确值:pdhb,预测值:pdbb
预测失败:正确值:17ig,预测值:12ig
预测失败:正确值:sz4c,预测值:5z4c
正确率94.0

  今天的教程学习已经全部结束,我很高兴能够与大家分享这些内容。我希望能够为大家提供有用的信息和帮助,以便大家更好地理解深度学习和神经网络的知识。

  如果您有任何疑问或建议,请随时与我联系。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值