This project implements calling a CRNN model from C++ for OCR recognition.
CRNN is a convolutional recurrent neural network architecture used for image-based sequence recognition, in particular scene text recognition.
CRNN stands for Convolutional Recurrent Neural Network. It is mainly used for end-to-end recognition of text sequences of variable length: instead of segmenting the image into individual characters first, it turns text recognition into a sequence-learning problem over time steps, i.e. image-based sequence recognition.
CRNN network structure
The whole CRNN network consists of three parts, from bottom to top:
CNN (convolutional layers): a deep CNN extracts features from the input image and produces a feature map;
RNN (recurrent layers): a bidirectional RNN (BLSTM) predicts over the feature sequence, learning each feature vector in the sequence and outputting a distribution over the predicted labels;
CTC loss (transcription layer): the CTC loss converts the sequence of per-frame label distributions produced by the recurrent layers into the final label sequence (a toy sketch of this decoding step is shown below).
Calling the CRNN model from C++:
(1) Project environment config (.pro):
INCLUDEPATH += C:\opencv-4.4.0\opencv\build\include \
               C:\opencv-4.4.0\opencv\build\include\opencv \
               C:\opencv-4.4.0\opencv\build\include\opencv2
LIBS += -LC:\opencv-4.4.0\opencv\build\x64\vc14\lib -lopencv_world440
INCLUDEPATH += $$PWD/../opencv-4.4.0/opencv/build/x64/vc14
DEPENDPATH += $$PWD/../opencv-4.4.0/opencv/build/x64/vc14
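Before going further, it may be worth verifying that the .pro settings above actually find and link OpenCV; the paths, the vc14 compiler folder, and the opencv_world440 library name are the ones from my install and depend on the package you downloaded. A minimal sanity check:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // If this builds, runs, and prints 4.4.0, INCLUDEPATH and LIBS are set correctly.
    std::cout << "OpenCV version: " << CV_VERSION << std::endl;
    return 0;
}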
(2) Required files:
crnn.pth: a Convolutional Recurrent Neural Network (CRNN) implemented in PyTorch. Convert crnn.pth to crnn.onnx so the model can be used from C++ through OpenCV.
frozen_east_text_detection.pb: the EAST model used to locate text. (It is not strictly necessary; you can also find the text area with a simple projection, as sketched below.)
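If you skip the EAST model, one simple alternative is a horizontal projection: binarize the image, sum the foreground pixels in each row, and treat runs of rows above a threshold as text lines. This is only a rough sketch of that idea (the helper name and the 2% threshold are my own choices, not part of the code in section (4)), and it yields axis-aligned lines rather than rotated boxes:

#include <opencv2/imgproc.hpp>
#include <vector>

// Rough horizontal-projection text-line detector (sketch only).
// gray: 8-bit single-channel input image.
std::vector<cv::Rect> textLinesByProjection(const cv::Mat& gray, double minRatio = 0.02)
{
    cv::Mat bin;
    cv::threshold(gray, bin, 0, 255, cv::THRESH_BINARY_INV | cv::THRESH_OTSU);

    // Sum of foreground pixel values per row (32-bit to avoid overflow).
    cv::Mat rowSum;
    cv::reduce(bin, rowSum, 1, cv::REDUCE_SUM, CV_32S);

    std::vector<cv::Rect> lines;
    const int minCount = static_cast<int>(minRatio * bin.cols * 255);
    int start = -1;
    for (int y = 0; y < rowSum.rows; ++y)
    {
        bool hasText = rowSum.at<int>(y) > minCount;
        if (hasText && start < 0)
            start = y;                                                  // a text line begins
        else if (!hasText && start >= 0)
        {
            lines.push_back(cv::Rect(0, start, bin.cols, y - start));  // a text line ends
            start = -1;
        }
    }
    if (start >= 0)
        lines.push_back(cv::Rect(0, start, bin.cols, bin.rows - start));
    return lines;
}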
(3) Convert crnn.pth to crnn.onnx
ptonnx:
import torch
import models.crnn as crnn

# imgH=32, 1 input channel, 37 classes (36 characters + CTC blank), 256 hidden units
model = crnn.CRNN(32, 1, 37, 256)
model.load_state_dict(torch.load('crnn.pth'))
model.eval()  # export in inference mode so BatchNorm uses its running statistics
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
models/crnn.py:
import torch.nn as nn


class BidirectionalLSTM(nn.Module):

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()

        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)

        output = self.embedding(t_rec)  # [T * b, nOut]
        output = output.view(T, b, -1)

        return output


class CRNN(nn.Module):

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        ks = [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]
        ss = [1, 1, 1, 1, 1, 1, 1]
        nm = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def convRelu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        convRelu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64
        convRelu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32
        convRelu(2, True)
        convRelu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16
        convRelu(4, True)
        convRelu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16
        convRelu(6, True)  # 512x1x16

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = self.rnn(conv)

        return output
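After the export, it is worth checking that crnn.onnx actually loads in OpenCV's DNN module before wiring it into the full program below. A minimal check, assuming crnn.onnx sits in the working directory; with the 1x1x32x100 dummy input shape used in the export script, the output should come back as 26 x 1 x 37 (time steps x batch x classes, 36 characters plus the CTC blank):

#include <opencv2/dnn.hpp>
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Load the exported recognizer.
    cv::dnn::Net net = cv::dnn::readNet("crnn.onnx");

    // Dummy 1x1x32x100 input blob, same shape as the export script used.
    int sz[] = {1, 1, 32, 100};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));

    net.setInput(blob);
    cv::Mat out = net.forward();

    std::cout << out.size[0] << " x " << out.size[1] << " x " << out.size[2] << std::endl;
    return 0;
}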
(4) C++ (modified from the sample code provided by OpenCV)
#include <QCoreApplication>
/*
import torch
import models.crnn as crnn
model = crnn.CRNN(32, 1, 37, 256)
model.load_state_dict(torch.load('crnn.pth'))
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
*/
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <QDebug>
using namespace cv;
using namespace cv::dnn;
void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
                         std::vector<RotatedRect>& detections, std::vector<float>& confidences);
void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result);
void decodeText(const Mat& scores, std::string& text);

void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
                         std::vector<RotatedRect>& detections, std::vector<float>& confidences)
{
    detections.clear();
    CV_Assert(scores.dims == 4); CV_Assert(geometry.dims == 4); CV_Assert(scores.size[0] == 1);
    CV_Assert(geometry.size[0] == 1); CV_Assert(scores.size[1] == 1); CV_Assert(geometry.size[1] == 5);
    CV_Assert(scores.size[2] == geometry.size[2]); CV_Assert(scores.size[3] == geometry.size[3]);

    const int height = scores.size[2];
    const int width = scores.size[3];
    for (int y = 0; y < height; ++y)
    {
        const float* scoresData = scores.ptr<float>(0, 0, y);
        const float* x0_data = geometry.ptr<float>(0, 0, y);
        const float* x1_data = geometry.ptr<float>(0, 1, y);
        const float* x2_data = geometry.ptr<float>(0, 2, y);
        const float* x3_data = geometry.ptr<float>(0, 3, y);
        const float* anglesData = geometry.ptr<float>(0, 4, y);
        for (int x = 0; x < width; ++x)
        {
            float score = scoresData[x];
            if (score < scoreThresh)
                continue;

            // Decode a prediction.
            // Multiply by 4 because the feature maps are 4 times smaller than the input image.
            float offsetX = x * 4.0f, offsetY = y * 4.0f;
            float angle = anglesData[x];
            float cosA = std::cos(angle);
            float sinA = std::sin(angle);
            float h = x0_data[x] + x2_data[x];
            float w = x1_data[x] + x3_data[x];

            Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x],
                           offsetY - sinA * x1_data[x] + cosA * x2_data[x]);
            Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset;
            Point2f p3 = Point2f(-cosA * w, sinA * w) + offset;
            RotatedRect r(0.5f * (p1 + p3), Size2f(w, h), -angle * 180.0f / (float)CV_PI);
            detections.push_back(r);
            confidences.push_back(score);
        }
    }
}
void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result)
{
    const Size outputSize = Size(100, 32);
    Point2f targetVertices[4] = {
        Point(0, outputSize.height - 1),
        Point(0, 0),
        Point(outputSize.width - 1, 0),
        Point(outputSize.width - 1, outputSize.height - 1),
    };
    Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
    warpPerspective(frame, result, rotationMatrix, outputSize);
}
void decodeText(const Mat& scores, std::string& text)
{
    // Greedy CTC decoding: best class per time step, then drop repeats and the blank ('-').
    static const std::string alphabet = "0123456789abcdefghijklmnopqrstuvwxyz";
    Mat scoresMat = scores.reshape(1, scores.size[0]);

    std::vector<char> elements;
    elements.reserve(scores.size[0]);

    for (int rowIndex = 0; rowIndex < scoresMat.rows; ++rowIndex)
    {
        Point p;
        minMaxLoc(scoresMat.row(rowIndex), 0, 0, 0, &p);
        if (p.x > 0 && static_cast<size_t>(p.x) <= alphabet.size())
        {
            elements.push_back(alphabet[p.x - 1]);
        }
        else
        {
            elements.push_back('-');
        }
    }

    if (elements.size() > 0 && elements[0] != '-')
        text += elements[0];

    for (size_t elementIndex = 1; elementIndex < elements.size(); ++elementIndex)
    {
        if (elementIndex > 0 && elements[elementIndex] != '-' &&
            elements[elementIndex - 1] != elements[elementIndex])
        {
            text += elements[elementIndex];
        }
    }
}
int main(int argc, char** argv)
{
    QCoreApplication a(argc, argv);
    qDebug() << "==========================================" << endl;

    Mat img = imread(".//3.jpg");
    imshow("kWinName", img);
    //waitKey(0);

#if 0
    float confThreshold = parser.get<float>("thr");
    float nmsThreshold = parser.get<float>("nms");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String modelDecoder = parser.get<String>("model");
    String modelRecognition = parser.get<String>("ocr");
#endif

#if 1
    float confThreshold = 0.5;
    float nmsThreshold = 0.4;
    int inpWidth = 320;
    int inpHeight = 320;
    String modelDecoder = ".//frozen_east_text_detection.pb";
    String modelRecognition = ".//crnn.onnx";
    CV_Assert(!modelDecoder.empty());

    // Load networks.
    Net detector = readNet(modelDecoder);
    Net recognizer;
    if (!modelRecognition.empty())
        recognizer = readNet(modelRecognition);
    else
    {
        qDebug() << "error" << endl;
    }

    // Open a video file or an image file or a camera stream.
    std::vector<Mat> outs;
    std::vector<String> outNames(2);
    outNames[0] = "feature_fusion/Conv_7/Sigmoid";
    outNames[1] = "feature_fusion/concat_3";
    Mat frame, blob;
    TickMeter tickMeter;

    frame = img.clone();
    blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false);
    detector.setInput(blob);
    tickMeter.start();
    detector.forward(outs, outNames);
    tickMeter.stop();

    Mat scores = outs[0];
    Mat geometry = outs[1];

    // Decode predicted bounding boxes.
    std::vector<RotatedRect> boxes;
    std::vector<float> confidences;
    decodeBoundingBoxes(scores, geometry, confThreshold, boxes, confidences);

    // Apply non-maximum suppression procedure.
    std::vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);

    Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight);

    // Render text.
    for (size_t i = 0; i < indices.size(); ++i)
    {
        RotatedRect& box = boxes[indices[i]];
        Point2f vertices[4];
        box.points(vertices);
        for (int j = 0; j < 4; ++j)
        {
            vertices[j].x *= ratio.x;
            vertices[j].y *= ratio.y;
        }
        if (!modelRecognition.empty())
        {
            Mat cropped;
            fourPointsTransform(frame, vertices, cropped);
            cvtColor(cropped, cropped, cv::COLOR_BGR2GRAY);

            Mat blobCrop = blobFromImage(cropped, 1.0 / 127.5, Size(), Scalar::all(127.5));
            recognizer.setInput(blobCrop);

            tickMeter.start();
            Mat result = recognizer.forward();
            tickMeter.stop();

            std::string wordRecognized = "";
            decodeText(result, wordRecognized);
            putText(frame, wordRecognized, vertices[1], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255));
            qDebug() << QString::fromStdString(wordRecognized) << endl;
        }
        for (int j = 0; j < 4; ++j)
            line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1);
    }

    // Put efficiency information.
    //std::string label = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
    //putText(frame, label, Point(10, 20), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

    static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
    namedWindow(kWinName, WINDOW_NORMAL);
    imshow(kWinName, frame);
    tickMeter.reset();
#endif

    return a.exec();
}
(5) Result:
(6) Problems:
If your program runs correctly but produces no output, you can try what I did: add these files to the release path.
The directory structure of this project is as follows:
The recognition accuracy needs further training to improve.
I hope this helps. If you have any questions, please comment on this blog or send me a private message; I will reply in my free time.