This project implements calling a CRNN model from C++ for OCR recognition.
CRNN is a convolutional recurrent neural network architecture used for image-based sequence recognition, in particular scene text recognition.
CRNN stands for Convolutional Recurrent Neural Network. It is mainly used for end-to-end recognition of text sequences of variable length: instead of segmenting the image into individual characters first, it turns text recognition into a sequence-learning problem over time steps, i.e. image-based sequence recognition.
CRNN network structure
The whole CRNN network consists of three parts, from bottom to top:
CNN (convolutional layers): a deep CNN extracts features from the input image and produces a feature map;
RNN (recurrent layers): a bidirectional RNN (BLSTM) predicts over the feature sequence, learning each feature vector in the sequence and outputting a distribution over the predicted labels;
CTC loss (transcription layer): the CTC loss converts the sequence of per-frame label distributions produced by the recurrent layers into the final label sequence (a toy sketch of this decoding step is shown below).
Calling the CRNN model from C++:
(1) Project environment config (.pro):
INCLUDEPATH += C:\opencv-4.4.0\opencv\build\include \
               C:\opencv-4.4.0\opencv\build\include\opencv \
               C:\opencv-4.4.0\opencv\build\include\opencv2
LIBS += -LC:\opencv-4.4.0\opencv\build\x64\vc14\lib -lopencv_world440
INCLUDEPATH += $$PWD/../opencv-4.4.0/opencv/build/x64/vc14
DEPENDPATH += $$PWD/../opencv-4.4.0/opencv/build/x64/vc14
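Before going further, it may be worth verifying that the .pro settings above actually find and link OpenCV; the paths, the vc14 compiler folder, and the opencv_world440 library name are the ones from my install and depend on the package you downloaded. A minimal sanity check:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // If this builds, runs, and prints 4.4.0, INCLUDEPATH and LIBS are set correctly.
    std::cout << "OpenCV version: " << CV_VERSION << std::endl;
    return 0;
}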
(2) Required files:
crnn.pth: a Convolutional Recurrent Neural Network (CRNN) implemented in PyTorch. Convert crnn.pth to crnn.onnx so the model can be used from C++ through OpenCV.
frozen_east_text_detection.pb: the EAST model used to locate text. (It is not strictly necessary; you can also find the text area with a simple projection, as sketched below.)
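If you skip the EAST model, one simple alternative is a horizontal projection: binarize the image, sum the foreground pixels in each row, and treat runs of rows above a threshold as text lines. This is only a rough sketch of that idea (the helper name and the 2% threshold are my own choices, not part of the code in section (4)), and it yields axis-aligned lines rather than rotated boxes:

#include <opencv2/imgproc.hpp>
#include <vector>

// Rough horizontal-projection text-line detector (sketch only).
// gray: 8-bit single-channel input image.
std::vector<cv::Rect> textLinesByProjection(const cv::Mat& gray, double minRatio = 0.02)
{
    cv::Mat bin;
    cv::threshold(gray, bin, 0, 255, cv::THRESH_BINARY_INV | cv::THRESH_OTSU);

    // Sum of foreground pixel values per row (32-bit to avoid overflow).
    cv::Mat rowSum;
    cv::reduce(bin, rowSum, 1, cv::REDUCE_SUM, CV_32S);

    std::vector<cv::Rect> lines;
    const int minCount = static_cast<int>(minRatio * bin.cols * 255);
    int start = -1;
    for (int y = 0; y < rowSum.rows; ++y)
    {
        bool hasText = rowSum.at<int>(y) > minCount;
        if (hasText && start < 0)
            start = y;                                                  // a text line begins
        else if (!hasText && start >= 0)
        {
            lines.push_back(cv::Rect(0, start, bin.cols, y - start));  // a text line ends
            start = -1;
        }
    }
    if (start >= 0)
        lines.push_back(cv::Rect(0, start, bin.cols, bin.rows - start));
    return lines;
}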
(3) Convert crnn.pth to crnn.onnx
ptonnx:
import torch
import models.crnn as crnn

# imgH=32, 1 input channel, 37 classes (36 characters + CTC blank), 256 hidden units
model = crnn.CRNN(32, 1, 37, 256)
model.load_state_dict(torch.load('crnn.pth'))
model.eval()  # export in inference mode so BatchNorm uses its running statistics
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
models/crnn.py:
import torch.nn as nn


class BidirectionalLSTM(nn.Module):

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()

        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)

        output = self.embedding(t_rec)  # [T * b, nOut]
        output = output.view(T, b, -1)

        return output


class CRNN(nn.Module):

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        ks = [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]
        ss = [1, 1, 1, 1, 1, 1, 1]
        nm = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def convRelu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        convRelu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64
        convRelu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32
        convRelu(2, True)
        convRelu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16
        convRelu(4, True)
        convRelu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16
        convRelu(6, True)  # 512x1x16

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = self.rnn(conv)

        return output
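After the export, it is worth checking that crnn.onnx actually loads in OpenCV's DNN module before wiring it into the full program below. A minimal check, assuming crnn.onnx sits in the working directory; with the 1x1x32x100 dummy input shape used in the export script, the output should come back as 26 x 1 x 37 (time steps x batch x classes, 36 characters plus the CTC blank):

#include <opencv2/dnn.hpp>
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Load the exported recognizer.
    cv::dnn::Net net = cv::dnn::readNet("crnn.onnx");

    // Dummy 1x1x32x100 input blob, same shape as the export script used.
    int sz[] = {1, 1, 32, 100};
    cv::Mat blob(4, sz, CV_32F, cv::Scalar(0));

    net.setInput(blob);
    cv::Mat out = net.forward();

    std::cout << out.size[0] << " x " << out.size[1] << " x " << out.size[2] << std::endl;
    return 0;
}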
(4) C++ (modified from the sample code provided by OpenCV)
#include <QCoreApplication>
/*
import torch
import models.crnn as crnn
model = crnn.CRNN(32, 1, 37, 256)
model.load_state_dict(torch.load('crnn.pth'))
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
*/
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn.hpp>
#include <QDebug>
using namespace cv;
using namespace cv::dnn;
void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
                         std::vector<RotatedRect>& detections, std::vector<float>& confidences);
void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result);
void decodeText(const Mat& scores, std::string& text);

void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh,
                         std::vector<RotatedRect>& detections, std::vector<float>& confidences)
{
    detections.clear();
    CV_Assert(scores.dims == 4); CV_Assert(geometry.dims == 4); CV_Assert(scores.size[0] == 1);
    CV_Assert(geometry.size[0] == 1); CV_Assert(scores.size[1] == 1); CV_Assert(geometry.size[1] == 5);
    CV_Assert(scores.size[2] == geometry.size[2]); CV_Assert(scores.size[3] == geometry.size[3]);

    const int height = scores.size[2];
    const int width = scores.size[3];
    for (int y = 0; y < height; ++y)
    {
        const float* scoresData = scores.ptr<float>(0, 0, y);
        const float* x0_data = geometry.ptr<float>(0, 0, y);
        const float* x1_data = geometry.ptr<float>(0, 1, y);
        const float* x2_data = geometry.ptr<float>(0, 2, y);
        const float* x3_data = geometry.ptr<float>(0, 3, y);
        const float* anglesData = geometry.ptr<float>(0, 4, y);
        for (int x = 0; x < width; ++x)
        {
            float score = scoresData[x];
            if (score < scoreThresh)
                continue;

            // Decode a prediction.
            // Multiply by 4 because the feature maps are 4 times smaller than the input image.
            float offsetX = x * 4.0f, offsetY = y * 4.0f;
            float angle = anglesData[x];
            float cosA = std::cos(angle);
            float sinA = std::sin(angle);
            float h = x0_data[x] + x2_data[x];
            float w = x1_data[x] + x3_data[x];

            Point2f offset(offsetX + cosA * x1_data[x] + sinA * x2_data[x],
                           offsetY - sinA * x1_data[x] + cosA * x2_data[x]);
            Point2f p1 = Point2f(-sinA * h, -cosA * h) + offset;
            Point2f p3 = Point2f(-cosA * w, sinA * w) + offset;
            RotatedRect r(0.5f * (p1 + p3), Size2f(w, h), -angle * 180.0f / (float)CV_PI);
            detections.push_back(r);
            confidences.push_back(score);
        }
    }
}
void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result)
{
    const Size outputSize = Size(100, 32);
    Point2f targetVertices[4] = {
        Point(0, outputSize.height - 1),
        Point(0, 0),
        Point(outputSize.width - 1, 0),
        Point(outputSize.width - 1, outputSize.height - 1),
    };
    Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
    warpPerspective(frame, result, rotationMatrix, outputSize);
}
void decodeText(const Mat& scores, std::string& text)
{
    // Greedy CTC decoding: best class per time step, then drop repeats and the blank ('-').
    static const std::string alphabet = "0123456789abcdefghijklmnopqrstuvwxyz";
    Mat scoresMat = scores.reshape(1, scores.size[0]);

    std::vector<char> elements;
    elements.reserve(scores.size[0]);

    for (int rowIndex = 0; rowIndex < scoresMat.rows; ++rowIndex)
    {
        Point p;
        minMaxLoc(scoresMat.row(rowIndex), 0, 0, 0, &p);
        if (p.x > 0 && static_cast<size_t>(p.x) <= alphabet.size())
        {
            elements.push_back(alphabet[p.x - 1]);
        }
        else
        {
            elements.push_back('-');
        }
    }

    if (elements.size() > 0 && elements[0] != '-')
        text += elements[0];

    for (size_t elementIndex = 1; elementIndex < elements.size(); ++elementIndex)
    {
        if (elementIndex > 0 && elements[elementIndex] != '-' &&
            elements[elementIndex - 1] != elements[elementIndex])
        {
            text += elements[elementIndex];
        }
    }
}
int main(int argc, char** argv)
{
    QCoreApplication a(argc, argv);
    qDebug() << "==========================================" << endl;

    Mat img = imread(".//3.jpg");
    imshow("kWinName", img);
    //waitKey(0);

#if 0
    float confThreshold = parser.get<float>("thr");
    float nmsThreshold = parser.get<float>("nms");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String modelDecoder = parser.get<String>("model");
    String modelRecognition = parser.get<String>("ocr");
#endif

#if 1
    float confThreshold = 0.5;
    float nmsThreshold = 0.4;
    int inpWidth = 320;
    int inpHeight = 320;
    String modelDecoder = ".//frozen_east_text_detection.pb";
    String modelRecognition = ".//crnn.onnx";
    CV_Assert(!modelDecoder.empty());

    // Load networks.
    Net detector = readNet(modelDecoder);
    Net recognizer;
    if (!modelRecognition.empty())
        recognizer = readNet(modelRecognition);
    else
    {
        qDebug() << "error" << endl;
    }

    // Open a video file or an image file or a camera stream.
    std::vector<Mat> outs;
    std::vector<String> outNames(2);
    outNames[0] = "feature_fusion/Conv_7/Sigmoid";
    outNames[1] = "feature_fusion/concat_3";
    Mat frame, blob;
    TickMeter tickMeter;

    frame = img.clone();
    blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false);
    detector.setInput(blob);
    tickMeter.start();
    detector.forward(outs, outNames);
    tickMeter.stop();

    Mat scores = outs[0];
    Mat geometry = outs[1];

    // Decode predicted bounding boxes.
    std::vector<RotatedRect> boxes;
    std::vector<float> confidences;
    decodeBoundingBoxes(scores, geometry, confThreshold, boxes, confidences);

    // Apply non-maximum suppression procedure.
    std::vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);

    Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight);

    // Render text.
    for (size_t i = 0; i < indices.size(); ++i)
    {
        RotatedRect& box = boxes[indices[i]];
        Point2f vertices[4];
        box.points(vertices);
        for (int j = 0; j < 4; ++j)
        {
            vertices[j].x *= ratio.x;
            vertices[j].y *= ratio.y;
        }
        if (!modelRecognition.empty())
        {
            Mat cropped;
            fourPointsTransform(frame, vertices, cropped);
            cvtColor(cropped, cropped, cv::COLOR_BGR2GRAY);

            Mat blobCrop = blobFromImage(cropped, 1.0 / 127.5, Size(), Scalar::all(127.5));
            recognizer.setInput(blobCrop);

            tickMeter.start();
            Mat result = recognizer.forward();
            tickMeter.stop();

            std::string wordRecognized = "";
            decodeText(result, wordRecognized);
            putText(frame, wordRecognized, vertices[1], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255));
            qDebug() << QString::fromStdString(wordRecognized) << endl;
        }
        for (int j = 0; j < 4; ++j)
            line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1);
    }

    // Put efficiency information.
    //std::string label = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
    //putText(frame, label, Point(10, 20), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

    static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
    namedWindow(kWinName, WINDOW_NORMAL);
    imshow(kWinName, frame);
    tickMeter.reset();
#endif

    return a.exec();
}
(5) Result:
(6) Problems:
If your program runs correctly but produces no output, you can try what I did: add these files to the release path.
The directory structure of this project is as follows:
The recognition accuracy needs further training to improve.
I hope this helps. If you have any questions, please comment on this blog or send me a private message; I will reply in my free time.