#include <algorithm>
#include <array>
#include <bit>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <print>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "Utils.h"
#include <opencv2/opencv.hpp>
#include "net.h"

#define DetectParamPath "../data/det.param"
#define DetectModelPath "../data/det.bin"
#define RecognizeParamPath "../data/rec.param"
#define RecognizeModelPath "../data/rec.bin"
#define CharDictPath "../data/ocr_text_dict.txt"
#define IsDebug 1

namespace Impl {
    /// Rounds `value` up to the next multiple of `alignment` (`alignment` must be non-zero).
    constexpr uint32_t alignUp(uint32_t value, uint32_t alignment) {
        const uint32_t remainder = value % alignment;
        return remainder != 0 ? value + (alignment - remainder) : value;
    }

    /**
     * Computes the network input size for an image while keeping its aspect ratio.
     *
     * If both sides are smaller than `maxSideConstraint`, the longer side is kept
     * (rounded up to `alignment`); otherwise the longer side becomes
     * `maxSideConstraint`. The shorter side is scaled proportionally and both
     * results are rounded up to a multiple of `alignment`.
     *
     * @param cols              source width in pixels
     * @param rows              source height in pixels
     * @param outCols           receives the target width (must not be null)
     * @param outRows           receives the target height (must not be null)
     * @param maxSideConstraint upper bound for the longer side (the original note
     *                          marked this meaning as uncertain)
     * @param alignment         both outputs are multiples of this value
     * @return the target height before alignment, divided by `alignment`
     *         (kept from the original implementation; current callers ignore it).
     *         Returns 0 and writes 0/0 when `cols`, `rows` or `alignment` is 0.
     */
    uint64_t calcTargetSize(uint32_t cols, uint32_t rows, uint32_t *outCols, uint32_t *outRows,
                            uint32_t maxSideConstraint, uint32_t alignment) {
        assert(outCols != nullptr && outRows != nullptr);

        // Degenerate input would otherwise divide by zero below.
        if (cols == 0 || rows == 0 || alignment == 0) {
            *outCols = 0;
            *outRows = 0;
            return 0;
        }

        uint32_t longSide = maxSideConstraint;
        if (cols < maxSideConstraint && rows < maxSideConstraint) {
            // Small images are not upscaled: keep the longer side, aligned up.
            longSide = alignUp(std::max(cols, rows), alignment);
        }

        uint32_t targetCols = 0;
        uint32_t targetRows = 0;
        // 64-bit intermediates: longSide * side can exceed 32 bits for huge images.
        if (rows <= cols) {
            targetCols = longSide;
            targetRows = static_cast<uint32_t>(uint64_t{longSide} * rows / cols);
        } else {
            targetRows = longSide;
            targetCols = static_cast<uint32_t>(uint64_t{longSide} * cols / rows);
        }

        const uint64_t result = targetRows / alignment;

        // Never report a zero-sized side (extreme aspect ratios scale down to 0).
        *outCols = std::max(alignUp(targetCols, alignment), alignment);
        *outRows = std::max(alignUp(targetRows, alignment), alignment);
        return result;
    }

    /**
     * Crops one padded sub-image out of `image` for every external contour found
     * in `binaryMask`.
     *
     * The mask is first resized to the size of `image`. Contours whose bounding
     * box area is below 4 pixels are skipped. Each remaining bounding box is
     * grown by 10 px horizontally and 7 px vertically on every side and clipped
     * to the image bounds before cropping.
     *
     * When `IsDebug` is non-zero the boxes are also drawn and written to
     * "../output/debugImage.png" and "../output/debugImageBinary.png".
     *
     * @param image      source image to crop from
     * @param binaryMask single-channel mask (any size); non-zero pixels are foreground
     * @return deep copies of the cropped regions, in contour order; empty when
     *         either input is empty
     */
    std::vector<cv::Mat> getEachRegion(const cv::Mat &image, const cv::Mat &binaryMask) {
        if (image.empty() || binaryMask.empty()) {
            return {};
        }

        cv::Mat resizedMask;
        cv::resize(binaryMask, resizedMask, cv::Size(image.cols, image.rows));

        std::vector<std::vector<cv::Point> > contours;
        std::vector<cv::Vec4i> hierarchy;
        cv::findContours(resizedMask, contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);

#if IsDebug
        // Debug canvases: a colour copy of the mask and a copy of the source image.
        cv::Mat debugImageBinary;
        cv::cvtColor(resizedMask, debugImageBinary, cv::COLOR_GRAY2BGR);
        cv::Mat debugImage = image.clone();
#endif

        constexpr int padding_x = 10;
        constexpr int padding_y = 7;
        constexpr int min_area = 4;

        const cv::Rect imageBounds(0, 0, image.cols, image.rows);

        // Holds every cropped region.
        std::vector<cv::Mat> cropped_images;
        cropped_images.reserve(contours.size());

        std::println("find contours: {}", contours.size());
        for (size_t i = 0; i < contours.size(); ++i) {
            // Tight bounding box of the current contour.
            const cv::Rect original_box = cv::boundingRect(contours[i]);

            // Drop very small regions.
            if (original_box.area() < min_area) {
                std::println("area too small: {}", original_box.area());
                continue;
            }

            // Grow the box symmetrically, then clip it to the image. Clipping via
            // intersection keeps the padding symmetric even at the left/top edge.
            const cv::Rect padded_box(original_box.x - padding_x,
                                      original_box.y - padding_y,
                                      original_box.width + padding_x * 2,
                                      original_box.height + padding_y * 2);
            const cv::Rect expanded_box = padded_box & imageBounds;

            // The clipped box must still have a positive width and height.
            if (expanded_box.width <= 0 || expanded_box.height <= 0) {
                std::println("Skipping contour {} because expanded box is invalid.", i);
                continue;
            }

#if IsDebug
            // Debug outline (green box) on both canvases.
            cv::rectangle(debugImage, expanded_box, cv::Scalar(0, 255, 0, 255), 1);
            cv::rectangle(debugImageBinary, expanded_box, cv::Scalar(0, 255, 0, 255), 1);
#endif

            // Deep-copy the crop so it does not alias the source image.
            cropped_images.emplace_back(image(expanded_box).clone());
        }

#if IsDebug
        cv::imwrite("../output/debugImage.png", debugImage);
        cv::imwrite("../output/debugImageBinary.png", debugImageBinary);
#endif
        return cropped_images;
    }
}

/**
 * Two-stage OCR: a detection network finds text regions, a recognition network
 * turns each cropped region into a sequence of dictionary entries.
 */
class OCR {
private:
    ncnn::Net detectNet_;
    ncnn::Net recognizeNet_;
    std::vector<std::string> charDict_;

    /// Throws std::runtime_error when an ncnn load call reported failure (non-zero).
    static void requireLoaded(int status, const char *path) {
        if (status != 0) {
            throw std::runtime_error(std::string("failed to load ") + path);
        }
    }

public:
    /**
     * Loads both networks and the character dictionary from the paths given by
     * the macros at the top of this file.
     *
     * @throws std::runtime_error if any param/model file cannot be loaded
     */
    OCR() {
        requireLoaded(detectNet_.load_param(DetectParamPath), DetectParamPath);
        requireLoaded(detectNet_.load_model(DetectModelPath), DetectModelPath);
        requireLoaded(recognizeNet_.load_param(RecognizeParamPath), RecognizeParamPath);
        requireLoaded(recognizeNet_.load_model(RecognizeModelPath), RecognizeModelPath);
        charDict_ = Utils::getCharDict(CharDictPath);
    }

    /**
     * Detects text regions and returns one cropped image per region.
     *
     * Qt images are 32-bit (0xffRRGGBB), hence the 4-channel requirement.
     *
     * @param image 8-bit 4-channel image (CV_8UC4)
     * @return cropped copies of the detected regions; empty if the image is
     *         empty or inference fails
     */
    std::vector<cv::Mat> findRegions(const cv::Mat &image) {
        assert(image.type() == CV_8UC4 && "image type must be CV_8UC4(BGRA)");
        if (image.empty()) {
            return {};
        }

        // ncnn reads the buffer as tightly packed rows; copy if the Mat is a strided view.
        const cv::Mat pixels = image.isContinuous() ? image : image.clone();

        // Compute the network input size.
        uint32_t calcCols = 0;
        uint32_t calcRows = 0;
        Impl::calcTargetSize(static_cast<uint32_t>(pixels.cols), static_cast<uint32_t>(pixels.rows),
                             &calcCols, &calcRows, 0x3C0u, 0x20u);

        // NOTE(review): the pixel type is RGBA2RGB although the assertion above talks
        // about BGRA (recognizeText uses BGRA2BGR). Kept as in the original - confirm
        // the intended channel order.
        // Without the computed size the detections tend to smear together.
        ncnn::Mat in = ncnn::Mat::from_pixels_resize(
            pixels.data, ncnn::Mat::PIXEL_RGBA2RGB,
            pixels.cols, pixels.rows,
            static_cast<int>(calcCols), static_cast<int>(calcRows)
        );

        // Normalisation constants, kept as exact IEEE-754 bit patterns.
        // Approximately { 0.485f, 0.456f, 0.406f }.
        constexpr std::array<float, 3> meanVals{
            std::bit_cast<float>(0x3EF851ECu),
            std::bit_cast<float>(0x3EE978D5u),
            std::bit_cast<float>(0x3ECFDF3Bu)
        };
        // Approximately { 1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f }.
        constexpr std::array<float, 3> normVals{
            std::bit_cast<float>(0x3C8C4936u),
            std::bit_cast<float>(0x3C8F6AD8u),
            std::bit_cast<float>(0x3C8EC7ABu)
        };
        in.substract_mean_normalize(meanVals.data(), normVals.data());

        // Inference.
        ncnn::Mat out;
        ncnn::Extractor ex = detectNet_.create_extractor();
        if (ex.input("x", in) != 0 || ex.extract("sigmoid_0.tmp_0", out, 0) != 0 || out.empty()) {
            std::println(stderr, "text detection inference failed");
            return {};
        }

        // Scale the network output by 255.0f (bit pattern 0x437F0000) before
        // converting it to 8-bit pixels.
        constexpr std::array<float, 3> outputScale{
            std::bit_cast<float>(0x437F0000u),
            std::bit_cast<float>(0x437F0000u),
            std::bit_cast<float>(0x437F0000u)
        };
        out.substract_mean_normalize(nullptr, outputScale.data());

        // Convert to an 8-bit grayscale image.
        cv::Mat binaryMask(out.h, out.w, CV_8U);
        out.to_pixels(binaryMask.data, ncnn::Mat::PIXEL_GRAY);

        // Binarise. Threshold 76.5 and max value 255.0 correspond to the double
        // bit patterns 0x4053200000000000 and 0x406FE00000000000.
        cv::threshold(binaryMask, binaryMask, 76.5, 255.0, cv::THRESH_BINARY);

        assert(binaryMask.type() == CV_8UC1 && "mask type must be CV_8UC1");
        return Impl::getEachRegion(image, binaryMask);
    }

    /**
     * Recognises the text in one cropped region (BGRA input, i.e. a Qt image).
     *
     * The image is resized to a height of 48 px (width follows the aspect
     * ratio) and fed to the recognition network. For every output row the
     * best-scoring class is taken; class 0 and immediate repeats of the
     * previous row's class are dropped.
     *
     * @param image 8-bit 4-channel image (CV_8UC4)
     * @return (score, dictionary entry) pairs in output order; empty if the
     *         image is empty or inference fails
     */
    std::vector<std::pair<float, std::string> > recognizeText(const cv::Mat &image) {
        assert(image.type() == CV_8UC4 && "image type must be CV_8UC4(BGRA)");

        std::vector<std::pair<float, std::string> > results;
        if (image.empty()) {
            return results;
        }

        // ncnn reads the buffer as tightly packed rows; copy if the Mat is a strided view.
        const cv::Mat pixels = image.isContinuous() ? image : image.clone();

        constexpr int targetHeight = 0x30;
        const float aspect = static_cast<float>(pixels.cols) / static_cast<float>(pixels.rows);
        // Very narrow crops would truncate to a zero width; keep at least one column.
        const int targetWidth = std::max(1, static_cast<int>(aspect * 48.0));

        ncnn::Mat in2 = ncnn::Mat::from_pixels_resize(
            pixels.data, ncnn::Mat::PIXEL_BGRA2BGR,
            pixels.cols, pixels.rows,
            targetWidth, targetHeight
        );

        // mean = 127.5f (0x42FF0000), norm ~= 1 / 127.5f (0x3C008081).
        constexpr std::array<float, 3> meanVals2{
            std::bit_cast<float>(0x42FF0000u),
            std::bit_cast<float>(0x42FF0000u),
            std::bit_cast<float>(0x42FF0000u)
        };
        constexpr std::array<float, 3> normVals2{
            std::bit_cast<float>(0x3C008081u),
            std::bit_cast<float>(0x3C008081u),
            std::bit_cast<float>(0x3C008081u)
        };
        in2.substract_mean_normalize(meanVals2.data(), normVals2.data());

        ncnn::Mat out2;
        ncnn::Extractor extractor = recognizeNet_.create_extractor();
        // Original note next to this call: 0x24A (meaning not recorded).
        if (extractor.input("x", in2) != 0 || extractor.extract("softmax_11.tmp_0", out2, 0) != 0
            || out2.empty() || out2.w <= 0) {
            std::println(stderr, "text recognition inference failed");
            return results;
        }

        // Scan the output row by row.
        std::ptrdiff_t lastIndex = 0;
        for (int hIndex = 0; hIndex < out2.h; ++hIndex) {
            const float *rowBegin = out2.row(hIndex);
            const float *rowEnd = rowBegin + out2.w;
            const float *maxElementPtr = std::ranges::max_element(rowBegin, rowEnd);
            const std::ptrdiff_t index = maxElementPtr - rowBegin;

            const bool isNewSymbol = index > 0 && (hIndex <= 0 || index != lastIndex);
            if (isNewSymbol) {
                const auto dictIndex = static_cast<std::size_t>(index - 1);
                // NOTE(review): classes beyond the dictionary are skipped instead of
                // read out of bounds - confirm whether the model has extra trailing
                // classes that should map to a character.
                if (dictIndex < charDict_.size()) {
                    results.emplace_back(*maxElementPtr, charDict_[dictIndex]);
                }
            }
            lastIndex = index;
        }

        return results;
    }
};

/**
 * Loads "../test1.png", converts it to 8-bit BGRA, runs detection and
 * recognition and prints one line of text per detected region.
 *
 * @return 0 on success, 1 if the image cannot be used or an exception occurs
 */
int main() {
    try {
        cv::Mat original_image = cv::imread("../test1.png", cv::IMREAD_UNCHANGED);
        if (original_image.empty()) {
            std::println(stderr, "failed to read image: {}", "../test1.png");
            return 1;
        }

        // IMREAD_UNCHANGED keeps the file's channel count, so pick the matching conversion.
        switch (original_image.channels()) {
            case 1:
                cv::cvtColor(original_image, original_image, cv::COLOR_GRAY2BGRA);
                break;
            case 3:
                cv::cvtColor(original_image, original_image, cv::COLOR_BGR2BGRA);
                break;
            case 4:
                break; // already 4-channel
            default:
                std::println(stderr, "unsupported channel count: {}", original_image.channels());
                return 1;
        }
        if (original_image.type() != CV_8UC4) {
            std::println(stderr, "unsupported image depth, expected 8 bits per channel");
            return 1;
        }

        OCR ocr;
        for (const auto &element: ocr.findRegions(original_image)) {
            std::string t;
            for (const auto &entry: ocr.recognizeText(element)) {
                // std::println("Text: {}, Score: {}", entry.second, entry.first);
                t += entry.second;
            }
            std::println("Text: {}", t);
        }
    } catch (const std::exception &e) {
        std::println(stderr, "error: {}", e.what());
        return 1;
    }
    return 0;
}