投影分割算法—文字识别前的预处理（字符准确分割）

最新推荐文章于 2023-06-18 12:26:57 发布

三叔家的猫

最新推荐文章于 2023-06-18 12:26:57 发布

阅读量3.5k

点赞数 3

分类专栏： OCR 文章标签：深度学习人工智能

本文链接：https://blog.csdn.net/qq_39056987/article/details/118677903

版权

OCR 专栏收录该内容

18 篇文章 34 订阅

订阅专栏

序言

基于opencv、numpy，实现了投影分割算法对文字识别前的预处理，能够准确分割出字符的具体位置，有助于提高模型准确率，使用numpy实现某些运算过程，程序耗时在毫秒级别，对于识别速度效率而言几乎无影响。

一、使用场景

做过 OCR 的同学都知道，OCR 通常可以分为两个阶段：一、文字检测；二、文字识别。这两部分衔接起来，构成了 OCR 的识别任务。不过在这两个阶段之间往往还可以加入一些处理，有助于提高模型的准确率。比较典型的例子可以参考我之前的文章《OCR-文本检测后的文字纠正》：如果文字检测模型检测出来的文字倾斜角度较大，在送入识别网络前又不做任何纠正，识别率往往会非常低。
在这里插入图片描述

这部分的纠正在之前的文章中已经解决了，那么本文要解决的是当模型检测不准的时候，或者人工框选（有些需求可能需要人工去框选识别）的目标不准时，可能会出现如下情况：
在这里插入图片描述

可以看到文字的边缘有许多冗余的信息，这会干扰到文字识别的效果，我们需要的是精准的文字的位置，即红色框部分；

这是因为模型训练时所用的数据中，文字在图片中的占比通常比较大，并没有太多冗余信息。如果把上面这样的图片直接送入 CRNN 网络，图片首先会被等比例缩放到高度为 32 像素；文字本身所占比例就比较小，经过这样一压缩，文字信息会变得更加模糊，严重时会缩成一团，连字符笔画都无法看清。

所以需要在识别之前做一些处理。下面直接贴出代码实现：它基于上一篇文章分割部分的投影法，并对速度进行了优化。投影法的原理比较简单，稍微查阅一下资料即可，这里不再赘述。本实现同时应用了垂直投影和水平投影，代码分成了几个模块，比较容易理解：

import numpy as np
import cv2
from collections import Counter

def stretch(img):
    """Linearly stretch the intensities of ``img`` to the full 0-255 range.

    The minimum value of ``img`` is mapped to 0 and the maximum to 255.

    Args:
        img: Numeric numpy array (the caller in this file passes a grayscale
            image).

    Returns:
        A ``uint8`` array with the same shape as ``img``. A uniform image
        (max == min) has no contrast to stretch and yields an all-zero array
        instead of dividing by zero.
    """
    maxi = float(img.max())
    mini = float(img.min())
    span = maxi - mini

    if span == 0:
        # Every pixel equals the minimum, which the mapping sends to 0.
        return np.zeros(img.shape, dtype="uint8")

    scale = 255 / span
    offset = (255 * mini) / span
    # Compute in float64 explicitly so the result does not depend on the
    # input dtype, then truncate back to 8 bits.
    stretched = img.astype(np.float64) * scale - offset
    return stretched.astype("uint8")

def dobinaryzation(img):
    """Binarise ``img`` around the midpoint of its intensity range.

    The threshold is halfway between the smallest and largest value found in
    ``img``; it is passed to ``cv2.threshold`` with ``cv2.THRESH_BINARY`` and
    a maximum value of 255.

    Returns:
        The thresholded image produced by ``cv2.threshold``.
    """
    brightest = float(img.max())
    darkest = float(img.min())

    midpoint = brightest - ((brightest - darkest) / 2)
    _, binary = cv2.threshold(img, midpoint, 255, cv2.THRESH_BINARY)
    return binary

def getHProjection(image):
    """Count the white (255) pixels in every row of a 2-D image.

    Args:
        image: 2-D numpy array; only pixels exactly equal to 255 are counted.

    Returns:
        A list of Python ints with one entry per image row, giving the number
        of 255-valued pixels in that row.

    Raises:
        ValueError: If ``image.shape`` does not unpack into exactly two
            dimensions.
    """
    # Unpacking keeps the original 2-D-only contract (a colour image fails
    # here instead of silently producing a nested list).
    _rows, _cols = image.shape
    # One vectorised reduction along each row replaces the per-pixel counting.
    return np.count_nonzero(image == 255, axis=1).tolist()

def getWProjection(image):
    """Count the white (255) pixels in every column of a 2-D image.

    Args:
        image: 2-D numpy array; only pixels exactly equal to 255 are counted.

    Returns:
        A list of Python ints with one entry per image column (its length is
        the image width), giving the number of 255-valued pixels in that
        column.

    Raises:
        ValueError: If ``image.shape`` does not unpack into exactly two
            dimensions.
    """
    # Unpacking keeps the original 2-D-only contract (a colour image fails
    # here instead of silently producing a nested list).
    _rows, _cols = image.shape
    # One vectorised reduction down each column replaces the per-pixel counting.
    return np.count_nonzero(image == 255, axis=0).tolist()

def _find_runs(projection):
    """Locate runs of consecutive positive entries in a projection profile.

    Returns:
        A ``(starts, ends)`` pair of equally long lists. ``ends`` holds the
        index just past each run, so ``projection[s:e]`` is exactly the run.
    """
    starts, ends = [], []
    inside = False
    for pos, value in enumerate(projection):
        if value > 0 and not inside:
            starts.append(pos)
            inside = True
        elif value <= 0 and inside:
            ends.append(pos)
            inside = False
    if inside:
        # The last run touches the end of the profile; close it there.
        ends.append(len(projection))
    return starts, ends


def find_license(img, padding=5):
    """Crop ``img`` to its densest horizontal band of edge pixels.

    Pipeline: grayscale -> contrast stretch -> binarisation -> Canny edges ->
    morphological closing -> row/column projections -> crop.

    Args:
        img: BGR image, as required by the ``cv2.COLOR_BGR2GRAY`` conversion.
        padding: Margin in pixels kept around the detected region on every
            side, clipped to the image border. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        The slice of ``img`` covering the selected region plus padding. When
        the preprocessing leaves no foreground pixel at all, there is nothing
        to crop to and ``img`` is returned unchanged.
    """
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Contrast stretch, then binarise around the mid-intensity.
    stretchedimg = stretch(gray_img)
    binaryimg = dobinaryzation(stretchedimg)

    # Canny edge detection. The two thresholds are the image height and
    # width, exactly as in the original code.
    # NOTE(review): size-dependent thresholds look unusual - confirm intent.
    canny = cv2.Canny(binaryimg, binaryimg.shape[0], binaryimg.shape[1])

    # Morphological closing with a kernel of 3 rows by 19 columns
    # (presumably to bridge the horizontal gaps between neighbouring strokes).
    kernel = np.ones((3, 19), np.uint8)
    closingimg = cv2.morphologyEx(canny, cv2.MORPH_CLOSE, kernel)

    H = getHProjection(closingimg)  # white pixels per row
    W = getWProjection(closingimg)  # white pixels per column

    row_starts, row_ends = _find_runs(H)
    col_starts, col_ends = _find_runs(W)
    if not row_starts or not col_starts:
        # No foreground survived preprocessing; max()/min() on the empty
        # lists would raise, so hand the input back untouched.
        return img

    # Pick the row band holding the most white pixels (first one on ties).
    index_h = max(
        range(len(row_starts)),
        key=lambda k: sum(H[row_starts[k]:row_ends[k]]),
    )

    # Horizontal extent spans every column run of the whole image; the runs
    # are found left to right, so the first start / last end are min / max.
    w_start = col_starts[0]
    w_end = col_ends[-1]

    top = max(row_starts[index_h] - padding, 0)
    bottom = min(row_ends[index_h] + padding, img.shape[0])
    left = max(w_start - padding, 0)
    right = min(w_end + padding, img.shape[1])

    return img[top:bottom, left:right]

if __name__ == "__main__":
    # Demo: crop one image, print the processing time and show before/after.
    import time

    img_path = r"/home/cai/1.png"
    img = cv2.imread(img_path)
    # cv2.imread reports an unreadable or missing file by returning None
    # instead of raising, so fail early with a readable message.
    if img is None:
        raise SystemExit("Error: Could not load image: " + img_path)

    cv2.imshow("img0", img)
    # perf_counter is the clock intended for measuring short intervals.
    time1 = time.perf_counter()
    cropped = find_license(img)
    time2 = time.perf_counter()
    print(time2 - time1)
    cv2.imshow("img", cropped)
    cv2.waitKey()

运行前后图像的对比和速度（最上方的数字）：
在这里插入图片描述

再附上C++实现：

//
// Created by cai on 2021/7/12.
//

#include <opencv2/opencv.hpp>
#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
using namespace std;

// Finds the first and last index of `counts` that hold a non-zero value.
// Returns false (leaving `first` and `last` untouched) when every entry is 0.
static bool foreground_span(const std::vector<int>& counts, int& first, int& last)
{
    bool found = false;
    for (int i = 0; i < static_cast<int>(counts.size()); i++) {
        if (counts[i] == 0)
            continue;
        if (!found) {
            first = i;
            found = true;
        }
        last = i;
    }
    return found;
}

// Crops `images` to the bounding box of the white (255) pixels of `dst`.
//
// dst:    single-channel 8-bit binary image; pixels equal to 255 are foreground.
// images: image to crop (the caller passes the original colour image).
//
// Returns a deep copy of the cropped region, padded by the same fixed margin
// as before and clamped to the bounds of `images`. When `dst` contains no
// foreground pixel at all, a copy of the whole of `images` is returned.
Mat vertical_projection(Mat dst, Mat images)
{
    const int src_width = dst.cols;   // number of columns (image width)
    const int src_height = dst.rows;  // number of rows (image height)

    // Per-row and per-column counts of white pixels. std::vector releases its
    // storage automatically, unlike the raw new[] arrays used previously.
    std::vector<int> row_counts(src_height, 0);
    std::vector<int> col_counts(src_width, 0);

    for (int i = 0; i < src_height; i++) {
        for (int j = 0; j < src_width; j++) {
            if (dst.at<uchar>(i, j) == 255) {
                row_counts[i]++;
                col_counts[j]++;
            }
        }
    }

    // First/last row and column that contain any foreground (inclusive).
    int h_start = 0, h_end = 0;
    int w_start = 0, w_end = 0;
    if (!foreground_span(row_counts, h_start, h_end) ||
        !foreground_span(col_counts, w_start, w_end)) {
        // Nothing to crop to; avoid dereferencing an empty range.
        return images.clone();
    }

    // Same rectangle as the original code, then intersected with the image so
    // a box that touches the border no longer throws on ROI extraction.
    Rect roi(w_start - 5, h_start - 5, w_end - w_start + 10, h_end - h_start + 10);
    roi &= Rect(0, 0, images.cols, images.rows);
    if (roi.width <= 0 || roi.height <= 0) {
        return images.clone();
    }

    return images(roi).clone();
}

int main(){
    //载入图像
    Mat img = imread("/home/cai/1.png");
    if (img.empty()){
        cout << "Error: Could not load image" << endl;
        return 0;
    }
    Mat gray;
    cvtColor(img, gray, COLOR_BGR2GRAY);

    double minv = 0.0, maxv = 0.0,thres = 0.0;
    double* minp = &minv;
    double* maxp = &maxv;

    minMaxIdx(gray,minp,maxp);

    //先转为灰度图
    Mat dst;
    thres = maxv - ((maxv-minv)/2);
    threshold(gray,dst,thres,255,THRESH_BINARY);

    Canny(dst,dst,dst.rows,dst.cols);

    Mat element;
    element = getStructuringElement(MORPH_RECT, Size(19, 3));
    morphologyEx(dst, dst, MORPH_CLOSE, element);

    Mat img_crop = vertical_projection(dst,img);
    imshow("img_crop",img_crop);
    waitKey(0);
    return 0;
}

三叔家的猫

关注

3
点赞
踩
30

收藏

觉得还不错? 一键收藏
1
评论
投影分割算法—文字识别前的预处理（字符准确分割）

序言基于opencv、numpy，实现了投影分割算法对文字识别前的预处理，能够准确分割出字符的具体位置，有助于提高模型准确率，使用numpy实现某些运算过程，程序耗时在毫秒级别，对于识别速度效率而言几乎无影响。一、使用场景搞过ocr的同学都知道，OCR通常可以分为两阶段：一、文字检测；二、文字识别。这两部分衔接起来，构成了OCR的识别任务。但是往往在这两个阶段的中间还可以加入一些处理，能够有助于提高模型的准确率，比较典型的例子可以看我之前的文章OCR-文本检测后的文字纠正，如果文字检测模型检测出来的文
复制链接

扫一扫