OCR-文本检测后的文字纠正

最新推荐文章于 2024-05-14 00:18:11 发布

三叔家的猫

最新推荐文章于 2024-05-14 00:18:11 发布

阅读量5.1k

点赞数 10

分类专栏： OCR 文章标签：计算机视觉 opencv python

本文链接：https://blog.csdn.net/qq_39056987/article/details/108671180

版权

OCR 专栏收录该内容

18 篇文章 36 订阅

订阅专栏

序言

主要是对规则的文本检测的角度纠正，如果弯曲的很厉害，那也没办法，因为最近一直在做OCR方面的工作，包括文本检测、文本识别等等，关于检测后的文本角度纠正，网上查了很久没发现一个靠谱的，想了很久，决定自己用opencv实现一下，本文记录下实现的过程。主要是对YOLO检测后的倾斜文本做纠正，纠正后的图片送入到识别网络中能够大大的提升模型的识别率，当然不一定对所有场景下的文本都适用，实现仅供参考和学习。

一、关键技术

霍夫变换预测倾斜角度；
图像二值化用于分割文本区域；
ROI区域获取；
得到纠正后的文本区域截图。

二、实现过程

2.1 样本准备

首先获得文本检测后的截图，例如我们这里有几张发票检测下来的倾斜角度比较大的图片：
在这里插入图片描述

我们希望能够将图中的文字方向正确的纠正过来，具体实现如下。

首先通过霍夫变换找出图片中的直线，并通过这些直线求得近似真实值的旋转角度，再然后使用旋转函数对图片进行角度旋转。在这里要说明的是，其实文本图像中的直线是比较多的，我在反复测试很多检测文本时发现，纠正的角度还是很标准的，不过也有例外，总的来说，文本越清晰，求出的角度越准确。先贴上角度纠正前后的效果对比：

在这里插入图片描述

这里旋转后为了不缺边，使用了图像的RGB均值对边缘进行填充。

第二步是将图片灰度后进行二值化处理，二值化后用到了闭运算，这样能够保证相连的文字不会被分成几个部分，最后得到的二值图像如下，对比上图，可以看到有文字的区域都被区分的很开，这里尝试了后面接着用开运算，但是效果不理想，所以果断放弃这个操作。
在这里插入图片描述

得到文字区域的二值图像后，接下来就对这些区域进行ROI提取，提取出面积最大的最小外接矩形区域，其他的剔除，最后得到的纠正效果：

在这里插入图片描述

在这里插入图片描述
前后对比一下，可以说效果非常不错。当然，正常情况下文本的倾斜角度不会太大：如果霍夫变换求出的角度太小，对识别的影响不是特别大，可以不做纠正；如果求出的角度太大，则很可能是角度检测出错了。因此我们可以在程序里对角度范围进行限制，超出范围则忽略并直接返回原图，这部分可以在Hough函数中修改。

三、代码实现

最后贴上代码实现：

# -*- coding: utf-8 -*-
import cv2
import numpy as np
import math

def compute(img):
    """Return the mean value of each of the first three channels of ``img``.

    ``img`` is indexed as ``img[:, :, k]`` for k = 0, 1, 2, so it must have
    at least three channels on its last axis.

    :return: tuple ``(B_mean, G_mean, R_mean)`` holding the means of channel
        0, channel 1 and channel 2 respectively (the naming follows OpenCV's
        BGR channel order).
    """
    # Channel 0 is averaged first, then 1, then 2, matching the return order.
    B_mean, G_mean, R_mean = (np.mean(img[:, :, channel]) for channel in range(3))
    return B_mean, G_mean, R_mean


def rotate_bound(image, angle):
    """Rotate ``image`` by ``angle`` degrees without cutting off its corners.

    The output canvas is enlarged to the bounding box of the rotated image,
    and the area not covered by the source is filled with the per-channel
    mean colour returned by ``compute``.

    :param image: image array; only ``image.shape[:2]`` (height, width) is
        read here, and ``compute`` reads its first three channels.
    :param angle: rotation angle in degrees; it is negated before being
        passed to ``cv2.getRotationMatrix2D``.
    :return: the rotated image produced by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    center_x, center_y = width // 2, height // 2

    # Rotation about the image centre, scale 1.0.
    matrix = cv2.getRotationMatrix2D((center_x, center_y), -angle, 1.0)
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # Size of the axis-aligned box that contains the rotated image.
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # Shift the translation part so the result is centred on the new canvas.
    matrix[0, 2] += (new_width / 2) - center_x
    matrix[1, 2] += (new_height / 2) - center_y

    fill_colour = compute(image)
    return cv2.warpAffine(image, matrix, (new_width, new_height), borderValue=fill_colour)


def Hough(img):
    """Estimate the skew of ``img`` from a Hough line and correct it.

    The first line returned by ``cv2.HoughLines`` is converted to a slope
    angle. If that angle lies strictly between 2 and 12 degrees in magnitude,
    the image is rotated by ``rotate_bound`` and cropped by ``find_license``.

    :param img: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
    :return: the corrected image, or ``img`` itself when no line is found,
        the line is exactly vertical/horizontal, or the angle is outside the
        correction range.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 25, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 0)
    # HoughLines yields None when nothing is detected (e.g. a blank crop);
    # indexing it would raise, so there is nothing to correct in that case.
    if lines is None or len(lines) == 0:
        return img

    # Turn the (rho, theta) line into two far-apart integer end points.
    rho, theta = lines[0][0]
    a = np.cos(theta)
    b = np.sin(theta)
    x0 = a * rho
    y0 = b * rho
    x1 = int(x0 + 1000 * (-b))
    y1 = int(y0 + 1000 * (a))
    x2 = int(x0 - 1000 * (-b))
    y2 = int(y0 - 1000 * (a))
    # Vertical line (undefined slope) or horizontal line (no skew).
    if x1 == x2 or y1 == y2:
        return img

    t = float(y2 - y1) / (x2 - x1)
    rotate_angle = math.degrees(math.atan(t))
    print(rotate_angle)

    # Only skews strictly between 2 and 12 degrees (either sign) are corrected.
    if not 2 < abs(rotate_angle) < 12:
        return img

    rotate_img = rotate_bound(img, -rotate_angle)
    afterimg = find_license(rotate_img)
    if afterimg is None:
        # The cropping step found no region; keep the rotated image rather
        # than handing None to imshow and to the caller.
        afterimg = rotate_img
    cv2.imshow("afterimg", afterimg)
    return afterimg


def stretch(img):
    """Linearly stretch the intensities of ``img`` to span 0..255, in place.

    Each value ``v`` is replaced by ``255 / (max - min) * v - 255 * min /
    (max - min)``; the result is written back into ``img``, so it is cast to
    the array's own dtype.

    :param img: numeric array; it is modified in place.
    :return: the same array object that was passed in.
    """
    if img.size == 0:
        return img

    maxi = float(img.max())
    mini = float(img.min())
    span = maxi - mini
    if span == 0:
        # A flat image has no contrast to stretch; the per-pixel formula
        # would divide by zero, so leave the data untouched.
        return img

    # Same formula as the original per-pixel loop, evaluated on the whole
    # array at once in float64 and then stored back into ``img``.
    img[...] = (255 / span) * img.astype(np.float64) - (255 * mini) / span
    return img

def dobinaryzation(img):
    """Binarise ``img`` with a threshold halfway between its min and max.

    :param img: array accepted by ``cv2.threshold``.
    :return: the thresholded image returned by ``cv2.threshold`` with
        ``cv2.THRESH_BINARY`` and a maximum value of 255.
    """
    brightest = float(img.max())
    darkest = float(img.min())

    # Midpoint of the intensity range, written as max - half the range.
    half_range = (brightest - darkest) / 2
    midpoint = brightest - half_range

    _, binary = cv2.threshold(img, midpoint, 255, cv2.THRESH_BINARY)
    return binary

def find_license(img):
    """Pre-process ``img`` and return the crop chosen by ``ROI_extract``.

    Pipeline: grayscale -> contrast stretch -> binarise -> Canny edges ->
    morphological closing with a 3x19 kernel -> region extraction.

    :param img: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
    :return: whatever ``ROI_extract`` returns for the closed edge map.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    binary = dobinaryzation(stretch(gray))

    # NOTE(review): the two Canny thresholds are the image height and width;
    # this looks unintentional -- confirm before relying on it.
    edge_map = cv2.Canny(binary, binary.shape[0], binary.shape[1])

    # 3 rows x 19 columns: the closing kernel is much wider than it is tall.
    close_kernel = np.ones((3, 19), np.uint8)
    closed = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE, close_kernel)

    return ROI_extract(img, closed)

def findVertices(points):
    """Return the extreme corners of the minimum-area rectangle of ``points``.

    :param points: point set accepted by ``cv2.minAreaRect``.
    :return: ``(vertices, rect)`` where ``rect`` is the value returned by
        ``cv2.minAreaRect`` and ``vertices`` is a 4x2 integer array holding
        the top-most, bottom-most, left-most and right-most corner, in that
        order, each as ``[x, y]``.
    """
    rect = cv2.minAreaRect(points)
    # boxPoints gives float corners; truncate them to integers.
    # ``np.int0`` was removed in NumPy 2.0; ``np.intp`` is the type it aliased.
    box = cv2.boxPoints(rect).astype(np.intp)

    # argmin/argmax return the first extreme corner, which is the same corner
    # the previous ``np.where(...)[0]`` lookup selected on ties.
    top = box[np.argmin(box[:, 1])]
    bottom = box[np.argmax(box[:, 1])]
    left = box[np.argmin(box[:, 0])]
    right = box[np.argmax(box[:, 0])]

    vertices = np.array([top, bottom, left, right])
    return vertices, rect

def ROI_extract(image, thresh):
    """Crop ``image`` to the contour of ``thresh`` with the largest bounding box.

    Contours are found in ``thresh``; the one whose upright bounding
    rectangle has the largest area (the first one on ties) is selected, the
    rectangle is padded by 2 pixels on each side (clamped to the image), and
    that region of ``image`` is returned. The selected rectangle is also drawn
    on a copy of ``image`` and shown in a window named ``image_copy``.

    :param image: image to crop.
    :param thresh: single-channel image passed to ``cv2.findContours``.
    :return: the cropped view of ``image``, or ``None`` when no contour exists.
    """
    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; [-2] is the contour list in
    # both layouts.
    contours = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return None

    # max() keeps the first maximum, like the previous strict ``>`` comparison.
    x, y, w, h = max(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda bounds: bounds[2] * bounds[3],
    )

    rows, cols = image.shape[:2]
    y1 = max(0, y - 2)
    x1 = max(0, x - 2)
    y2 = min(rows, y + h + 2)
    x2 = min(cols, x + w + 2)
    img = image[y1:y2, x1:x2]

    # Outline the selected text region for visual inspection.
    image_copy = image.copy()
    cv2.rectangle(image_copy, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.imshow('image_copy', image_copy)

    return img


if __name__=='__main__':
    import os

    # Demo driver: show every image in the folder before and after correction.
    path = r"G:\CRNN_data\diff_img\test"
    for name in os.listdir(path):
        img = cv2.imread(os.path.join(path, name), cv2.IMREAD_COLOR)
        if img is None:
            # imread returns None instead of raising for files it cannot
            # decode (non-images, sub-directories); skip those entries.
            continue
        cv2.imshow("img0", img)
        img = Hough(img)
        cv2.imshow("hough_img", img)
        # Wait for a key press before moving on to the next file.
        cv2.waitKey()

PS：最后，因为是基于传统方法去做的，鲁棒性有待加强，旋转角度基本上没什么问题，主要是阈值分割不是很稳定，可以尝试在旋转纠正后使用其他的方法分割出字符，如果在自己的数据上不是很适用的话，还可以考虑在识别网络中加入TPS进行纠正，关于TPS这里就不说了，如果要说的话又可以写一篇博文了，感兴趣的可以百度。不过我对这种做法持谨慎态度，因为我在加进去后精度反而下降了，当然这和我本身的数据有关。

update另一种实现，修改了字符分割部分，使用水平投影对字符进行分割，最后对投影的区域进行筛选分割，分割的效果稳定性更好：

在这里插入图片描述

# -*- coding: utf-8 -*-
import cv2
import numpy as np
import math

def compute(img):
    """Return the mean value of each of the first three channels of ``img``.

    ``img`` is indexed as ``img[:, :, k]`` for k = 0, 1, 2, so it must have
    at least three channels on its last axis.

    :return: tuple ``(B_mean, G_mean, R_mean)`` holding the means of channel
        0, channel 1 and channel 2 respectively (the naming follows OpenCV's
        BGR channel order).
    """
    # Channel 0 is averaged first, then 1, then 2, matching the return order.
    B_mean, G_mean, R_mean = (np.mean(img[:, :, channel]) for channel in range(3))
    return B_mean, G_mean, R_mean

def rotate_bound(image, angle):
    """Rotate ``image`` by ``angle`` degrees without cutting off its corners.

    The output canvas is enlarged to the bounding box of the rotated image,
    and the area not covered by the source is filled with the per-channel
    mean colour returned by ``compute``.

    :param image: image array; only ``image.shape[:2]`` (height, width) is
        read here, and ``compute`` reads its first three channels.
    :param angle: rotation angle in degrees; it is negated before being
        passed to ``cv2.getRotationMatrix2D``.
    :return: the rotated image produced by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    center_x, center_y = width // 2, height // 2

    # Rotation about the image centre, scale 1.0.
    matrix = cv2.getRotationMatrix2D((center_x, center_y), -angle, 1.0)
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # Size of the axis-aligned box that contains the rotated image.
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # Shift the translation part so the result is centred on the new canvas.
    matrix[0, 2] += (new_width / 2) - center_x
    matrix[1, 2] += (new_height / 2) - center_y

    fill_colour = compute(image)
    return cv2.warpAffine(image, matrix, (new_width, new_height), borderValue=fill_colour)

def Hough(img):
    """Estimate the skew of ``img`` from a Hough line and correct it.

    The first line returned by ``cv2.HoughLines`` is converted to a slope
    angle. If that angle lies strictly between 3 and 12 degrees in magnitude,
    the image is rotated by ``rotate_bound`` and cropped by ``find_license``.

    :param img: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
    :return: the corrected image, or ``img`` itself when no line is found,
        the line is exactly vertical/horizontal, or the angle is outside the
        correction range.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 25, 150, apertureSize=3)

    lines = cv2.HoughLines(edges, 1, np.pi / 180, 0)
    # HoughLines yields None when nothing is detected (e.g. a blank crop);
    # indexing it would raise, so there is nothing to correct in that case.
    if lines is None or len(lines) == 0:
        return img

    # Turn the (rho, theta) line into two far-apart integer end points.
    rho, theta = lines[0][0]
    a = np.cos(theta)
    b = np.sin(theta)
    x0 = a * rho
    y0 = b * rho
    x1 = int(x0 + 1000 * (-b))
    y1 = int(y0 + 1000 * (a))
    x2 = int(x0 - 1000 * (-b))
    y2 = int(y0 - 1000 * (a))
    # Vertical line (undefined slope) or horizontal line (no skew).
    if x1 == x2 or y1 == y2:
        return img

    t = float(y2 - y1) / (x2 - x1)
    rotate_angle = math.degrees(math.atan(t))
    print(rotate_angle)

    # Only skews strictly between 3 and 12 degrees (either sign) are corrected.
    if not 3 < abs(rotate_angle) < 12:
        return img

    rotate_img = rotate_bound(img, -rotate_angle)
    afterimg = find_license(rotate_img)
    return afterimg

def stretch(img):
    """Linearly stretch the intensities of ``img`` to span 0..255, in place.

    Each value ``v`` is replaced by ``255 / (max - min) * v - 255 * min /
    (max - min)``; the result is written back into ``img``, so it is cast to
    the array's own dtype.

    :param img: numeric array; it is modified in place.
    :return: the same array object that was passed in.
    """
    if img.size == 0:
        return img

    maxi = float(img.max())
    mini = float(img.min())
    span = maxi - mini
    if span == 0:
        # A flat image has no contrast to stretch; the per-pixel formula
        # would divide by zero, so leave the data untouched.
        return img

    # Same formula as the original per-pixel loop, evaluated on the whole
    # array at once in float64 and then stored back into ``img``.
    img[...] = (255 / span) * img.astype(np.float64) - (255 * mini) / span
    return img

def dobinaryzation(img):
    """Binarise ``img`` with a threshold halfway between its min and max.

    :param img: array accepted by ``cv2.threshold``.
    :return: the thresholded image returned by ``cv2.threshold`` with
        ``cv2.THRESH_BINARY`` and a maximum value of 255.
    """
    brightest = float(img.max())
    darkest = float(img.min())

    # Midpoint of the intensity range, written as max - half the range.
    half_range = (brightest - darkest) / 2
    midpoint = brightest - half_range

    _, binary = cv2.threshold(img, midpoint, 255, cv2.THRESH_BINARY)
    return binary

def getHProjection(image, show=True):
    """Count the pixels equal to 255 in every row of a 2-D image.

    :param image: 2-D array (unpacking ``image.shape`` into two values fails
        for any other rank).
    :param show: when true (the default, matching the previous behaviour),
        the horizontal projection is also rendered as an image and displayed
        in a window named ``hProjection2``. Pass ``False`` to only compute
        the counts, e.g. in a headless environment.
    :return: list of ``int`` with one entry per row: the number of pixels in
        that row whose value is exactly 255.
    """
    _, width = image.shape

    # One vectorised pass replaces the per-pixel Python loops.
    counts = np.count_nonzero(image == 255, axis=1)

    if show:
        # Row y of the projection image has its first counts[y] pixels white.
        columns = np.arange(width)[np.newaxis, :]
        hProjection = (columns < counts[:, np.newaxis]).astype(np.uint8) * 255
        cv2.imshow('hProjection2', hProjection)

    return counts.tolist()

def find_license(img):
    """Crop ``img`` to the band of rows that contains the most edge pixels.

    Pipeline: grayscale -> contrast stretch -> binarise -> Canny edges ->
    morphological closing with a 3x19 kernel -> horizontal projection. The
    projection is split into runs of consecutive non-empty rows and the run
    with the largest total count (the first one on ties) is cropped out at
    full image width.

    :param img: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
    :return: the selected horizontal band of ``img``, or ``img`` unchanged
        when the projection contains no non-empty row.
    """
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    stretchedimg = stretch(gray_img)
    binaryimg = dobinaryzation(stretchedimg)

    # NOTE(review): the two Canny thresholds are the image height and width;
    # this looks unintentional -- confirm before relying on it.
    canny = cv2.Canny(binaryimg, binaryimg.shape[0], binaryimg.shape[1])

    kernel = np.ones((3, 19), np.uint8)
    closingimg = cv2.morphologyEx(canny, cv2.MORPH_CLOSE, kernel)

    H = getHProjection(closingimg)

    # Collect [start, end) row ranges of consecutive rows with a positive count.
    runs = []
    run_start = None
    for row, count in enumerate(H):
        if count > 0 and run_start is None:
            run_start = row
        elif count <= 0 and run_start is not None:
            runs.append((run_start, row))
            run_start = None
    if run_start is not None:
        # The last run reaches the bottom edge of the image.
        runs.append((run_start, len(H)))

    if not runs:
        # No white pixel anywhere: taking max() of an empty list would raise,
        # so return the image uncropped instead.
        return img

    # max() keeps the first maximum, like ``list.index(max(...))`` did.
    top, bottom = max(runs, key=lambda run: sum(H[run[0]:run[1]]))
    cropImg = img[top:bottom, 0:img.shape[1]]
    return cropImg

if __name__=='__main__':
    import os

    # Demo driver: correct every image in the folder, save and display it.
    path = r"E:\PycharmProjects\yolov5_invoice_api\inference\output_crop"
    for name in os.listdir(path):
        img = cv2.imread(os.path.join(path, name), cv2.IMREAD_COLOR)
        if img is None:
            # imread returns None instead of raising for files it cannot
            # decode (non-images, sub-directories); skip those entries.
            continue
        cv2.imshow("img0", img)
        img = Hough(img)
        cv2.imwrite(r"G:\CRNN_data\diff_img\test_save\{}".format(name), img)
        cv2.imshow("hough_img", img)
        # Wait for a key press before moving on to the next file.
        cv2.waitKey()

速度慢了一点，不过还存在很大的优化空间，上面代码涉及了很多for循环，这是非常耗时的，可使用numpy数组运算代替for循环，速度可提升到毫秒级别。

三叔家的猫

关注

10
点赞
踩
49

收藏

觉得还不错? 一键收藏
5
评论
OCR-文本检测后的文字纠正

序言主要是对规则的文本检测的角度纠正，如果弯曲的很厉害，那也没办法，因为最近一直在做OCR方面的工作，包括文本检测、文本识别等等，关于检测后的文本角度纠正，网上查了很久没发现一个靠谱的，想了很久，决定自己用opencv实现一下，本文记录下实现的过程。本文主要是对YOLO检测后的倾斜文本做纠正，纠正后的图片送入到识别网络中能够大大的提升模型的识别率，当然不一定对所有场景下的文本都适用，本文实现仅供参考和学习。一、关键技术霍夫变换预测倾斜角度；图像二值化用于分割文本区域；ROI区域获取；得到纠正
复制链接

扫一扫

专栏目录