PaddleOCR Code Study: Detailed Annotations

Visualization code

def draw_ocr_box_txt(image,
                     boxes,
                     txts,
                     scores=None,
                     drop_score=0.5,
                     font_path="./doc/simfang.ttf"):
    """
    把结果画在图片上
    image,  图片
    boxes,  预测的点坐标 [[(171, 1308), (1440, 1462), (1438, 1692), (162, 1539)], ...]
    txts,   文本        ["888", ...]
    scores = None,  分数
    drop_score = 0.5, 丢弃阈值
    font_path = "./doc/simfang.ttf"  字体
    """
    h, w = image.height, image.width
    img_left = image.copy()
    # PIL.Image.new() creates a new image with the given mode and size;
    # here it creates a blank white canvas for the text panel
    img_right = Image.new('RGB', (w, h), (255, 255, 255))
    # ImageDraw provides simple 2D drawing on Image objects; Draw() returns a
    # drawing handle bound to the given image
    draw_left = ImageDraw.Draw(img_left)
    draw_right = ImageDraw.Draw(img_right)
    for idx, (box, txt) in enumerate(zip(boxes, txts)):
        if scores is not None and scores[idx] < drop_score:
            continue
        # pick a random RGB color for this box
        color = (random.randint(0, 255), random.randint(0, 255),
                 random.randint(0, 255))
        # draw a filled polygon through the four corner points (fill = fill color)
        draw_left.polygon(box, fill=color)
        # draw the polygon outline on the right-hand text panel (outline = border color)
        draw_right.polygon(
            [
                box[0][0], box[0][1], box[1][0], box[1][1], box[2][0],
                box[2][1], box[3][0], box[3][1]
            ],
            outline=color)
        # The four points are ordered top-left, top-right, bottom-right, bottom-left.
        # Box height: Euclidean (Pythagorean) distance between the top-left and bottom-left points
        box_height = math.sqrt((box[0][0] - box[3][0])**2 +
                               (box[0][1] - box[3][1])**2)
        # Box width: Euclidean distance between the top-left and top-right points
        box_width = math.sqrt((box[0][0] - box[1][0])**2 +
                              (box[0][1] - box[1][1])**2)
        # Decide whether the box is vertical or horizontal: if the height is
        # more than twice the width, treat the box as vertical
        if box_height > 2 * box_width:
            # font size scales with the box width
            font_size = max(int(box_width * 0.9), 10)
            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
            # initial y coordinate
            cur_y = box[0][1]
            # write the text vertically, one character at a time
            for c in txt:
                char_size = font.getsize(c)  # (width, height) of this character
                draw_right.text(
                    (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
                cur_y += char_size[1]
        # horizontal box
        else:
            font_size = max(int(box_height * 0.8), 10)
            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
            draw_right.text(
                [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)
    # Image.blend composites the two images; 0.5 is the opacity of the overlay
    # (the larger the value, the more opaque it is)
    img = Image.blend(image, img_left, 0.5)
    # create a new white RGB canvas; the width is doubled so the blended image
    # and the text panel can be pasted side by side
    img_show = Image.new('RGB', (w * 2, h), (255, 255, 255))
    # paste both images onto img_show
    img_show.paste(img, (0, 0, w, h))
    img_show.paste(img_right, (w, 0, w * 2, h))
    return np.array(img_show)  # as an ndarray the result can be read by OpenCV or fed into a network
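
A minimal usage sketch for draw_ocr_box_txt; the image path, boxes, texts and scores below are made-up placeholders, and the imports are the ones the function body relies on:

import math
import random
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

image = Image.open("./doc/imgs/demo.jpg").convert('RGB')  # hypothetical image path
boxes = [[(171, 1308), (1440, 1462), (1438, 1692), (162, 1539)]]
txts = ["888"]
scores = [0.96]
vis = draw_ocr_box_txt(image, boxes, txts, scores,
                       drop_score=0.5, font_path="./doc/simfang.ttf")
cv2.imwrite("vis.jpg", vis[:, :, ::-1])  # flip RGB back to BGR for cv2.imwrite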

predict_system.py

class TextSystem(object):
    def __init__(self, args):
        self.text_detector = predict_det.TextDetector(args)
        self.text_recognizer = predict_rec.TextRecognizer(args)
        self.use_angle_cls = args.use_angle_cls
        self.drop_score = args.drop_score
        if self.use_angle_cls:
            self.text_classifier = predict_cls.TextClassifier(args)

    def get_rotate_crop_image(self, img, points):
        '''
        Crop the detected region: apply a perspective transform, then rotate the
        crop if it comes out much taller than wide. The commented-out lines below
        show an earlier axis-aligned cropping approach:

        img_height, img_width = img.shape[0:2]
        left = int(np.min(points[:, 0]))
        right = int(np.max(points[:, 0]))
        top = int(np.min(points[:, 1]))
        bottom = int(np.max(points[:, 1]))
        img_crop = img[top:bottom, left:right, :].copy()
        points[:, 0] = points[:, 0] - left
        points[:, 1] = points[:, 1] - top
        '''
        img_crop_width = int(
            max(
                np.linalg.norm(points[0] - points[1]),
                np.linalg.norm(points[2] - points[3])))
        img_crop_height = int(
            max(
                np.linalg.norm(points[0] - points[3]),
                np.linalg.norm(points[1] - points[2])))
        pts_std = np.float32([[0, 0], [img_crop_width, 0],
                              [img_crop_width, img_crop_height],
                              [0, img_crop_height]])
        M = cv2.getPerspectiveTransform(points, pts_std)
        # cv2.warpPerspective applies the perspective transform; straight lines
        # stay straight, but parallel lines may no longer be parallel
        dst_img = cv2.warpPerspective(
            img,
            M, (img_crop_width, img_crop_height),
            borderMode=cv2.BORDER_REPLICATE,
            flags=cv2.INTER_CUBIC)
        dst_img_height, dst_img_width = dst_img.shape[0:2]
        if dst_img_height * 1.0 / dst_img_width >= 1.5:
            dst_img = np.rot90(dst_img)
        return dst_img

    def print_draw_crop_rec_res(self, img_crop_list, rec_res):
        bbox_num = len(img_crop_list)
        for bno in range(bbox_num):
            cv2.imwrite("./output/img_crop_%d.jpg" % bno, img_crop_list[bno])
            logger.info("{}, {}".format(bno, rec_res[bno]))

    def __call__(self, img):
        ori_im = img.copy()
        # text detection: returns the predicted quadrilateral boxes and the elapsed time
        dt_boxes, elapse = self.text_detector(img)
        logger.info("dt_boxes num : {}, elapse : {}".format(
            len(dt_boxes), elapse))
        if dt_boxes is None:
            return None, None
        img_crop_list = []

        # sort the text boxes from top to bottom, left to right
        dt_boxes = sorted_boxes(dt_boxes)

        # crop each predicted box out of the original image and collect the crops
        for bno in range(len(dt_boxes)):
            tmp_box = copy.deepcopy(dt_boxes[bno])
            # crop with a perspective transform and, if needed, a rotation
            img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
            img_crop_list.append(img_crop)
        # optionally run the direction classifier to fix rotated crops
        if self.use_angle_cls:
            img_crop_list, angle_list, elapse = self.text_classifier(
                img_crop_list)
            logger.info("cls num  : {}, elapse : {}".format(
                len(img_crop_list), elapse))

        # text recognition: returns the (text, score) results and the elapsed time
        rec_res, elapse = self.text_recognizer(img_crop_list)
        logger.info("rec_res num  : {}, elapse : {}".format(
            len(rec_res), elapse))
        # self.print_draw_crop_rec_res(img_crop_list, rec_res)
        filter_boxes, filter_rec_res = [], []
        # filter by confidence: drop low-score results and keep only those whose
        # score is at least drop_score
        for box, rec_result in zip(dt_boxes, rec_res):
            text, score = rec_result
            if score >= self.drop_score:
                filter_boxes.append(box)
                filter_rec_res.append(rec_result)
        return filter_boxes, filter_rec_res
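
A toy check of get_rotate_crop_image above, under stated assumptions (a synthetic white image and a hand-written quadrilateral; __init__ is bypassed with __new__ because the method uses no instance state, so no models need to be loaded):

import numpy as np

ts = TextSystem.__new__(TextSystem)  # skip __init__: no detector/recognizer needed
img = np.full((200, 300, 3), 255, dtype=np.uint8)
quad = np.float32([[50, 40], [250, 60], [245, 110], [45, 90]])  # tl, tr, br, bl
crop = ts.get_rotate_crop_image(img, quad)
print(crop.shape)  # about (50, 200, 3): the quad's edge lengths become the crop size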


def sorted_boxes(dt_boxes):
    """
    Sort text boxes in order from top to bottom, left to right
    args:
        dt_boxes(array):detected text boxes with shape [4, 2]
    return:
        sorted boxes(array) with shape [4, 2]

    按从上到下、从左到右的顺序对文本框排序
    参数:
        dt_boxes(array):检测到形状为[4,2]的文本框
    返回:
        形状为[4,2]的排序框(数组)
    [[[ 275. 1996.],  [2664. 1829.],  [2697. 2295.],  [ 307. 2463.]],, [[ 623. 1566.],  [1523. 1520.],  [1540. 1854.],  [ 640. 1900.]],, [[ 392. 1540.],  [ 654. 1540.],  [ 654. 1859.],  [ 392. 1859.]]]
    """
    num_boxes = dt_boxes.shape[0]
    # sort primarily by x[0][1] (the y coordinate of the top-left point), then by x[0][0] (its x coordinate)
    sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
    _boxes = list(sorted_boxes)

    # after the coarse sort, swap adjacent boxes that sit on roughly the same
    # line (y difference < 10 px) but are out of left-to-right order
    for i in range(num_boxes - 1):
        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
            tmp = _boxes[i]
            _boxes[i] = _boxes[i + 1]
            _boxes[i + 1] = tmp
    return _boxes
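
A quick sanity check of sorted_boxes with the example boxes from the docstring; only the top-left point of each box matters for the sort:

import numpy as np

dt_boxes = np.array([
    [[275, 1996], [2664, 1829], [2697, 2295], [307, 2463]],
    [[623, 1566], [1523, 1520], [1540, 1854], [640, 1900]],
    [[392, 1540], [654, 1540], [654, 1859], [392, 1859]],
], dtype=np.float32)
for box in sorted_boxes(dt_boxes):
    print(box[0])  # top-left points come out top-to-bottom, then left-to-right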


def main(args):
    # collect the image paths; returns a list with one or more entries
    image_file_list = get_image_file_list(args.image_dir)
    # instantiate the OCR system
    text_sys = TextSystem(args)
    # visualization switch
    is_visualize = True
    # font path
    font_path = args.vis_font_path
    # drop threshold (default 0.5)
    drop_score = args.drop_score
    for image_file in image_file_list:
        # check whether the file is a GIF; for non-GIF files this returns (None, False)
        img, flag = check_and_read_gif(image_file)
        # if flag is False, read the image with OpenCV: an ndarray of shape (H, W, C)
        if not flag:
            img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        # start time
        starttime = time.time()
        # run the OCR system: returns the detected region corner points and the
        # recognition results
        # dt_boxes: [[four corner points], [four corner points], ...], e.g.
        # [array([[ 392., 1540.],
        #        [ 654., 1540.],
        #        [ 654., 1859.],
        #        [ 392., 1859.]], dtype=float32), array([[ 623., 1566.],
        #        [1523., 1520.],
        #        [1540., 1854.],
        #        [ 640., 1900.]], dtype=float32), array([[ 275., 1996.],
        #        [2664., 1829.],
        #        [2697., 2295.],
        #        [ 307., 2463.]], dtype=float32)]

        # rec_res: [(text, confidence), ...], e.g.
        # [('P', 0.7642319), ('C2NT', 0.808735), ('3338490', 0.570314)]
        dt_boxes, rec_res = text_sys(img)
        # elapsed time
        elapse = time.time() - starttime
        # log the prediction time
        logger.info("Predict time of %s: %.3fs" % (image_file, elapse))
        # log the recognition results
        for text, score in rec_res:
            logger.info("{}, {:.3f}".format(text, score))

        # visualization
        if is_visualize:
            # convert the OpenCV BGR image to RGB with cv2.cvtColor, then wrap it
            # as a PIL image with Image.fromarray
            image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            # text region corner points
            boxes = dt_boxes
            # keep only the texts, dropping the confidences
            txts = [rec_res[i][0] for i in range(len(rec_res))]
            # the confidence list
            scores = [rec_res[i][1] for i in range(len(rec_res))]
            # draw the visualization
            draw_img = draw_ocr_box_txt(
                image,  # image data
                boxes,  # region corner points
                txts,   # recognized texts
                scores, # confidences of the recognized texts
                drop_score=drop_score, # drop threshold
                font_path=font_path)   # font path
            # output directory for the visualized results
            draw_img_save = "./inference_results/"
            if not os.path.exists(draw_img_save):
                os.makedirs(draw_img_save)
            # save the result; the slice draw_img[:, :, ::-1] flips RGB back to BGR for OpenCV
            cv2.imwrite(
                os.path.join(draw_img_save, os.path.basename(image_file)),
                draw_img[:, :, ::-1])
            # log where the visualization was saved
            logger.info("The visualized image saved in {}".format(
                os.path.join(draw_img_save, os.path.basename(image_file))))
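
For reference, a sketch of driving TextSystem programmatically instead of through main; the argument names follow tools/infer/utility.py, and the model paths are placeholders:

import cv2
import tools.infer.utility as utility

args = utility.parse_args()                    # or set the flags on the CLI
args.det_model_dir = "./inference/det_db/"     # placeholder detection model dir
args.rec_model_dir = "./inference/rec_crnn/"   # placeholder recognition model dir
args.use_angle_cls = False
text_sys = TextSystem(args)
dt_boxes, rec_res = text_sys(cv2.imread("./doc/imgs/demo.jpg"))  # hypothetical image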

det_db_head.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


def get_bias_attr(k, name):
    stdv = 1.0 / math.sqrt(k * 1.0)
    initializer = paddle.nn.initializer.Uniform(-stdv, stdv)
    bias_attr = ParamAttr(initializer=initializer, name=name + "_b_attr")
    return bias_attr


class Head(nn.Layer):
    """
    这里,概率图和阈值图经过结构相同的网络(结构相同,参数不同)计算而来,二值图使用可微二值化公式计算而来,

    """
    def __init__(self, in_channels, name_list):
        """
        name_list:  ['conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50', 'conv2d_transpose_3', 'thresh']
        """
        super(Head, self).__init__()
        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // 4,
            kernel_size=3,
            padding=1,
            weight_attr=ParamAttr(name=name_list[0] + '.w_0'),   # name_list keeps parameter names unique and controllable; it does not affect the network structure
            bias_attr=False)
        self.conv_bn1 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                name=name_list[1] + '.w_0',
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                name=name_list[1] + '.b_0',
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            moving_mean_name=name_list[1] + '.w_1',
            moving_variance_name=name_list[1] + '.w_2',
            act='relu')
        self.conv2 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=in_channels // 4,
            kernel_size=2,
            stride=2,
            weight_attr=ParamAttr(
                name=name_list[2] + '.w_0',
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv2"))
        self.conv_bn2 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                name=name_list[3] + '.w_0',
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                name=name_list[3] + '.b_0',
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            moving_mean_name=name_list[3] + '.w_1',
            moving_variance_name=name_list[3] + '.w_2',
            act="relu")
        # Conv2DTranspose computes the output feature map size from the input, the
        # kernel, the dilations, the stride and the padding (or from an explicit
        # output_size). Input and output are NCHW or NHWC, where N is the batch
        # size, C the number of channels, and H/W the feature map height/width.
        # The kernel is MCHW, where M is the number of output channels and C the
        # number of input channels; with groups > 1, C equals the input channels
        # divided by the number of groups. A transposed convolution computes the
        # reverse of a convolution's forward pass and is often called
        # "deconvolution", although it is not a true inverse of convolution.
        self.conv3 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=1,
            kernel_size=2,
            stride=2,
            weight_attr=ParamAttr(
                name=name_list[4] + '.w_0',
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4, name_list[-1] + "conv3"),
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv_bn1(x)
        x = self.conv2(x)
        x = self.conv_bn2(x)
        x = self.conv3(x)
        x = F.sigmoid(x)
        return x
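
A minimal shape check for Head (assuming PaddlePaddle is installed; the channel count and parameter names here are arbitrary). conv1 keeps the spatial size, and each of the two stride-2 transposed convolutions doubles it, so the head upsamples 4x and the final sigmoid squashes the single-channel output into [0, 1]:

import paddle

head = Head(in_channels=256, name_list=['c0', 'bn0', 'ct0', 'bn1', 'ct1', 'demo'])
x = paddle.rand([1, 256, 160, 160])  # NCHW feature map from the backbone/neck
y = head(x)
print(y.shape)  # [1, 1, 640, 640]: one channel at 4x the input resolution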


class DBHead(nn.Layer):
    """
    Differentiable Binarization (DB) for text detection:
        see https://arxiv.org/abs/1911.08947
    args:
        params(dict): super parameters for build DB network
    """

    def __init__(self, in_channels, k=50, **kwargs):
        super(DBHead, self).__init__()
        self.k = k
        binarize_name_list = [
            'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0', 'batch_norm_48',
            'conv2d_transpose_1', 'binarize'
        ]
        thresh_name_list = [
            'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2', 'batch_norm_50',
            'conv2d_transpose_3', 'thresh'
        ]
        # binarize (the head for the shrink/probability map) and thresh (the head
        # for the threshold map) have exactly the same structure; training drives
        # their parameters apart.
        self.binarize = Head(in_channels, binarize_name_list)
        self.thresh = Head(in_channels, thresh_name_list)

    def step_function(self, x, y):
        """
        根据收缩图-概率图和阈值图 求二值图
        """
        # paddle.reciprocal 对输入Tensor取倒数。这里是根据论文中可微二值化公式来的 k一般是50
        return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))

    def forward(self, x):
        # shrink/probability map
        shrink_maps = self.binarize(x)
        if not self.training:
            return {'maps': shrink_maps}
        # threshold map
        threshold_maps = self.thresh(x)
        # (approximate) binary map
        binary_maps = self.step_function(shrink_maps, threshold_maps)
        y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
        return {'maps': y}
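
The step_function above is exactly a sigmoid with gain k: 1 / (1 + exp(-k * (x - y))) = sigmoid(k * (x - y)). A tiny NumPy check of the same formula, independent of Paddle:

import numpy as np

def step_function(x, y, k=50):
    return 1.0 / (1.0 + np.exp(-k * (x - y)))

# Where the probability map exceeds the threshold map the output saturates to 1,
# below it the output saturates to 0, yet the transition stays differentiable.
print(step_function(np.array([0.2, 0.5, 0.8]), np.array([0.5, 0.5, 0.5])))
# -> approximately [0.  0.5  1.]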

predict_det.py

import os
import sys

__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))

os.environ["FLAGS_allocator_strategy"] = 'auto_growth'

import cv2
import numpy as np
import time
import sys

import tools.infer.utility as utility
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, check_and_read_gif
from ppocr.data import create_operators, transform
from ppocr.postprocess import build_post_process

logger = get_logger()


class TextDetector(object):
    def __init__(self, args):
        self.args = args
        self.det_algorithm = args.det_algorithm
        pre_process_list = [{
            'DetResizeForTest': {
                'limit_side_len': args.det_limit_side_len,
                'limit_type': args.det_limit_type
            }
        }, {
            'NormalizeImage': {
                'std': [0.229, 0.224, 0.225],
                'mean': [0.485, 0.456, 0.406],
                'scale': '1./255.',
                'order': 'hwc'
            }
        }, {
            'ToCHWImage': None
        }, {
            'KeepKeys': {
                'keep_keys': ['image', 'shape']
            }
        }]
        postprocess_params = {}
        if self.det_algorithm == "DB":
            postprocess_params['name'] = 'DBPostProcess'
            postprocess_params["thresh"] = args.det_db_thresh
            postprocess_params["box_thresh"] = args.det_db_box_thresh
            postprocess_params["max_candidates"] = 1000
            postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
            postprocess_params["use_dilation"] = True
        elif self.det_algorithm == "EAST":
            postprocess_params['name'] = 'EASTPostProcess'
            postprocess_params["score_thresh"] = args.det_east_score_thresh
            postprocess_params["cover_thresh"] = args.det_east_cover_thresh
            postprocess_params["nms_thresh"] = args.det_east_nms_thresh
        elif self.det_algorithm == "SAST":
            pre_process_list[0] = {
                'DetResizeForTest': {
                    'resize_long': args.det_limit_side_len
                }
            }
            postprocess_params['name'] = 'SASTPostProcess'
            postprocess_params["score_thresh"] = args.det_sast_score_thresh
            postprocess_params["nms_thresh"] = args.det_sast_nms_thresh
            self.det_sast_polygon = args.det_sast_polygon
            if self.det_sast_polygon:
                postprocess_params["sample_pts_num"] = 6
                postprocess_params["expand_scale"] = 1.2
                postprocess_params["shrink_ratio_of_width"] = 0.2
            else:
                postprocess_params["sample_pts_num"] = 2
                postprocess_params["expand_scale"] = 1.0
                postprocess_params["shrink_ratio_of_width"] = 0.3
        else:
            logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
            sys.exit(0)

        self.preprocess_op = create_operators(pre_process_list)
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
            args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
        # self.predictor.eval()

    def order_points_clockwise(self, pts):
        """
        把点坐标 按照顺时针方向排序
        
        reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
        # sort the points based on their x-coordinates  根据这些点的x坐标对它们排序
        """
        # np.argsort returns the indices that would sort the array along the given
        # axis; pts[np.argsort(pts[:, 0]), :] reorders the points by their x coordinate
        xSorted = pts[np.argsort(pts[:, 0]), :]

        # grab the two left-most and two right-most points from the x-sorted array
        leftMost = xSorted[:2, :]
        rightMost = xSorted[2:, :]

        # now, sort the left-most coordinates according to their
        # y-coordinates so we can grab the top-left and bottom-left
        # points, respectively
        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
        (tl, bl) = leftMost

        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
        (tr, br) = rightMost

        rect = np.array([tl, tr, br, bl], dtype="float32")
        return rect

    def clip_det_res(self, points, img_height, img_width):
        """对点坐标进行裁剪,使其坐标不超出图片范围"""
        for pno in range(points.shape[0]):
            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        """对文本框进行过滤  按顺时针排序-限制坐标-求两点距离,有两点间直线距离小于3的过滤掉"""
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            # order the corner points clockwise
            box = self.order_points_clockwise(box)
            # clip the coordinates so they stay inside the image
            box = self.clip_det_res(box, img_height, img_width)
            # np.linalg.norm computes the L2 norm, i.e. the straight-line distance between two points
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 3 or rect_height <= 3:
                continue
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            box = self.clip_det_res(box, img_height, img_width)
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def __call__(self, img):
        # ori_im: a backup copy of the original image
        ori_im = img.copy()
        data = {'image': img}
        # run the preprocessing operators
        data = transform(data, self.preprocess_op)
        img, shape_list = data
        if img is None:
            return None, 0
        img = np.expand_dims(img, axis=0)
        shape_list = np.expand_dims(shape_list, axis=0)
        img = img.copy()
        starttime = time.time()
        # copy_from_cpu feeds the input data into the model's input tensor
        self.input_tensor.copy_from_cpu(img)
        # run the predictor
        self.predictor.run()
        outputs = []
        # copy_to_cpu fetches the model outputs back to the CPU
        for output_tensor in self.output_tensors:
            output = output_tensor.copy_to_cpu()
            outputs.append(output)

        preds = {}
        if self.det_algorithm == "EAST":
            preds['f_geo'] = outputs[0]
            preds['f_score'] = outputs[1]
        elif self.det_algorithm == 'SAST':
            preds['f_border'] = outputs[0]
            preds['f_score'] = outputs[1]
            preds['f_tco'] = outputs[2]
            preds['f_tvo'] = outputs[3]
        elif self.det_algorithm == 'DB':
            preds['maps'] = outputs[0]
        else:
            raise NotImplementedError
        # postprocess_op: post-processing, producing post_result
        post_result = self.postprocess_op(preds, shape_list)
        # dt_boxes: the predicted corner points
        dt_boxes = post_result[0]['points']
        if self.det_algorithm == "SAST" and self.det_sast_polygon:
            dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
        else:
            # filter the boxes: drop any box with a side shorter than the 3 px threshold
            dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
        elapse = time.time() - starttime
        return dt_boxes, elapse
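
A simplified sketch of the preprocessing math configured in pre_process_list above; this is not the actual DetResizeForTest/NormalizeImage operators, just the same arithmetic for the 'max' limit_type:

import cv2
import numpy as np

def det_preprocess(img, limit_side_len=960):
    h, w = img.shape[:2]
    ratio = 1.0
    if max(h, w) > limit_side_len:            # shrink so the long side fits
        ratio = limit_side_len / max(h, w)
    resize_h = max(int(round(h * ratio / 32) * 32), 32)  # both sides are kept
    resize_w = max(int(round(w * ratio / 32) * 32), 32)  # multiples of 32
    img = cv2.resize(img, (resize_w, resize_h))
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img = (img.astype(np.float32) / 255.0 - mean) / std  # NormalizeImage
    # ToCHWImage + batch dim; shape_list is (src_h, src_w, ratio_h, ratio_w)
    return img.transpose(2, 0, 1)[np.newaxis], (h, w, resize_h / h, resize_w / w)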

db_postprocess.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import cv2
import paddle
from shapely.geometry import Polygon
import pyclipper


class DBPostProcess(object):
    """
    The post process for Differentiable Binarization (DB).
    可微二值化(DB)后处理。
    """

    def __init__(self,
                 thresh=0.3,
                 box_thresh=0.7,
                 max_candidates=1000,
                 unclip_ratio=2.0,
                 use_dilation=False,
                 **kwargs):
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.dilation_kernel = None if not use_dilation else np.array(
            [[1, 1], [1, 1]])

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        '''
        Extract text boxes from the binarized map.

        _bitmap: single map with shape (1, H, W),
                whose values are binarized as {0, 1}
        '''

        bitmap = _bitmap
        height, width = bitmap.shape
        # cv2.findContours locates the contours of objects in a binary image
        # (black and white, not grayscale). Older OpenCV versions modify the
        # input image in place.
        # The first argument is the image to search for contours.
        # The second argument is the contour retrieval mode:
        # cv2.RETR_EXTERNAL
        #   retrieve only the outer contours
        # cv2.RETR_LIST
        #   retrieve all contours without building any hierarchy
        # cv2.RETR_CCOMP
        #   build a two-level hierarchy: the top level holds the outer
        #   boundaries, the level below holds the hole boundaries; a connected
        #   component inside a hole goes back to the top level
        # cv2.RETR_TREE
        #   build a full tree hierarchy of nested contours

        # The third argument is the contour approximation method:
        # cv2.CHAIN_APPROX_NONE
        #   store all contour points; adjacent points differ by at most one
        #   pixel, i.e. max(abs(x1 - x2), abs(y2 - y1)) == 1
        # cv2.CHAIN_APPROX_SIMPLE
        #   compress horizontal, vertical and diagonal runs, keeping only their
        #   end points (a rectangular contour needs just 4 points)
        # cv2.CHAIN_APPROX_TC89_L1, cv2.CHAIN_APPROX_TC89_KCOS
        #   the Teh-Chin chain approximation algorithm

        # Depending on the OpenCV version, findContours returns
        # (image, contours, hierarchy) or (contours, hierarchy); contours is a
        # list in which each element is one contour, an ndarray of points.
        outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
                                cv2.CHAIN_APPROX_SIMPLE)
        if len(outs) == 3:
            img, contours, _ = outs[0], outs[1], outs[2]
        elif len(outs) == 2:
            contours, _ = outs[0], outs[1]
        # contours holds the detected contours (each an ndarray of points);
        # cap the number processed at max_candidates
        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index]
            # get the corner points of the minimum-area rectangle; sside is the smaller of its width and height
            points, sside = self.get_mini_boxes(contour)
            # drop boxes whose short side is below min_size
            if sside < self.min_size:
                continue
            points = np.array(points)
            # compute the box confidence score
            score = self.box_score_fast(pred, points.reshape(-1, 2))
            # filter by the score threshold
            if self.box_thresh > score:
                continue

            # unclip: expand the shrunk polygon back to the full text box
            box = self.unclip(points).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)
            # np.clip keeps the coordinates inside the destination image
            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box.astype(np.int16))
            scores.append(score)
        return np.array(boxes, dtype=np.int16), scores


    def unclip(self, box):
        """
        At inference time either the probability map or the approximate binary
        map can be used to generate text boxes; for convenience the authors use
        the probability map. The steps are:
        1. binarize the probability map with a fixed threshold (0.2 in the paper)
           to obtain the binary map;
        2. extract the shrunk text regions from the binary map;
        3. dilate the shrunk regions by the offset D' of the Vatti clipping
           algorithm to obtain the final text boxes, where D' = A' * r' / L',
           A' and L' are the area and perimeter of the shrunk region, and r' is
           empirically set to 1.5 (corresponding to a shrink ratio r = 0.4).
        """
        unclip_ratio = self.unclip_ratio
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        # pco = pyclipper.PyclipperOffset()
        # pco.AddPath(subj, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        # solution = pco.Execute(-7.0)
        # The three lines above illustrate offsetting a polygon's coordinates
        # with pyclipper; here the offset expands (unclips) the text box.
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        """
        获取最小的box并且排序,返回box点坐标

        contour: 构成轮廓的几个点坐标
        """
        # cv2.minAreaRect(Points)
        # 其中points是点集,数据类型为ndarray,array((x1,y1),(x2,y2),....,(xn,yn))
        # 而minAreaRect就是求出在上述点集下的最小面积矩形。
        # 函数 cv2.minAreaRect() 返回一个Box2D结构 rect:(最小外接矩形的中心(x,y),(宽度,高度),旋转角度)。
        # 分别对应于返回值:(rect[0][0],  rect[0][1]),  (rect[1][0],  rect[1][1]),  rect[2]
        # cv2.boxPoints(rect)可以返回四个点的值,其中cv2.boxPoints(rect)[0]为point[0],cv2.boxPoints(rect)[1]为point[1]......
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        # the code below orders the four points as top-left, top-right, bottom-right, bottom-left
        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        # bounding_box[1] is (w, h); return the smaller of the two
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """
        通过点坐标 和 mask填充,计算得到一个均值分数,这里不太懂 ???

        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        # np.clip limits the array elements to [a_min, a_max]: values above a_max
        # become a_max, values below a_min become a_min. Here it keeps the box
        # corners inside the image.
        xmin = np.clip(np.floor(box[:, 0].min()).astype(int), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(int), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(int), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(int), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        # cv2.fillPoly fills polygons of arbitrary shape (several at once if
        # needed); here it rasterizes the box into a mask
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def __call__(self, outs_dict, shape_list):
        """
        outs_dict:{'maps': [1,1,960,544]}
        shape_list:[1, 4]
        """
        pred = outs_dict['maps']
        if isinstance(pred, paddle.Tensor):
            pred = pred.numpy()
        pred = pred[:, 0, :, :]
        # segmentation: a boolean matrix from pred > self.thresh, i.e. the
        # probability map binarized with the fixed threshold (0.3 by default here)
        segmentation = pred > self.thresh

        boxes_batch = []
        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                # cv2.dilate dilates the image: white regions grow and black
                # regions shrink. Arguments: img (the target image), kernel (the
                # structuring element), iterations (number of passes, default 1)
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel)
            else:
                mask = segmentation[batch_index]
            # extract the text boxes and scores from the binary map
            boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                   src_w, src_h)
            boxes_batch.append({'points': boxes})
        return boxes_batch
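
A toy check of the unclip offset D = A * r / L described in the unclip docstring (assumes shapely and pyclipper are installed; the rectangle is made up):

import numpy as np
import pyclipper
from shapely.geometry import Polygon

box = [[0, 0], [100, 0], [100, 20], [0, 20]]  # a 100x20 shrunk box
poly = Polygon(box)
distance = poly.area * 2.0 / poly.length      # unclip_ratio = 2.0 -> about 16.7 px
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = offset.Execute(distance)           # list of expanded polygons
print(round(distance, 2), np.array(expanded[0]).shape)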