文本检测模型随机crop代码整理_文本随机性判断-CSDN博客

本文链接：https://blog.csdn.net/ooooocj/article/details/117325445

PSENet

输入imgs是一个列表，包含img、gt_text、training_mask和gt_kernels。其中img是原始图片；gt_text中背景为0，文本区域的值为文本实例索引，例如有三个文本行，则对应的文本区域分别填充为1、2、3；training_mask是二值图，标签为'#'（即需要忽略）的文本行区域为0，其余位置为1；gt_kernels一共有6张图，对应文本区域按不同比例shrink后的二值图。因为PSENet是基于语义分割的模型，标签是和模型输入大小一样的图，所以对原图进行翻转、旋转、裁剪等操作时，对标签进行同样的操作即可。

import numpy as np
import random


def random_crop(imgs, img_size):
    """Cut the same random window out of an image and all of its label maps.

    Args:
        imgs: List of arrays that share the same height and width. Element 0
            supplies the source size and element 1 is read as the text
            instance map (values > 0 are text pixels). The caller is expected
            to pass ``[img, gt_text, training_mask, *gt_kernels]``. The list
            is modified in place.
        img_size: ``(th, tw)`` height and width of the crop, e.g. ``(640, 640)``.

    Returns:
        The same list object, with every entry replaced by its cropped view.
        If the source already has exactly the requested size the list is
        returned untouched.

    Raises:
        ValueError: If the source is smaller than the requested crop in
            either dimension.
    """
    h, w = imgs[0].shape[0:2]
    th, tw = img_size
    if w == tw and h == th:
        return imgs
    if h < th or w < tw:
        # Previously this surfaced as an opaque "empty range" ValueError from
        # random.randint; fail early with a message that names the cause.
        raise ValueError(
            "crop size (%d, %d) is larger than image size (%d, %d)"
            % (th, tw, h, w)
        )

    # With probability 5/8, and only if there is any text at all, bias the
    # window towards the text instead of sampling it uniformly.
    if random.random() > 3.0 / 8.0 and np.max(imgs[1]) > 0:
        text_coords = np.where(imgs[1] > 0)  # (rows, cols) of text pixels
        crop_size = np.array([th, tw])
        max_start = np.array([h - th, w - tw])

        # The start offset is drawn so that the crop's last row/column
        # (start + size - 1) lies inside the bounding box of all text pixels.
        # The "+ 1" matters: without it the window could end one pixel before
        # the text and contain none of it.
        tl = np.maximum(np.min(text_coords, axis=1) - crop_size + 1, 0)
        br = np.maximum(np.max(text_coords, axis=1) - crop_size + 1, 0)
        # Never let the window run past the image border.
        br = np.minimum(br, max_start)

        # (i, j) is the top-left corner of the crop.
        i = random.randint(int(tl[0]), int(br[0]))
        j = random.randint(int(tl[1]), int(br[1]))
    else:
        i = random.randint(0, h - th)
        j = random.randint(0, w - tw)

    # Slicing only the first two axes handles both 2-D label maps and
    # 3-D (H, W, C) images.
    for idx, arr in enumerate(imgs):
        imgs[idx] = arr[i:i + th, j:j + tw]
    return imgs

EAST

核心思想是遍历图中的所有ground truth，找到所有不包含文本区域的x和y坐标，crop的区域左上和右下坐标从这些坐标中选，以防止crop的区域会横穿或纵穿ground truth。另外EAST里还会按一定比例专门crop不包含文本的纯背景样本。

import numpy as np

# Smallest accepted crop width/height, as a fraction of the source image's
# width/height.
min_crop_side_ratio = 0.1


def crop_area(im, polys, tags, crop_background=False, max_tries=50):
    """
    Make a random crop of the image whose borders do not cut through any text.

    Crop borders are sampled only from x/y coordinates that are not covered
    by the axis-aligned extent of any polygon. A polygon is kept only when
    every one of its vertices lies inside the crop.

    :param im: image array of shape (H, W, C)
    :param polys: polygon array indexed as ``polys[n, vertex, (x, y)]``;
        presumably shape (N, 4, 2) for quadrilaterals, but any vertex count
        is accepted
    :param tags: array with one entry per polygon, indexed together with
        ``polys``
    :param crop_background: if True, a crop containing no polygon is returned
        (with empty ``polys``/``tags``); if False such crops are rejected and
        another attempt is made
    :param max_tries: maximum number of sampling attempts
    :return: ``(im, polys, tags)`` for the crop, with polygon coordinates
        shifted to the crop's origin; the inputs are returned unchanged when
        no valid crop is found or no free coordinate exists
    """
    h, w, _ = im.shape
    # The occupancy arrays are padded by 10% on each side. Borders sampled in
    # the padding are clipped back onto the image edge further down, so the
    # padding makes crops that touch the image border more likely.
    pad_h = h // 10
    pad_w = w // 10
    h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
    w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
    for poly in polys:
        poly = np.round(poly, decimals=0).astype(np.int32)
        # Clamp at 0: a polygon reaching past the left/top edge by more than
        # the padding would otherwise yield a negative slice bound, which
        # wraps around and marks the wrong cells (or none at all).
        x_lo = max(int(np.min(poly[:, 0])) + pad_w, 0)
        x_hi = max(int(np.max(poly[:, 0])) + pad_w, 0)
        w_array[x_lo:x_hi] = 1
        y_lo = max(int(np.min(poly[:, 1])) + pad_h, 0)
        y_hi = max(int(np.max(poly[:, 1])) + pad_h, 0)
        h_array[y_lo:y_hi] = 1

    # Coordinates (in padded space) that no polygon covers; crop borders are
    # drawn from these so that a border never crosses a text region.
    h_axis = np.where(h_array == 0)[0]
    w_axis = np.where(w_array == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return im, polys, tags

    for _ in range(max_tries):
        xx = np.random.choice(w_axis, size=2)
        xmin = np.clip(np.min(xx) - pad_w, 0, w - 1)
        xmax = np.clip(np.max(xx) - pad_w, 0, w - 1)
        yy = np.random.choice(h_axis, size=2)
        ymin = np.clip(np.min(yy) - pad_h, 0, h - 1)
        ymax = np.clip(np.max(yy) - pad_h, 0, h - 1)
        if (xmax - xmin < min_crop_side_ratio * w
                or ymax - ymin < min_crop_side_ratio * h):
            # area too small
            continue

        if polys.shape[0] != 0:
            vertex_inside = (
                (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax)
                & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
            )
            # Keep polygons whose vertices are all inside the crop; np.all
            # works for any vertex count, not just quadrilaterals.
            selected_polys = np.where(np.all(vertex_inside, axis=1))[0]
        else:
            selected_polys = []

        if len(selected_polys) == 0:
            # no text in this area
            if not crop_background:
                continue
            return (im[ymin:ymax + 1, xmin:xmax + 1, :],
                    polys[selected_polys], tags[selected_polys])

        im = im[ymin:ymax + 1, xmin:xmax + 1, :]
        # Index-array selection yields a copy, so the in-place shift below
        # does not modify the caller's polygon array.
        polys = polys[selected_polys]
        tags = tags[selected_polys]
        polys[:, :, 0] -= xmin
        polys[:, :, 1] -= ymin
        return im, polys, tags

    return im, polys, tags