paddleocr数据增强

小飞龙程序员

已于 2024-06-25 14:12:22 修改

阅读量1.2k

点赞数 10

文章标签：目标跟踪 算法 人工智能

于 2024-01-27 20:54:21 首次发布

本文链接：https://blog.csdn.net/m0_47405013/article/details/134138553

版权

对于PaddleOCR中的det（文本检测）和rec（文本识别），可以使用不同的数据增强方法来提高模型的性能。以下是一些常见的参数修改方法：
文本检测数据增强参数修改：
图像翻转：通过设置flip_ratio参数来控制图像水平翻转的概率。
随机旋转：通过设置rotation_degree参数来控制图像随机旋转的角度范围。
随机裁剪：通过设置image_shape和random_crop来进行图像的随机裁剪。

文本识别数据增强参数修改：
    图像翻转：通过设置flip_horizontal_prob参数来控制图像水平翻转的概率。
    随机旋转：通过设置random_rotate_range参数来控制图像随机旋转的角度范围。
    随机裁剪：通过设置image_shape和random_crop来进行图像的随机裁剪。
    颜色抖动：通过设置image_distort_strategy参数来进行图像颜色抖动增强。

这只是一些常见的参数修改方法，具体的数据增强方法和参数设置可以根据实际需求进行调整。
一、什么是数据增强？

数据增强是一种挖掘数据集潜力的方法，可以让数据集蕴含更多让模型有效学习的信息。这些方法与具体的领域和任务相关。

二、为什么需要数据增强？

1、在实际任务中，原始数据集未必完全含有解决任务所需要的充足信息。通过分析任务场景的复杂性和当前数据集的短板，有针对性地调整数据增强/增广策略，以提供更加多样、匹配任务场景复杂性的新数据，往往可以显著地提高模型效果。

2、扩大训练数据集，抑制过拟合，提升模型的泛化能力。

三、基础的图像增强方法：
1、随机改变亮度、对比度和颜色
2、随机填充
3、随机裁剪
4、随机缩放
5、随机翻转

四、高阶的图像增强方法

1、图像变换类：
标准变换
autoaugment
randaugment

2、图像裁剪类：
cutout
randerasing
hideandseek
gridmask

3、图像混叠类：
mixup
cutmix

#识别增强
https://blog.csdn.net/hhhhhhhhhhwwwwwwwwww/article/details/125155386
smb://jczngd-nas.local/mnt/share/数据/THL/ocr_datasets/training

数据增强旋转(图旋转和点旋转)代码

import os
import cv2
import random
import json
import math
import numpy as np
from scipy.linalg import block_diag



def rotate_point(point, angle_degrees, center=(0, 0)):
    """
    Rotate a 2-D point about ``center`` using the standard rotation matrix.

    A positive angle is counterclockwise in a conventional y-up coordinate
    system; in image coordinates (y grows downwards) the same formula appears
    clockwise on screen.

    Parameters:
        point (tuple[float, float]): The x, y coordinates of the point to rotate.
        angle_degrees (float): Rotation angle in degrees.
        center (tuple[float, float]): The x, y coordinates of the pivot.

    Returns:
        tuple[float, float]: The x, y coordinates of the rotated point.
    """
    theta = math.radians(angle_degrees)
    cos_t, sin_t = math.cos(theta), math.sin(theta)
    cx, cy = center[0], center[1]
    # Offset of the point relative to the pivot.
    dx = point[0] - cx
    dy = point[1] - cy
    # Rotate the offset, then shift back into the original frame.
    return (dx * cos_t - dy * sin_t + cx, dx * sin_t + dy * cos_t + cy)


# def rotate_points(points,center,angle):
#     # 计算旋转矩阵
#     M = cv2.getRotationMatrix2D(center, angle, 1.0)
#     # 获取旋转后的点坐标，这里以原点(0,0)为例
#     rotated_point = np.dot(M, np.array([0, 0, 1]))[:2] / rotated_point[2])
#     return rotated_point

if __name__ == '__main__':
    # Rotate every image in img_dir by a random angle and write the rotated
    # image plus its rotated polygon labels to out_dir.
    img_dir = r"F:\Desktop\training"
    img_dir_txt = r"F:\Desktop\training_txt1"
    out_dir = r"H:\training_rotate2"
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    for path in os.listdir(img_dir):
        if not path.endswith((".bmp", ".jpg", ".png")):
            continue
        path1 = os.path.join(img_dir, path)
        print(path1)
        img = cv2.imread(path1)
        if img is None:
            # cv2.imread returns None instead of raising for unreadable files.
            print("skip unreadable image:", path1)
            continue
        # Random rotation angle in degrees.
        angle = random.randint(0, 360)
        print(angle)
        (h, w) = img.shape[:2]
        # Rotate about the image centre; the canvas keeps its original size,
        # so content (and label points) may end up outside the frame.
        center = (w / 2, h / 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(img, M, (w, h))

        # The per-image label file holds a JSON list of annotation objects.
        label_path = os.path.join(img_dir_txt, path.split(".")[0] + ".txt")
        with open(label_path, encoding="utf-8") as f:
            annotations = json.load(f)

        ls = []
        for info in annotations:
            txts = info.get("transcription")
            boxes = info.get("points")
            diff = info.get("difficult")
            print(txts, boxes, diff)
            new_boxes = []
            for box in boxes:
                point = (int(box[0]), int(box[1]))
                # The angle is negated because image coordinates have y
                # pointing down, which flips the rotation direction relative
                # to the matrix built by cv2.getRotationMatrix2D.
                moved = rotate_point(point, -angle, center)
                # Round to the nearest pixel rather than truncating.
                new_boxes.append([int(round(moved[0])), int(round(moved[1]))])
            ls.append({
                "transcription": txts,
                "points": new_boxes,
                "difficult": diff,
            })
        print(ls)

        cv2.imwrite(os.path.join(out_dir, "3_" + path), rotated)
        with open(os.path.join(out_dir, "Label.txt"), "a+", encoding="utf-8") as fout:
            # Serialise as JSON (not the Python repr) so the label line can be
            # parsed back with json.loads.
            fout.write("training/3_%s" % (path) + "\t"
                       + json.dumps(ls, ensure_ascii=False) + "\n")

数据增强(gamma变换)代码

import cv2,os
import random
import numpy as np
import json

def gamma_correct(img_src, gamma):
    """Apply gamma correction to an image through a 256-entry lookup table.

    Each level ``i`` is mapped to ``255 * (i / 255) ** (1 / gamma)``, so
    ``gamma > 1`` brightens the image and ``gamma < 1`` darkens it.

    Parameters:
        img_src: Image passed to ``cv2.LUT`` (8-bit input is expected by
            OpenCV's LUT).
        gamma (float): Gamma value; must be positive.

    Returns:
        The corrected image as returned by ``cv2.LUT``.

    Raises:
        ValueError: If ``gamma`` is not positive.
    """
    if gamma <= 0:
        raise ValueError("gamma must be positive, got %r" % (gamma,))
    levels = np.arange(256, dtype=np.float64) / 255.0
    # Round to the nearest level: a plain astype(uint8) truncates and biases
    # every entry of the table downward.
    gamma_table = np.rint(np.power(levels, 1.0 / gamma) * 255.0).astype(np.uint8)
    return cv2.LUT(img_src, gamma_table)

# def get_single_center_points(box):
#     x_list = [i[0] for i in box]
#     y_list = [j[1] for j in box]
#     xmax, xmin = np.max(x_list), np.min(x_list)
#     ymax, ymin = np.max(y_list), np.min(y_list)
#     center = [int((xmax + xmin) / 2), int((ymax + ymin) / 2)]
#     return center

# def rotate_image(image, angle_x, angle_y, angle_z, focal_length,image_name_txt):
#     """
#     对图像进行基于指定角度的轻微透视变换。
#
#     :param image: 输入图像
#     :param angle_x: 绕X轴的旋转角度（较小）
#     :param angle_y: 绕Y轴的旋转角度（较小）
#     :param angle_z: 绕Z轴的旋转角度（较小）
#     :param focal_length: 相机焦距（毫米）
#     :return: 变换后的图像
#     """
#     h, w = image.shape[:2]
#     # 将焦距从毫米转换为像素单位
#     focal_length_px = (focal_length / 36) * w  # 假设35mm全幅相机
#
#     # 将角度转换为弧度
#     ax, ay, az = np.deg2rad(angle_x), np.deg2rad(angle_y), np.deg2rad(angle_z)
#
#     # 构建旋转矩阵
#     Rx = np.array([[1, 0, 0], [0, np.cos(ax), -np.sin(ax)], [0, np.sin(ax), np.cos(ax)]])
#     Ry = np.array([[np.cos(ay), 0, np.sin(ay)], [0, 1, 0], [-np.sin(ay), 0, np.cos(ay)]])
#     Rz = np.array([[np.cos(az), -np.sin(az), 0], [np.sin(az), np.cos(az), 0], [0, 0, 1]])
#     R = Rz @ Ry @ Rx
#
#     # 计算投影矩阵
#     K = np.array([[focal_length_px, 0, w / 2], [0, focal_length_px, h / 2], [0, 0, 1]])
#     P = K @ R @ np.linalg.inv(K)  # 考虑焦点的投影矩阵
#     print(P.shape)
#     # 应用透视变换
#     transformed_image = cv2.warpPerspective(image, P, (w, h))
#     ls = []
#     with open(image_name_txt,"r") as f:
#         line = json.load(f)
#         print(line)
#         for index,info in enumerate(line):
#             txts = info.get("transcription")
#             boxes = info.get("points")
#             diff = info.get("difficult")
#             print(txts,boxes,diff)
#             dist_list = []
#             for index, ocr_qc in enumerate(boxes):
#                 # box_center_ocr = get_single_center_points(ocr_qc[0])
#                 point_homog = np.array([0, 0, 1])
#                 point_transformed_homog = np.dot(P, point_homog)
#                 other_y = point_transformed_homog[1]
#                 transformed_point = cv2.perspectiveTransform(ocr_qc, P)
#                 # dist_list.append(other_y - p_a_new[1])
#             dist_list = sorted(dist_list)
#             #boxes坐标透视变换
#             point_transformed_homog = np.dot(P, boxes)
#             dict = {"transcription": " ", "points": [[931, 1136], [1035, 1228], [981, 1285], [882, 1195]],"difficult": False}
#             dict["transcription"]=txts
#             dict["points"]=point_transformed_homog
#             dict["difficult"]=diff
#             ls.append(dict)
#     print("ls:",ls)
#
#     return transformed_image


# def gen_new_img(image):
#     gamma_v = random.randint(5, 15) / 10.0
#     gamma_img = gamma_correct(image, gamma_v)
#     r_angle = [random.randint(-30, 30) / 10.0 for _ in range(3)]
#     gamma_img = rotate_image(gamma_img, r_angle[0], r_angle[1], r_angle[2], 12.0)
#     # gamma_img = cv2.flip(gamma_img, flipCode=1)
#     if random.randint(0, 1) == 0:
#         gamma_img = cv2.rotate(gamma_img, cv2.ROTATE_180)
#     return gamma_img

if __name__ == '__main__':
    # Write a gamma-adjusted copy ("2_" prefix) of every image in img_dir.
    img_dir = r"F:\Desktop\training"
    out_dir = r"F:\Desktop\traing_brighting"
    # Label directory; not used by this gamma-only pipeline.
    img_dir_txt = r"F:\Desktop\training_txt1"
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    for path in os.listdir(img_dir):
        if not path.endswith((".bmp", ".jpg", ".png")):
            continue
        path1 = os.path.join(img_dir, path)
        print(path1)
        img = cv2.imread(path1)
        if img is None:
            # cv2.imread returns None instead of raising for unreadable files.
            print("skip unreadable image:", path1)
            continue
        # Random gamma in [0.5, 1.5] with a step of 0.1.
        gamma_v = random.randint(5, 15) / 10.0
        gamma_img = gamma_correct(img, gamma_v)
        cv2.imwrite(os.path.join(out_dir, "2_" + path), gamma_img)

数据增强检测和识别合并一起

import os
import cv2
import random
import json
import math
import numpy as np
# from scipy.linalg import block_diag
import shutil

def gamma_correct(img_src, gamma):
    """Apply gamma correction to an image through a 256-entry lookup table.

    Each level ``i`` is mapped to ``255 * (i / 255) ** (1 / gamma)``, so
    ``gamma > 1`` brightens the image and ``gamma < 1`` darkens it.

    Parameters:
        img_src: Image passed to ``cv2.LUT`` (8-bit input is expected by
            OpenCV's LUT).
        gamma (float): Gamma value; must be positive.

    Returns:
        The corrected image as returned by ``cv2.LUT``.

    Raises:
        ValueError: If ``gamma`` is not positive.
    """
    if gamma <= 0:
        raise ValueError("gamma must be positive, got %r" % (gamma,))
    levels = np.arange(256, dtype=np.float64) / 255.0
    # Round to the nearest level: a plain astype(uint8) truncates and biases
    # every entry of the table downward.
    gamma_table = np.rint(np.power(levels, 1.0 / gamma) * 255.0).astype(np.uint8)
    return cv2.LUT(img_src, gamma_table)

def rotate_point(point, angle_degrees, center=(0, 0)):
    """
    Rotate a 2-D point about ``center`` using the standard rotation matrix.

    A positive angle is counterclockwise in a conventional y-up coordinate
    system; in image coordinates (y grows downwards) the same formula appears
    clockwise on screen.

    Parameters:
        point (tuple[float, float]): The x, y coordinates of the point to rotate.
        angle_degrees (float): Rotation angle in degrees.
        center (tuple[float, float]): The x, y coordinates of the pivot.

    Returns:
        tuple[float, float]: The x, y coordinates of the rotated point.
    """
    radians = math.radians(angle_degrees)
    sin_a = math.sin(radians)
    cos_a = math.cos(radians)
    # Express the point relative to the pivot.
    offset_x = point[0] - center[0]
    offset_y = point[1] - center[1]
    # Apply the 2x2 rotation to the offset.
    new_x = offset_x * cos_a - offset_y * sin_a
    new_y = offset_x * sin_a + offset_y * cos_a
    # Move the result back into the original coordinate frame.
    return (new_x + center[0], new_y + center[1])

def get_line(linea_value, rotation_angle=None, rotation_center=None):
    """Rotate the polygon of every annotation and return the result as JSON.

    Parameters:
        linea_value (list[dict]): Annotations, each with the keys
            "transcription", "points" (a sequence of x, y pairs) and
            "difficult".
        rotation_angle (float | None): Angle in degrees by which the image was
            rotated. When omitted, the module-level name ``angle`` is used
            (the behaviour the driver script relies on).
        rotation_center (tuple[float, float] | None): Pivot of the rotation.
            When omitted, the module-level name ``center`` is used.

    Returns:
        str: JSON text of the annotation list with rotated, integer points.

    Raises:
        NameError: If an argument is omitted and the corresponding
            module-level name is not defined.
    """
    # Backward compatibility: existing callers pass only ``linea_value`` and
    # set ``angle`` / ``center`` as module globals beforehand.
    if rotation_angle is None:
        rotation_angle = angle
    if rotation_center is None:
        rotation_center = center

    rotated_items = []
    for info in linea_value:
        new_boxes = []
        for box in info.get("points"):
            point = (int(box[0]), int(box[1]))
            # The angle is negated before being handed to rotate_point,
            # matching how the driver pairs it with cv2.getRotationMatrix2D.
            moved = rotate_point(point, -rotation_angle, rotation_center)
            # Round to the nearest pixel rather than truncating toward zero.
            new_boxes.append((int(round(moved[0])), int(round(moved[1]))))
        rotated_items.append({
            "transcription": info.get("transcription"),
            "points": new_boxes,
            "difficult": info.get("difficult"),
        })
    return json.dumps(rotated_items)

# Filter helper
def filter_out_of_bounds(items, width, height):
    """Return the items whose every point lies inside a width x height image.

    A point (x, y) is inside when ``0 <= x < width`` and ``0 <= y < height``.
    Items are returned in their original order; an item with no points is kept.
    """
    def inside(pt):
        x, y = pt
        return 0 <= x < width and 0 <= y < height

    return [item for item in items if all(inside(pt) for pt in item["points"])]

def RGB2mosaicRGB(img, k=0):
    '''
    Resample an image into a 3x-larger mosaic of single-channel subpixels.

    Every input pixel becomes a 3x3 cell laid out as
    [K, K, K; c0, c1, c2; c0, c1, c2], where K is the background level ``k``
    and c0/c1/c2 are the pixel's three channels, each written only into its
    own channel plane. This simulates an image displayed on an LCD; the
    result is darker than the input.

    img: numpy array of shape (h, w, c) with at least three channels
    k: int 0-255, background level. The smaller, the darker
    return:
        mosaic: uint8 numpy array of shape (3*h, 3*w, c)
    '''
    h, w, c = img.shape
    scale = 3
    h_, w_ = int(h * scale), int(w * scale)
    mosaic = np.ones((h_, w_, c), np.uint8) * k
    # Middle row of every cell: channel n goes to subpixel column n. Strided
    # slices replace the per-pixel Python loops with identical results.
    mosaic[1::scale, 0::scale, 0] = img[:, :, 0]
    mosaic[1::scale, 1::scale, 1] = img[:, :, 1]
    mosaic[1::scale, 2::scale, 2] = img[:, :, 2]
    # Bottom row of every cell repeats the middle row.
    mosaic[2::scale, :, :] = mosaic[1::scale, :, :]
    return mosaic


def toushi(img, margin=50):
    """Zoom into an image with a perspective warp ("toushi" = perspective).

    The rectangle inset by ``margin`` pixels from every border is mapped onto
    the full image frame, so the border strip is cropped away and the
    remaining content is stretched to the original size.

    Parameters:
        img: Image array; only its first two dimensions (height, width) are
            read here.
        margin (int): Inset in pixels from each border. Defaults to 50, the
            previously hard-coded value.
            NOTE(review): the source quad collapses or flips when the image
            is not larger than ``2 * margin`` in both dimensions — callers
            should avoid such inputs.

    Returns:
        The warped image, same width and height as the input.
    """
    height, width = img.shape[:2]
    # Source quad: the inset rectangle (not the image corners).
    src_quad = np.float32([
        [margin, margin],
        [width - margin, margin],
        [margin, height - margin],
        [width - margin, height - margin],
    ])
    # Destination quad: the full image frame.
    dst_quad = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    perspective_matrix = cv2.getPerspectiveTransform(src_quad, dst_quad)
    return cv2.warpPerspective(img, perspective_matrix, (width, height))


if __name__ == '__main__':
    # Input dataset: images plus detection (Label.txt) and recognition
    # (rec_gt.txt) label files.
    img_dir = "/data1/thl/PaddleOCR/train_data/japan_chinese20240619"
    img_label_txt = os.path.join(img_dir, "Label.txt")
    img_rec_gt_txt = os.path.join(img_dir, "rec_gt.txt")
    # Output dataset.
    file_name = "japan_chinese_final20240619"
    out_dir = f"/data1/thl/PaddleOCR/train_data/{file_name}"
    os.makedirs(out_dir, exist_ok=True)
    # Label files are rebuilt from scratch on every run (opened with "w").
    det_label_path = os.path.join(out_dir, "Label.txt")
    rec_label_path = os.path.join(out_dir, "rec_gt.txt")

    # Inset used by toushi(); kept in sync here so that the label points can
    # be warped with the same transform as the image.
    perspective_margin = 50

    # ---------------- Detection augmentation ----------------
    with open(img_label_txt, encoding="utf-8") as f:
        lines = f.readlines()
    with open(det_label_path, "w", encoding="utf-8") as det_out:
        for line in lines:
            # Strip the line ending so every record written below gets exactly
            # one newline, even when the source line has none.
            linea = line.rstrip("\r\n").split("\t")
            if len(linea) < 2:
                continue
            line0, line1 = linea[0], linea[1]
            img_name = line0.split("/")[-1]
            print(img_name)
            img_path = os.path.join(img_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None instead of raising for unreadable files.
                print(f"skip unreadable image: {img_path}")
                continue
            # splitext handles extensions of any length (".jpg", ".jpeg", ...).
            stem, ext = os.path.splitext(img_name)
            linea_value = json.loads(line1)

            gamma_v = random.randint(5, 15) / 10.0
            gamma_img = gamma_correct(img, gamma_v)
            # Image size and rotation centre. ``center`` (and ``angle`` below)
            # are module-level names that get_line() reads.
            (h, w) = gamma_img.shape[:2]
            center = (w / 2, h / 2)

            # Original image and label.
            shutil.copy(img_path, os.path.join(out_dir, img_name))
            det_out.write(f"{file_name}/{img_name}\t{line1}\n")

            # Moire-style augmentation: shrink to 1/3, then expand every pixel
            # into a 3x3 subpixel cell. The result is within 2 px of the
            # original size, so the original label is reused.
            new_height = max(1, h // 3)
            new_width = max(1, w // 3)
            image1 = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
            k = random.randint(130, 170)
            transformed_image = RGB2mosaicRGB(image1, k)
            transformed_image = cv2.cvtColor(transformed_image, cv2.COLOR_BGR2GRAY)
            mosaic_img_name = f"{stem}_mosaic{ext}"
            cv2.imwrite(os.path.join(out_dir, mosaic_img_name), transformed_image)
            det_out.write(f"{file_name}/{mosaic_img_name}\t{line1}\n")

            # Perspective augmentation (file suffix "_noise" kept for
            # compatibility with existing datasets).
            if min(h, w) > 2 * perspective_margin:
                perspective_img = toushi(img)
                noise_img_name = f"{stem}_noise{ext}"
                cv2.imwrite(os.path.join(out_dir, noise_img_name), perspective_img)
                # toushi() zooms into the image, so the label points must be
                # moved by the same transform instead of being copied as-is.
                m = perspective_margin
                src_quad = np.float32([[m, m], [w - m, m], [m, h - m], [w - m, h - m]])
                dst_quad = np.float32([[0, 0], [w, 0], [0, h], [w, h]])
                persp_matrix = cv2.getPerspectiveTransform(src_quad, dst_quad)
                warped_items = []
                for item in linea_value:
                    if not item.get("points"):
                        warped_items.append(item)
                        continue
                    pts = np.float32(item["points"]).reshape(-1, 1, 2)
                    moved = cv2.perspectiveTransform(pts, persp_matrix).reshape(-1, 2)
                    warped_items.append({
                        **item,
                        "points": [[int(round(float(x))), int(round(float(y)))]
                                   for x, y in moved],
                    })
                # NOTE(review): points that fall outside the frame after the
                # warp are kept, as they are for the rotations below.
                det_out.write(
                    f"{file_name}/{noise_img_name}\t"
                    f"{json.dumps(warped_items, ensure_ascii=False)}\n"
                )
            else:
                print(f"skip perspective augmentation (image too small): {img_name}")

            # Rotations in 30-degree steps (30 .. 330) of the gamma-adjusted image.
            for i in range(1, 12):
                angle = int(i * 30)
                M = cv2.getRotationMatrix2D(center, angle, 1.0)
                rotated = cv2.warpAffine(gamma_img, M, (w, h))
                formatted_path = f"{stem}_{angle}{ext}"
                cv2.imwrite(os.path.join(out_dir, formatted_path), rotated)
                # get_line() picks up the module-level ``angle`` and ``center``
                # and returns the rotated annotations as a JSON string.
                ls = get_line(linea_value)
                det_out.write(f"{file_name}/{formatted_path}\t{ls}\n")

    # ---------------- Recognition augmentation ----------------
    out_crop_pic_dir = os.path.join(out_dir, "crop_img")
    os.makedirs(out_crop_pic_dir, exist_ok=True)
    with open(img_rec_gt_txt, encoding="utf-8") as f:
        lines = f.readlines()
    with open(rec_label_path, "w", encoding="utf-8") as rec_out:
        for line in lines:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue
            line0, line1 = fields[0], fields[1]
            img_name = line0.split("/")[-1]
            print(img_name)
            img = cv2.imread(os.path.join(img_dir, line0))
            if img is None:
                print(f"skip unreadable image: {os.path.join(img_dir, line0)}")
                continue
            stem, ext = os.path.splitext(img_name)

            # Original crop.
            cv2.imwrite(os.path.join(out_crop_pic_dir, img_name), img)
            rec_out.write(f"crop_img/{img_name}\t{line1}\n")

            # Grayscale copy.
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img_gray_name = f"{stem}_gray{ext}"
            cv2.imwrite(os.path.join(out_crop_pic_dir, img_gray_name), gray)
            rec_out.write(f"crop_img/{img_gray_name}\t{line1}\n")

            # 180-degree rotation (flip around both axes).
            img_rotated_180 = cv2.flip(img, -1)
            img_rotate180_name = f"{stem}_180{ext}"
            cv2.imwrite(os.path.join(out_crop_pic_dir, img_rotate180_name), img_rotated_180)
            rec_out.write(f"crop_img/{img_rotate180_name}\t{line1}\n")

            # Gamma augmentation.
            gamma_v = random.randint(5, 15) / 10.0
            gamma_img = gamma_correct(img, gamma_v)
            img_gamma_name = f"{stem}_gamma{ext}"
            cv2.imwrite(os.path.join(out_crop_pic_dir, img_gamma_name), gamma_img)
            rec_out.write(f"crop_img/{img_gamma_name}\t{line1}\n")

            # Moire-style augmentation: shrink to 1/3, then expand into
            # subpixel cells (see the detection stage above).
            height, width = img.shape[:2]
            new_height = max(1, height // 3)
            new_width = max(1, width // 3)
            image = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
            k = random.randint(130, 170)
            transformed_image = RGB2mosaicRGB(image, k)
            transformed_image = cv2.cvtColor(transformed_image, cv2.COLOR_BGR2GRAY)
            mosaic_img_name = f"{stem}_mosaic{ext}"
            cv2.imwrite(os.path.join(out_crop_pic_dir, mosaic_img_name), transformed_image)
            rec_out.write(f"crop_img/{mosaic_img_name}\t{line1}\n")