对于PaddleOCR中的det(文本检测)和rec(文本识别),可以分别使用不同的数据增强方法来提高模型的性能。以下是一些常见的参数修改方法:
文本检测(det)数据增强参数修改:
图像翻转:通过设置flip_ratio参数来控制图像水平翻转的概率。
随机旋转:通过设置rotation_degree参数来控制图像随机旋转的角度范围。
随机裁剪:通过设置image_shape和random_crop来进行图像的随机裁剪。
文本识别数据增强参数修改:
图像翻转:通过设置flip_horizontal_prob参数来控制图像水平翻转的概率。
随机旋转:通过设置random_rotate_range参数来控制图像随机旋转的角度范围。
随机裁剪:通过设置image_shape和random_crop来进行图像的随机裁剪。
颜色抖动:通过设置image_distort_strategy参数来进行图像颜色抖动增强。
这只是一些常见的参数修改方法,具体的数据增强方法和参数设置可以根据实际需求进行调整。
一、什么是数据增强?
数据增强是一种挖掘数据集潜力的方法,可以让数据集蕴含更多让模型有效学习的信息。这些方法是领域和任务特定的。
二、为什么需要数据增强?
1、在实际任务中,原始数据集未必完全含有解决任务所需要的充足信息。通过分析任务场景的复杂性和当前数据集的短板,有针对性地调整数据增强/增广策略,以提供更加多样的、匹配任务场景复杂性的新数据,往往可以显著地提高模型效果。
2、扩大训练数据集,抑制过拟合,提升模型的泛化能力。
三、基础的图像增强方法:
1、随机改变亮度、对比度和颜色
2、随机填充
3、随机裁剪
4、随机缩放
5、随机翻转
四、高阶的图像增强方法
1、图像变换类:
标准变换
autoaugment
randaugment
2、图像裁剪类:
cutout
randerasing
hideandseek
gridmask
3、图像混叠类:
mixup
cutmix
#识别增强
https://blog.csdn.net/hhhhhhhhhhwwwwwwwwww/article/details/125155386
smb://jczngd-nas.local/mnt/share/数据/THL/ocr_datasets/training
数据增强旋转(图旋转和点旋转)代码
import os
import cv2
import random
import json
import math
import numpy as np
from scipy.linalg import block_diag
def rotate_point(point, angle_degrees, center=(0, 0)):
    """Rotate ``point`` about ``center`` by ``angle_degrees`` degrees.

    The standard 2-D rotation matrix is applied to the offset of ``point``
    from ``center``; a positive angle is counterclockwise in a y-up frame.

    Args:
        point: ``(x, y)`` coordinates of the point to rotate.
        angle_degrees: Rotation angle in degrees.
        center: ``(x, y)`` coordinates of the pivot; defaults to the origin.

    Returns:
        A ``(x, y)`` tuple of floats holding the rotated coordinates.
    """
    theta = math.radians(angle_degrees)
    cos_t, sin_t = math.cos(theta), math.sin(theta)
    cx, cy = center[0], center[1]
    # Offset from the pivot, so the rotation happens around the origin.
    dx, dy = point[0] - cx, point[1] - cy
    # Rotate, then shift back to the pivot's frame.
    return (dx * cos_t - dy * sin_t + cx, dx * sin_t + dy * cos_t + cy)
# def rotate_points(points,center,angle):
# # 计算旋转矩阵
# M = cv2.getRotationMatrix2D(center, angle, 1.0)
# # 获取旋转后的点坐标,这里以原点(0,0)为例
# rotated_point = np.dot(M, np.array([0, 0, 1]))[:2] / rotated_point[2])
# return rotated_point
if __name__ == '__main__':
    # Rotate every image in img_dir by a random angle and write the rotated
    # image plus its rotated box annotations (one Label.txt line per image).
    img_dir = r"F:\Desktop\training"
    img_dir_txt = r"F:\Desktop\training_txt1"
    out_dir = r"H:\training_rotate2"
    # The output directory must exist before images/labels are written.
    os.makedirs(out_dir, exist_ok=True)
    label_path = os.path.join(out_dir, "Label.txt")

    for path in os.listdir(img_dir):
        if not path.endswith((".bmp", ".jpg", ".png")):
            continue
        path1 = os.path.join(img_dir, path)
        print(path1)

        # Load the annotation first so a missing label never leaves an
        # orphan rotated image behind. splitext keeps dots inside the stem.
        label_file = os.path.join(img_dir_txt, os.path.splitext(path)[0] + ".txt")
        try:
            with open(label_file, encoding="utf-8") as f:
                annotations = json.load(f)
        except FileNotFoundError:
            print(f"skip {path}: label file not found: {label_file}")
            continue

        img = cv2.imread(path1)
        if img is None:
            # cv2.imread returns None instead of raising on unreadable files.
            print(f"skip {path}: image could not be read")
            continue

        # Random rotation angle in degrees.
        angle = random.randint(0, 360)
        print(angle)
        (h, w) = img.shape[:2]
        # Rotate around the image centre, keeping the original canvas size.
        center = (w / 2, h / 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(img, M, (w, h))
        cv2.imwrite(os.path.join(out_dir, "3_" + path), rotated)

        ls = []
        for info in annotations:
            new_boxes = []
            for box in info.get("points"):
                # The matrix from cv2.getRotationMatrix2D equals the plain
                # rotation by -angle, hence the sign flip.
                rx, ry = rotate_point((int(box[0]), int(box[1])), -angle, center)
                # Round rather than truncate so values like 99.9999 do not
                # shift the box by one pixel.
                new_boxes.append([round(rx), round(ry)])
            ls.append({
                "transcription": info.get("transcription"),
                "points": new_boxes,
                "difficult": info.get("difficult"),
            })
        print(ls)

        # Write the annotation as JSON (not a Python repr) so that it can be
        # parsed back with json.loads.
        with open(label_path, "a", encoding="utf-8") as fout:
            fout.write("training/3_%s" % (path) + "\t" + json.dumps(ls, ensure_ascii=False) + "\n")
数据增强(gamma变换)代码
import cv2,os
import random
import numpy as np
import json
def gamma_correct(img_src, gamma):
    """Apply gamma correction to an image through a 256-entry lookup table.

    Each level ``i`` in 0..255 maps to ``(i / 255) ** (1 / gamma) * 255``,
    truncated to ``uint8``; ``gamma`` above 1 therefore brightens the image
    and ``gamma`` below 1 darkens it.

    Args:
        img_src: Input image, expected to be 8-bit as required by ``cv2.LUT``.
        gamma: Gamma value; must be non-zero.

    Returns:
        The corrected image produced by ``cv2.LUT``.
    """
    exponent = 1.0 / gamma
    levels = [((level / 255.0) ** exponent) * 255 for level in range(256)]
    # astype(uint8) truncates the fractional part of every table entry.
    lookup = np.array(levels).astype(np.uint8)
    return cv2.LUT(img_src, lookup)
# def get_single_center_points(box):
# x_list = [i[0] for i in box]
# y_list = [j[1] for j in box]
# xmax, xmin = np.max(x_list), np.min(x_list)
# ymax, ymin = np.max(y_list), np.min(y_list)
# center = [int((xmax + xmin) / 2), int((ymax + ymin) / 2)]
# return center
# def rotate_image(image, angle_x, angle_y, angle_z, focal_length,image_name_txt):
# """
# 对图像进行基于指定角度的轻微透视变换。
#
# :param image: 输入图像
# :param angle_x: 绕X轴的旋转角度(较小)
# :param angle_y: 绕Y轴的旋转角度(较小)
# :param angle_z: 绕Z轴的旋转角度(较小)
# :param focal_length: 相机焦距(毫米)
# :return: 变换后的图像
# """
# h, w = image.shape[:2]
# # 将焦距从毫米转换为像素单位
# focal_length_px = (focal_length / 36) * w # 假设35mm全幅相机
#
# # 将角度转换为弧度
# ax, ay, az = np.deg2rad(angle_x), np.deg2rad(angle_y), np.deg2rad(angle_z)
#
# # 构建旋转矩阵
# Rx = np.array([[1, 0, 0], [0, np.cos(ax), -np.sin(ax)], [0, np.sin(ax), np.cos(ax)]])
# Ry = np.array([[np.cos(ay), 0, np.sin(ay)], [0, 1, 0], [-np.sin(ay), 0, np.cos(ay)]])
# Rz = np.array([[np.cos(az), -np.sin(az), 0], [np.sin(az), np.cos(az), 0], [0, 0, 1]])
# R = Rz @ Ry @ Rx
#
# # 计算投影矩阵
# K = np.array([[focal_length_px, 0, w / 2], [0, focal_length_px, h / 2], [0, 0, 1]])
# P = K @ R @ np.linalg.inv(K) # 考虑焦点的投影矩阵
# print(P.shape)
# # 应用透视变换
# transformed_image = cv2.warpPerspective(image, P, (w, h))
# ls = []
# with open(image_name_txt,"r") as f:
# line = json.load(f)
# print(line)
# for index,info in enumerate(line):
# txts = info.get("transcription")
# boxes = info.get("points")
# diff = info.get("difficult")
# print(txts,boxes,diff)
# dist_list = []
# for index, ocr_qc in enumerate(boxes):
# # box_center_ocr = get_single_center_points(ocr_qc[0])
# point_homog = np.array([0, 0, 1])
# point_transformed_homog = np.dot(P, point_homog)
# other_y = point_transformed_homog[1]
# transformed_point = cv2.perspectiveTransform(ocr_qc, P)
# # dist_list.append(other_y - p_a_new[1])
# dist_list = sorted(dist_list)
# #boxes坐标透视变换
# point_transformed_homog = np.dot(P, boxes)
# dict = {"transcription": " ", "points": [[931, 1136], [1035, 1228], [981, 1285], [882, 1195]],"difficult": False}
# dict["transcription"]=txts
# dict["points"]=point_transformed_homog
# dict["difficult"]=diff
# ls.append(dict)
# print("ls:",ls)
#
# return transformed_image
# def gen_new_img(image):
# gamma_v = random.randint(5, 15) / 10.0
# gamma_img = gamma_correct(image, gamma_v)
# r_angle = [random.randint(-30, 30) / 10.0 for _ in range(3)]
# gamma_img = rotate_image(gamma_img, r_angle[0], r_angle[1], r_angle[2], 12.0)
# # gamma_img = cv2.flip(gamma_img, flipCode=1)
# if random.randint(0, 1) == 0:
# gamma_img = cv2.rotate(gamma_img, cv2.ROTATE_180)
# return gamma_img
if __name__ == '__main__':
    # Write a gamma-corrected copy ("2_" prefix) of every image in img_dir.
    img_dir = r"F:\Desktop\training"
    out_dir = r"F:\Desktop\traing_brighting"
    # Only referenced by the disabled perspective-rotation step.
    img_dir_txt = r"F:\Desktop\training_txt1"
    # The output directory must exist before images are written.
    os.makedirs(out_dir, exist_ok=True)

    for path in os.listdir(img_dir):
        if not path.endswith((".bmp", ".jpg", ".png")):
            continue
        path1 = os.path.join(img_dir, path)
        print(path1)
        img = cv2.imread(path1)
        if img is None:
            # cv2.imread returns None instead of raising on unreadable files.
            print(f"skip {path}: image could not be read")
            continue
        # Random gamma in [0.5, 1.5] in steps of 0.1.
        gamma_v = random.randint(5, 15) / 10.0
        gamma_img = gamma_correct(img, gamma_v)
        # NOTE: the perspective-rotation / 180-degree flip step that used to
        # follow here is disabled; only gamma correction is applied.
        cv2.imwrite(os.path.join(out_dir, "2_" + path), gamma_img)
数据增强检测和识别合并一起
import os
import cv2
import random
import json
import math
import numpy as np
# from scipy.linalg import block_diag
import shutil
def gamma_correct(img_src, gamma):
    """Return a gamma-corrected copy of ``img_src``.

    A 256-entry table is built where level ``i`` maps to
    ``(i / 255) ** (1 / gamma) * 255`` truncated to ``uint8``, and the table
    is applied with ``cv2.LUT``.

    Args:
        img_src: Input image, expected to be 8-bit as required by ``cv2.LUT``.
        gamma: Gamma value; must be non-zero.

    Returns:
        The image after the lookup table has been applied.
    """
    inv_gamma = 1.0 / gamma
    table = []
    for level in range(256):
        normalised = level / 255.0
        table.append((normalised ** inv_gamma) * 255)
    # Casting to uint8 drops the fractional part of each entry.
    gamma_table = np.array(table).astype(np.uint8)
    corrected = cv2.LUT(img_src, gamma_table)
    return corrected
def rotate_point(point, angle_degrees, center=(0, 0)):
    """Return ``point`` rotated around ``center``.

    The rotation uses the standard 2-D rotation matrix, so a positive
    ``angle_degrees`` turns the point counterclockwise in a y-up frame.

    Args:
        point: ``(x, y)`` coordinates to rotate.
        angle_degrees: Angle of rotation, in degrees.
        center: ``(x, y)`` pivot of the rotation; the origin by default.

    Returns:
        Tuple ``(x, y)`` of floats with the rotated coordinates.
    """
    radians = math.radians(angle_degrees)
    cosine = math.cos(radians)
    sine = math.sin(radians)

    # Work relative to the pivot.
    offset_x = point[0] - center[0]
    offset_y = point[1] - center[1]

    new_x = offset_x * cosine - offset_y * sine
    new_y = offset_x * sine + offset_y * cosine

    # Move back into the original coordinate frame.
    return (new_x + center[0], new_y + center[1])
def get_line(linea_value, angle=None, center=None):
    """Rotate every box of a detection label and return it as a JSON string.

    Each entry of ``linea_value`` is a dict with ``"transcription"``,
    ``"points"`` and ``"difficult"`` keys. Every point is rotated by
    ``-angle`` degrees around ``center`` with ``rotate_point`` and rounded to
    the nearest integer pixel.

    Args:
        linea_value: List of annotation dicts for one image.
        angle: Rotation angle in degrees. When omitted, the module-level
            ``angle`` variable is used (legacy behaviour).
        center: ``(x, y)`` rotation centre. When omitted, the module-level
            ``center`` variable is used (legacy behaviour).

    Returns:
        ``json.dumps`` of the list of rotated annotation dicts.

    Raises:
        KeyError: if ``angle``/``center`` is omitted and no module-level
            variable of that name exists.
    """
    # Older call sites pass only the label and rely on globals assigned by
    # the driver loop; keep that working while allowing explicit arguments.
    if angle is None:
        angle = globals()["angle"]
    if center is None:
        center = globals()["center"]

    rotated_entries = []
    for info in linea_value:
        new_boxes = []
        for box in info.get("points"):
            rx, ry = rotate_point((int(box[0]), int(box[1])), -angle, center)
            # Round instead of truncating: a rotated coordinate such as
            # 99.99999 must map to 100, not 99.
            new_boxes.append([round(rx), round(ry)])
        rotated_entries.append({
            "transcription": info.get("transcription"),
            "points": new_boxes,
            "difficult": info.get("difficult"),
        })
    return json.dumps(rotated_entries)
# Bounds filter
def filter_out_of_bounds(items, width, height):
    """Keep only the items whose points all lie inside the image.

    Args:
        items: Iterable of dicts, each with a ``"points"`` list of ``(x, y)``
            pairs.
        width: Image width; valid x values satisfy ``0 <= x < width``.
        height: Image height; valid y values satisfy ``0 <= y < height``.

    Returns:
        A new list with the original item objects that passed the check, in
        their original order.
    """
    def _inside(pt):
        px, py = pt
        return 0 <= px < width and 0 <= py < height

    return [entry for entry in items if all(_inside(pt) for pt in entry["points"])]
def RGB2mosaicRGB(img, k=0):
    """Resample an image into a 3x-larger mosaic of colour sub-pixels.

    Every input pixel becomes a 3x3 cell laid out as
    ``[K, K, K; c0, c1, c2; c0, c1, c2]``: the top row keeps the background
    value ``k`` in all channels, and in the two lower rows sub-pixel column
    ``n`` carries only channel ``n`` of the input pixel (the other channels
    stay at ``k``). This imitates an image shown on an LCD and makes the
    result darker than the input.

    Args:
        img: ``(h, w, c)`` numpy image with at least three channels.
        k: Background level, 0-255; the smaller, the darker.

    Returns:
        ``uint8`` array of shape ``(3 * h, 3 * w, c)``.
    """
    h, w, c = img.shape
    scale = 3
    mosaic = np.full((h * scale, w * scale, c), k, dtype=np.uint8)
    # Middle row of every cell: channel n goes to sub-pixel column n.
    # Strided slices replace the per-pixel Python loops.
    for channel in range(3):
        mosaic[1::scale, channel::scale, channel] = img[:, :, channel]
    # Bottom row of every cell repeats the middle row.
    mosaic[2::scale] = mosaic[1::scale]
    return mosaic
def toushi(img, margin=50):
    """Warp ``img`` so that its inner rectangle fills the whole frame.

    The rectangle inset by ``margin`` pixels from every border is mapped onto
    the full image with a perspective transform, i.e. the border is cropped
    away and the remainder is stretched back to the original size.

    Args:
        img: Input image as a numpy array; only ``img.shape[:2]`` is read.
        margin: Inset in pixels of the source rectangle from each border.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        The warped image, with the same width and height as ``img``.

    Raises:
        ValueError: if the image is not larger than ``2 * margin`` in both
            dimensions; the source rectangle would then be empty or
            mirrored and the transform meaningless.
    """
    height, width = img.shape[:2]
    if width <= 2 * margin or height <= 2 * margin:
        raise ValueError(
            f"image of size {width}x{height} is too small for margin={margin}"
        )
    # Corners of the inner rectangle: top-left, top-right, bottom-left,
    # bottom-right.
    pts1 = np.float32([
        [margin, margin],
        [width - margin, margin],
        [margin, height - margin],
        [width - margin, height - margin],
    ])
    # They are mapped onto the corners of the full frame, in the same order.
    pts2 = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    perspective_matrix = cv2.getPerspectiveTransform(pts1, pts2)
    perspective_img = cv2.warpPerspective(img, perspective_matrix, (width, height))
    return perspective_img
# Inset used by toushi(); must match that function's margin so that the
# warped labels line up with the warped image.
_TOUSHI_MARGIN = 50


def _tagged_name(img_name, tag):
    """Return ``img_name`` with ``_<tag>`` inserted before its extension."""
    stem, ext = os.path.splitext(img_name)
    return f"{stem}_{tag}{ext}"


def _moire_gray(img):
    """Return the grayscale LCD-moire version of ``img``, or None if too small."""
    height, width = img.shape[:2]
    # Shrink to a third first: the 3x sub-pixel mosaic then ends up at
    # (almost) the original size.
    new_height, new_width = height // 3, width // 3
    if new_height < 1 or new_width < 1:
        return None
    small = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    mosaic = RGB2mosaicRGB(small, random.randint(130, 170))
    return cv2.cvtColor(mosaic, cv2.COLOR_BGR2GRAY)


def _warp_label(entries, width, height, margin):
    """Map detection boxes through the same transform that toushi() applies."""
    src = np.float32([
        [margin, margin],
        [width - margin, margin],
        [margin, height - margin],
        [width - margin, height - margin],
    ])
    dst = np.float32([[0, 0], [width, 0], [0, height], [width, height]])
    matrix = cv2.getPerspectiveTransform(src, dst)
    warped_entries = []
    for entry in entries:
        points = entry.get("points") or []
        new_points = []
        if points:
            # cv2.perspectiveTransform expects an (N, 1, 2) float array.
            pts = np.float32(points).reshape(-1, 1, 2)
            warped = cv2.perspectiveTransform(pts, matrix).reshape(-1, 2)
            new_points = [[int(round(x)), int(round(y))] for x, y in warped.tolist()]
        warped_entries.append({**entry, "points": new_points})
    return json.dumps(warped_entries, ensure_ascii=False)


if __name__ == '__main__':
    # Input paths
    img_dir = "/data1/thl/PaddleOCR/train_data/japan_chinese20240619"
    img_label_txt = os.path.join(img_dir, "Label.txt")
    img_rec_gt_txt = os.path.join(img_dir, "rec_gt.txt")
    # Output paths
    file_name = "japan_chinese_final20240619"
    out_dir = f"/data1/thl/PaddleOCR/train_data/{file_name}"
    out_crop_pic_dir = os.path.join(out_dir, "crop_img")
    os.makedirs(out_crop_pic_dir, exist_ok=True)

    # ---- Detection data augmentation ----
    with open(img_label_txt, encoding="utf-8") as f:
        lines = f.readlines()
    # Mode "w" starts from an empty Label.txt on every run (the file used to
    # be deleted and then re-opened in append mode for each line).
    with open(os.path.join(out_dir, "Label.txt"), "w", encoding="utf-8") as det_out:
        for line in lines:
            linea = line.rstrip("\r\n").split("\t")
            if len(linea) < 2:
                continue  # blank or malformed line
            line0, line1 = linea[0], linea[1]
            img_name = line0.split("/")[-1]
            print(img_name)
            img_path = os.path.join(img_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None instead of raising.
                print(f"skip unreadable image: {img_path}")
                continue
            linea_value = json.loads(line1)

            gamma_v = random.randint(5, 15) / 10.0
            gamma_img = gamma_correct(img, gamma_v)
            (h, w) = gamma_img.shape[:2]
            # Rotation centre; get_line() reads the module-level `center`.
            center = (w / 2, h / 2)

            # Original image and label, copied unchanged.
            shutil.copy(img_path, os.path.join(out_dir, img_name))
            det_out.write(f"{file_name}/{img_name}\t{line1}\n")

            # Moire variant; the image keeps (almost) its size, so the
            # original label is reused.
            moire = _moire_gray(img)
            if moire is not None:
                mosaic_img_name = _tagged_name(img_name, "mosaic")
                cv2.imwrite(os.path.join(out_dir, mosaic_img_name), moire)
                det_out.write(f"{file_name}/{mosaic_img_name}\t{line1}\n")

            # Perspective variant (kept under its historical "_noise" name).
            # The boxes go through the same transform as the pixels; images
            # too small for the margin are skipped.
            if min(h, w) > 2 * _TOUSHI_MARGIN:
                perspective_img = toushi(img)
                noise_img_name = _tagged_name(img_name, "noise")
                cv2.imwrite(os.path.join(out_dir, noise_img_name), perspective_img)
                warped_label = _warp_label(linea_value, w, h, _TOUSHI_MARGIN)
                det_out.write(f"{file_name}/{noise_img_name}\t{warped_label}\n")

            # Rotations of the gamma-corrected image: 30, 60, ..., 330 degrees.
            for i in range(1, 12):
                # Must stay a module-level name: get_line() reads `angle`.
                angle = i * 30
                M = cv2.getRotationMatrix2D(center, angle, 1.0)
                rotated = cv2.warpAffine(gamma_img, M, (w, h))
                formatted_path = _tagged_name(img_name, angle)
                cv2.imwrite(os.path.join(out_dir, formatted_path), rotated)
                ls = get_line(linea_value)
                det_out.write(f"{file_name}/{formatted_path}\t{ls}\n")

    # ---- Recognition data augmentation ----
    with open(img_rec_gt_txt, encoding="utf-8") as f:
        lines = f.readlines()
    with open(os.path.join(out_dir, "rec_gt.txt"), "w", encoding="utf-8") as rec_out:
        for line in lines:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue  # blank or malformed line
            line0, line1 = fields[0], fields[1]
            img_name = line0.split("/")[-1]
            print(img_name)
            img = cv2.imread(os.path.join(img_dir, line0))
            if img is None:
                print(f"skip unreadable image: {line0}")
                continue

            # Original crop and label.
            cv2.imwrite(os.path.join(out_crop_pic_dir, img_name), img)
            rec_out.write(f"crop_img/{img_name}\t{line1}\n")

            # Variants that keep the same transcription: grayscale,
            # 180-degree rotation, gamma correction and moire.
            variants = [
                ("gray", cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)),
                ("180", cv2.flip(img, -1)),
                ("gamma", gamma_correct(img, random.randint(5, 15) / 10.0)),
            ]
            moire = _moire_gray(img)
            if moire is not None:
                variants.append(("mosaic", moire))
            for tag, variant in variants:
                variant_name = _tagged_name(img_name, tag)
                cv2.imwrite(os.path.join(out_crop_pic_dir, variant_name), variant)
                rec_out.write(f"crop_img/{variant_name}\t{line1}\n")