目录
- 数据集介绍
- labelme格式转换及可视化
- yolov8旋转框检测格式转换
1.数据集介绍
SynthText800k是一个大型的文本训练数据集,包含文本实例、单词实例、字符实例(大小写字母、数字、标点字符),数据集结构如下:
imgs文件夹包含202个文件夹,858750张合成图片,每个文件夹对应一个场景。
gt.mat保存了标注实例信息,其格式官方描述如下:
其中,charBB的box顺序与txt中文本切分成字符后顺序一一对应,将txt文本切分即可得到charBB的label。
2.labelme格式转换及可视化
以下是gt.mat中charBB转labelme的json标注格式的代码,因为我的需求是字母检测,所以剔除了非字母字符,实际可以根据需求更改逻辑判断。
# Convert the SynthText character boxes (charBB) of every image into a labelme
# JSON annotation file, copying each image into out_dir under an index-based name.
# Relies on names defined earlier in the script: img_name_list, charBB_list,
# text_list, img_dir and out_dir.
for i in tqdm(range(img_name_list.size)):
    img_name = img_name_list[i][0]
    img_path = os.path.join(img_dir, img_name)

    # Outputs are named after the running index: <i><ext> and <i>.json.
    # Building the names from splitext keeps the label path distinct from the
    # image path for every extension (a plain ".jpg" -> ".json" replace would
    # make the JSON overwrite the image for non-.jpg inputs).
    extension = os.path.splitext(img_name)[1]
    out_image_name = str(i) + extension
    out_img_path = os.path.join(out_dir, out_image_name)
    out_label_path = os.path.join(out_dir, str(i) + ".json")

    # Decode the image once: take its size for the annotation and re-save it.
    with Image.open(img_path) as source_image:
        img_width, img_height = source_image.size
        source_image.save(out_img_path)

    labelme_data = {
        "version": "5.0.1",
        "flags": {},
        "imagePath": os.path.basename(out_image_name),
        "shapes": [],
        "imageData": None,
    }

    # Character boxes of this image, indexed as [coordinate row, corner, character].
    char_bboxes = charBB_list[i]
    if char_bboxes.ndim == 2:
        # NOTE(review): presumably an image with a single character is stored
        # as (2, 4); add the character axis so the indexing below still works.
        char_bboxes = char_bboxes[:, :, None]

    # Split every text instance into characters, dropping spaces and newlines,
    # so that the k-th character labels the k-th box.
    texts = text_list[i]
    text = []
    for entry in texts:
        text.extend(entry.replace(" ", "").replace("\n", ""))

    # Boxes and characters must line up one-to-one. Raise instead of assert so
    # the check is not stripped when Python runs with -O.
    if char_bboxes.shape[2] != len(text):
        raise ValueError(
            "[INFO]{}:\n\tcharBB and text have different length\n\tcharBB.shape[2] = {}, len(text) = {}\n\ttext:{}".format(
                img_name,
                char_bboxes.shape[2],
                len(text),
                text,
            )
        )

    for j, label in enumerate(text):
        # Only letters are kept; adjust this test to keep other characters.
        if not label.isalpha():
            continue
        # Polygon of the current character: four [x, y] integer corners.
        bbox = char_bboxes[:, :, j]
        points = [[int(bbox[0][m]), int(bbox[1][m])] for m in range(4)]
        labelme_data["shapes"].append({
            'points': points,
            'group_id': None,
            'label': label,
            'shape_type': "polygon",
            'flags': {},
        })

    labelme_data["imageHeight"] = img_height
    labelme_data["imageWidth"] = img_width

    # The with-statement closes the file; no explicit close() is needed.
    with open(out_label_path, 'w', encoding="utf-8") as output_json_file:
        json.dump(labelme_data, output_json_file, indent=4)
打开labelme,可视化结果如下所示:
3.yolov8旋转框检测格式转换
yolov8 OBB遵循以下格式,坐标在0-1之间归一化:
class_index x1 y1 x2 y2 x3 y3 x4 y4
可通过cv2.minAreaRect函数返回不规则四边形的最小外接矩形,并通过cv2.boxPoints获取四个点坐标:
def coordinates_to_rotated_rect(points):
    """Return the corners of the minimum-area rotated rectangle around points.

    Args:
        points: Sequence or array of (x, y) coordinates, e.g. the four corners
            of a character quadrilateral.

    Returns:
        A (4, 2) integer NumPy array holding the rectangle corners, with the
        fractional part of every coordinate truncated.
    """
    # cv2.minAreaRect only accepts int32/float32 point sets, while np.array()
    # on Python ints yields the platform default (int64 on Linux/macOS).
    contour = np.asarray(points, dtype=np.float32)
    rect = cv2.minAreaRect(contour)
    box = cv2.boxPoints(rect)
    # np.int0 was an alias of np.intp and was removed in NumPy 2.0.
    point_obb = box.astype(np.intp)
    return point_obb
将上文labelme转换代码中构造points和shape_info的部分改成以下代码:
# Integer corners of the current character, built from the two rows of bbox
# (row 0 and row 1 supply the first and second value of every point).
first_row, second_row = bbox[0], bbox[1]
points = [[int(first_row[m]), int(second_row[m])] for m in range(4)]
# Replace the raw quadrilateral with the four corners of its rotated rectangle.
rect_obb = coordinates_to_rotated_rect(np.array(points)).tolist()
shape_info = dict(
    points=rect_obb,
    group_id=None,
    label=label,
    shape_type="polygon",
    flags={},
)
结果如下:
转换为yolov8 标准格式:
def letter_to_number(letter):
    """Map an ASCII letter to its class index.

    Lowercase 'a'-'z' map to 0-25 and uppercase 'A'-'Z' map to 26-51.
    Any other character yields None.
    """
    if 'a' <= letter <= 'z':
        first_letter, offset = 'a', 0
    elif 'A' <= letter <= 'Z':
        first_letter, offset = 'A', 26
    else:
        return None
    return ord(letter) - ord(first_letter) + offset
def normalize_points(image_width, image_height, points, decimal_places=3):
    """Scale pixel coordinates by the image size and round them.

    Args:
        image_width: Divisor applied to the first value of every point.
        image_height: Divisor applied to the second value of every point.
        points: Iterable of points; index 0 and index 1 of each are used.
        decimal_places: Number of digits kept after the decimal point.

    Returns:
        A list of [x, y] float pairs, each rounded by fixed-point formatting.
    """
    spec = "." + str(decimal_places) + "f"

    def _scaled(value, extent):
        # Round through fixed-point text formatting, then convert back to float.
        return float(format(value / extent, spec))

    return [
        [_scaled(pt[0], image_width), _scaled(pt[1], image_height)]
        for pt in points
    ]
def array_to_string(arr):
    """Flatten a two-dimensional sequence into one comma-separated string.

    Each row is joined with commas first, then the row strings are joined
    with commas as well.
    """
    return ','.join(
        ','.join(str(value) for value in row)
        for row in arr
    )
# Convert the SynthText character boxes (charBB) of every image into a YOLOv8
# OBB label file, copying each image into out_dir under an index-based name.
# Relies on names defined earlier in the script: img_name_list, charBB_list,
# text_list, img_dir, out_dir, letter_to_number, coordinates_to_rotated_rect
# and normalize_points.
for i in tqdm(range(img_name_list.size)):
    img_name = img_name_list[i][0]
    img_path = os.path.join(img_dir, img_name)

    # Outputs are named after the running index: <i><ext> and <i>.txt.
    # Building the names from splitext keeps the label path distinct from the
    # image path for every extension (a plain ".jpg" -> ".txt" replace would
    # make the label overwrite the image for non-.jpg inputs).
    extension = os.path.splitext(img_name)[1]
    out_img_path = os.path.join(out_dir, str(i) + extension)
    out_label_path = os.path.join(out_dir, str(i) + ".txt")

    # Decode the image once: take its size for normalization and re-save it.
    with Image.open(img_path) as source_image:
        img_width, img_height = source_image.size
        source_image.save(out_img_path)

    # Character boxes of this image, indexed as [coordinate row, corner, character].
    char_bboxes = charBB_list[i]
    if char_bboxes.ndim == 2:
        # NOTE(review): presumably an image with a single character is stored
        # as (2, 4); add the character axis so the indexing below still works.
        char_bboxes = char_bboxes[:, :, None]

    # Split every text instance into characters, dropping spaces and newlines,
    # so that the k-th character labels the k-th box.
    texts = text_list[i]
    text = []
    for entry in texts:
        text.extend(entry.replace(" ", "").replace("\n", ""))

    # Boxes and characters must line up one-to-one. Raise instead of assert so
    # the check is not stripped when Python runs with -O.
    if char_bboxes.shape[2] != len(text):
        raise ValueError(
            "[INFO]{}:\n\tcharBB and text have different length\n\tcharBB.shape[2] = {}, len(text) = {}\n\ttext:{}".format(
                img_name,
                char_bboxes.shape[2],
                len(text),
                text,
            )
        )

    # One label line per kept character, collected and written in one go.
    label_lines = []
    for j, label in enumerate(text):
        # Only letters are kept.
        if not label.isalpha():
            continue
        number = letter_to_number(label)
        if number is None:
            # isalpha() also accepts non-ASCII letters, for which there is no
            # class index; skip them instead of writing "None" as the class.
            continue

        # Polygon of the current character: four [x, y] integer corners.
        bbox = char_bboxes[:, :, j]
        points = [[int(bbox[0][m]), int(bbox[1][m])] for m in range(4)]

        # Rotated rectangle around the polygon. cv2.minAreaRect needs an
        # int32/float32 array, so the dtype is fixed explicitly here.
        rect_obb = coordinates_to_rotated_rect(np.array(points, dtype=np.int32))
        normalized_points = normalize_points(img_width, img_height, rect_obb)

        # YOLO label files are whitespace-delimited:
        # class_index x1 y1 x2 y2 x3 y3 x4 y4
        coordinates = " ".join(
            str(value) for point in normalized_points for value in point
        )
        label_lines.append("{} {}\n".format(number, coordinates))

    # An image without any kept character still gets an (empty) label file.
    # The with-statement closes the file; no explicit close() is needed.
    with open(out_label_path, 'w') as file:
        file.write("".join(label_lines))
转换后数据如下: