目录
1. 下载
1.1 官网链接
目前官网有4个任务,第1个任务就是文本定位:
1.2 网盘链接
将官网的文件处理成能够训练的格式:
# Create the dataset root and enter it; the relative paths below are resolved from here.
mkdir icdar2015 && cd icdar2015
# Create the two sub-folders that will hold the images and the annotations.
mkdir imgs && mkdir annotations
# For images,
# NOTE(review): assumes the extracted ch4_* folders are already in the current directory - confirm.
mv ch4_training_images imgs/training
mv ch4_test_images imgs/test
# For annotations,
mv ch4_training_localization_transcription_gt annotations/training
mv Challenge4_Test_Task1_GT annotations/test
# Run the converter for the "training" and "test" splits, using the dataset root as both input and output (-o).
# NOTE(review): the script path is relative, so this is presumably run from the repository root - confirm.
python tools/data/textdet/icdar_converter.py /path/to/icdar2015 -o /path/to/icdar2015 -d icdar2015 --split-list training test
链接:https://pan.baidu.com/s/1yypel9p30ws6mWYZd14-iw
提取码:inc5
--来自百度网盘超级会员V4的分享
文件格式如下:
annotations
test
training
imgs
test
training
instances_test.json
instances_training.json
2. 文件内容介绍
imgs文件夹就是一些图片,如下。
annotations文件夹是标注文件。
2.1 gt_img_1.txt
377,117,463,117,465,130,378,130,Genaxis Theatre
493,115,519,115,519,131,493,131,[06]
374,155,409,155,409,170,374,170,###
492,151,551,151,551,170,492,170,62-03
376,198,422,198,422,212,376,212,Carpark
494,190,539,189,539,205,494,206,###
374,1,494,0,492,85,372,86,###
对应的图片:
(1)377,117,463,117,465,130,378,130,Genaxis Theatre
是上面图片上的文本标注(多边形4个顶点+文本内容):x1,y1,x2,y2,x3,y3,x4,y4,文本内容。
(2)###
文本内容为“###”表示忽略标注:该区域会被丢弃,不参与训练。
代码读取gt.txt文件,并画出多边形
import cv2
import numpy
def parse_gt_points(line):
    """Parse the four polygon vertices from one ground-truth line.

    A line has the form ``x1,y1,x2,y2,x3,y3,x4,y4,text``. Only the first
    eight fields are read, so a transcription that itself contains commas
    does not leak into the coordinates.

    Args:
        line (str): One line of a ``gt_img_*.txt`` file.

    Returns:
        list[list[int]] | None: ``[[x1, y1], ..., [x4, y4]]``, or ``None``
        when the line has fewer than eight comma-separated fields (for
        example a blank line).
    """
    fields = line.strip().split(',')
    if len(fields) < 8:
        return None
    coords = [int(value) for value in fields[:8]]
    return [[coords[i], coords[i + 1]] for i in range(0, 8, 2)]


if __name__ == '__main__':
    img_path = r"E:\img_1.jpg"
    gt_path = r"E:\gt_img_1.txt"

    img = cv2.imread(img_path)
    # cv2.imread signals an unreadable file by returning None, not by raising.
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {img_path}")

    # The file is opened with 'utf-8-sig' so a leading BOM is not glued to
    # the first coordinate.
    with open(gt_path, mode="r", encoding='utf-8-sig') as f:
        for gt_line in f:
            points = parse_gt_points(gt_line)
            if points is None:
                continue
            # drawContours expects 32-bit integer points; numpy's default
            # integer type is 64-bit on most non-Windows platforms.
            pt_array = numpy.array(points, dtype=numpy.int32)
            cv2.drawContours(img, [pt_array], -1, (0, 0, 255), 3)

    cv2.namedWindow("img", cv2.WINDOW_NORMAL)
    cv2.imshow("img", img)
    cv2.waitKey()
2.2 instances_training.json
{
"images": [
# images里面存放的是字典,每个字典内容:图片路径、高、宽、对应的字符分割标注文件路径、图片id;
{"file_name": "training/img_927.jpg", "height": 720, "width": 1280, "segm_file": "training/gt_img_927.txt", "id": 0},
{"file_name": "training/img_573.jpg", "height": 720, "width": 1280, "segm_file": "training/gt_img_573.txt", "id": 1},
{"file_name": "training/img_510.jpg", "height": 720, "width": 1280, "segm_file": "training/gt_img_510.txt", "id": 2},
{"file_name": "training/img_468.jpg", "height": 720, "width": 1280, "segm_file": "training/gt_img_468.txt", "id": 3},
{"file_name": "training/img_827.jpg", "height": 720, "width": 1280, "segm_file": "training/gt_img_827.txt", "id": 4}
],
"categories": [{"id": 1, "name": "text"}], # 这个数据集只有一个类别,即文本,id设置为1.
"annotations": [
{"iscrowd": 1,
"category_id": 1,
"bbox": [216.0, 222.0, 93.0, 37.0],
"area": 2054.0,
"segmentation": [[217, 235, 304, 222, 309, 243, 216, 259]],
"image_id": 0,
"id": 0
},
# 所有图片的所有标注信息
# "iscrowd":等于1则是无用标注;默认等于0是有用标注;
# "category_id": 类别id,本数据集只有文本一个类别,固定为1;
# "bbox":文本区域的轴对齐外接矩形(min_x, min_y, w, h);
# "area": 多边形文本区域面积;
# "segmentation": 多边形文本区域的4个顶点坐标,展平后放在一个列表中:[[x1, y1, x2, y2, x3, y3, x4, y4]]
# "image_id": 此条标注所在的图片id;
# "id":此条标注id.
{"iscrowd": 1, "category_id": 1, "bbox": [296.0, 417.0, 85.0, 45.0], "area": 2303.0, "segmentation": [[296, 434, 374, 417, 381, 445, 303, 462]], "image_id": 0, "id": 1},
{"iscrowd": 0, "category_id": 1, "bbox": [374.0, 163.0, 27.0, 14.0], "area": 314.0, "segmentation": [[374, 165, 400, 163, 401, 175, 375, 177]], "image_id": 0, "id": 2},
{"iscrowd": 0, "category_id": 1, "bbox": [400.0, 161.0, 31.0, 13.0], "area": 301.5, "segmentation": [[400, 163, 428, 161, 431, 171, 403, 174]], "image_id": 0, "id": 3},
]
}
3. 生成json脚本
3.1 载入图片和标注信息
def load_img_info(files, dataset):
    """Load the information of one image.

    Args:
        files (tuple): The tuple of (img_file, groundtruth_file).
        dataset (str): Dataset name, icdar2015 or icdar2017.

    Returns:
        dict: The image and annotation information, with keys
        ``file_name``, ``height``, ``width``, ``anno_info`` (a list of
        per-polygon annotation dicts) and ``segm_file``.

    Raises:
        NotImplementedError: If ``dataset`` is neither ``icdar2015`` nor
            ``icdar2017``.
    """
    assert isinstance(files, tuple)
    assert isinstance(dataset, str)
    assert dataset

    img_file, gt_file = files
    # read imgs with ignoring orientations
    img = mmcv.imread(img_file, 'unchanged')

    if dataset == 'icdar2017':
        gt_list = list_from_file(gt_file)
    elif dataset == 'icdar2015':
        # Read the txt file into a list of lines; 'utf-8-sig' presumably
        # drops a leading BOM so the first coordinate parses as an int.
        gt_list = list_from_file(gt_file, encoding='utf-8-sig')
    else:
        raise NotImplementedError(f'Not support {dataset}')

    # Index of the transcription field: icdar2015 lines are
    # "x1,y1,...,x4,y4,text", icdar2017 lines carry one extra field before
    # the text (e.g. "695,885,866,888,867,1146,696,1143,Latin,9").
    text_index = 8 if dataset == 'icdar2015' else 9

    anno_info = []
    for line in gt_list:
        # each line has one polygon (4 vertices), and others.
        line = line.strip()
        if not line:
            # A blank line carries no annotation; parsing it would fail
            # on int('').
            continue
        strs = line.split(',')
        category_id = 1
        # The first eight fields are the polygon coordinates
        # [x1, y1, x2, y2, x3, y3, x4, y4].
        xy = [int(x) for x in strs[0:8]]
        # Reshape the flat list of 8 values into 4 (x, y) vertex rows.
        coordinates = np.array(xy).reshape(-1, 2)
        polygon = Polygon(coordinates)

        # iscrowd == 0 marks a usable annotation; a transcription of
        # '###' marks one to be ignored, flagged with iscrowd == 1.
        iscrowd = 0
        if strs[text_index] == '###':
            iscrowd = 1
            print('ignore text')

        area = polygon.area
        # convert to COCO style XYWH format
        min_x, min_y, max_x, max_y = polygon.bounds
        bbox = [min_x, min_y, max_x - min_x, max_y - min_y]

        anno = dict(
            iscrowd=iscrowd,
            category_id=category_id,
            bbox=bbox,  # [min_x, min_y, width, height] of the polygon bounds
            area=area,  # area of the polygon text region
            segmentation=[xy])  # flattened polygon vertices
        anno_info.append(anno)

    # The name of the directory holding the image is used as the split
    # prefix of the stored relative paths.
    split_name = osp.basename(osp.dirname(img_file))
    img_info = dict(
        # remove img_prefix for filename
        file_name=osp.join(split_name, osp.basename(img_file)),
        height=img.shape[0],
        width=img.shape[1],
        anno_info=anno_info,
        segm_file=osp.join(split_name, osp.basename(gt_file)))
    return img_info
传入图片路径和对应的标注.txt路径,返回字典
3.2 保存成json
将前面得到的img_info字典信息,保存成json文件。
def convert_annotations(image_infos, out_json_name):
    """Convert the annotation into coco style.

    The input dicts are left untouched: ids are assigned on shallow copies,
    so the same ``image_infos`` list can be converted more than once.

    Args:
        image_infos (list): The list of image information dicts. Each dict
            must contain an ``anno_info`` key holding a list of annotation
            dicts.
        out_json_name (str): The output json filename (full path).

    Returns:
        dict: The coco style dict with keys ``images``, ``categories`` and,
        when at least one annotation exists, ``annotations``.

    Raises:
        KeyError: If an image information dict has no ``anno_info`` key.
    """
    assert isinstance(image_infos, list)
    assert isinstance(out_json_name, str)
    assert out_json_name

    out_json = dict()
    out_json['images'] = []
    out_json['categories'] = []
    out_json['annotations'] = []

    ann_id = 0  # annotation ids run continuously across all images
    for img_id, image_info in enumerate(image_infos):
        # Work on a copy so the caller's dict keeps its 'anno_info' entry.
        image_entry = dict(image_info)
        image_entry['id'] = img_id
        anno_infos = image_entry.pop('anno_info')
        out_json['images'].append(image_entry)
        for anno_info in anno_infos:
            # Attach the owning image id and a unique annotation id.
            anno_entry = dict(anno_info)
            anno_entry['image_id'] = img_id
            anno_entry['id'] = ann_id
            out_json['annotations'].append(anno_entry)
            ann_id += 1

    # The only category written is "text" with id 1.
    cat = dict(id=1, name='text')
    out_json['categories'].append(cat)

    # Drop the 'annotations' key entirely when no image has any annotation.
    if not out_json['annotations']:
        out_json.pop('annotations')
    mmcv.dump(out_json, out_json_name)

    return out_json