引言
- 最近用到了CASIA手写体数据集,但其中HWDB1.0~1.2系列的存储格式为gnt
- 虽说官网也给了读取方式,但是仍然具有一定门槛
- 于是上网搜了解析gnt格式的python代码,找到了《CASIA中文手写体字库gnt文件格式解析(python)》
- 这篇博客代码是2014年用python2写的,有些年久失修,正好顺手修复一下,改为了python3的
Gnt格式解析代码
import struct
from pathlib import Path
from PIL import Image
def write_txt(save_path: str, content: list, mode='w'):
    """Write the items of ``content`` to a UTF-8 text file, one item per line.

    Args:
        save_path: Path of the text file to write.
        content: Strings to write; a newline is appended after each one.
        mode: File mode handed to ``open`` (``'w'`` truncates, ``'a'`` appends).

    Returns:
        None.
    """
    with open(save_path, mode, encoding='utf-8') as out_file:
        out_file.writelines(line + '\n' for line in content)
# Convert every GNT file found in `path` into PNG images under
# `{save_dir}/images/<gnt file stem>/` and collect one "<relative path>\t<label>"
# line per image, written to `{save_dir}/gt.txt` at the end.
path = 'raw_data'
save_dir = 'HWDB1'

# Record header as read by this script (little-endian): a 4-byte size field,
# a 2-byte tag code (decoded as GBK below), then uint16 width and uint16 height.
# The header is followed by width * height single-byte pixels, row by row.
gnt_header = struct.Struct('<I2sHH')

gnt_paths = list(Path(path).iterdir())
label_list = []
for gnt_path in gnt_paths:
    count = 0
    print(gnt_path)
    save_path = f'{save_dir}/images/{gnt_path.stem}'
    with open(str(gnt_path), 'rb') as f:
        while True:
            header = f.read(gnt_header.size)
            if not header:
                # Clean end of file. (The old `f.read(1) != ""` test compared
                # bytes with str and was therefore always true in Python 3.)
                break
            if len(header) < gnt_header.size:
                print(f'{gnt_path}: truncated record header, stopping')
                break
            # The size field is not needed: width and height fix the bitmap length.
            _sample_size, tag_code, width, height = gnt_header.unpack(header)

            # Read the whole bitmap in one call instead of one byte per pixel.
            bitmap = f.read(width * height)
            if len(bitmap) < width * height:
                print(f'{gnt_path}: truncated bitmap, stopping')
                break

            count += 1
            filename = str(count) + '.png'
            try:
                tag_code = tag_code.decode('gbk').strip('\x00')
                # Grayscale bytes expanded to RGB, matching the original output
                # where all three channels carried the same pixel value.
                im = Image.frombytes('L', (width, height), bitmap).convert('RGB')
                Path(save_path).mkdir(parents=True, exist_ok=True)
                im.save(f'{save_path}/{filename}')
            except (ValueError, OSError) as exc:
                # As before, give up on the rest of this file, but report why
                # instead of swallowing every exception silently.
                # UnicodeDecodeError is a subclass of ValueError.
                print(f'{gnt_path}: record {count} failed ({exc!r}), stopping')
                break
            label_list.append(f'{gnt_path.stem}/{filename}\t{tag_code}')

# The output directory does not exist yet when no record was converted.
Path(save_dir).mkdir(parents=True, exist_ok=True)
write_txt(f'{save_dir}/gt.txt', label_list)
dgrl格式解析代码
import os
import struct
from pathlib import Path
import cv2 as cv
import numpy as np
from tqdm import tqdm
def _read_exact(f, count):
    """Read exactly ``count`` bytes from ``f`` as a uint8 array, or raise ValueError."""
    if count == 0:
        return np.empty(0, dtype='uint8')
    data = np.fromfile(f, dtype='uint8', count=count)
    if data.size != count:
        raise ValueError(
            f'unexpected end of file: wanted {count} bytes, got {data.size}')
    return data


def _read_uint_le(f, size):
    """Read ``size`` bytes from ``f`` and return them as a little-endian unsigned int."""
    # int.from_bytes works on Python ints. The previous `np.uint8 << n` sums
    # stay uint8 under NumPy 2 promotion rules, so the high bytes were lost.
    return int.from_bytes(_read_exact(f, size).tobytes(), 'little')


def read_from_dgrl(dgrl):
    """Split one DGRL file into per-line label text files and line images.

    For line index ``k`` the label goes to
    ``<dir>_label/<name>_<k>.txt`` and the bitmap to ``<dir>_images/<name>_<k>.jpg``,
    where ``<dir>`` is the directory containing ``dgrl`` and ``<name>`` is the
    file name with ``.dgrl`` removed. Progress details are printed to stdout.

    Args:
        dgrl: Path (``str`` or ``os.PathLike``) of the DGRL file.

    Returns:
        None. If ``dgrl`` does not exist, a message is printed and nothing is
        written.

    Raises:
        ValueError: If the file ends before a field or bitmap is complete, or
            the header size field is too small to hold the code-length field.
    """
    if not os.path.exists(dgrl):
        print('DGRL not exis!')
        return

    dir_name, base_name = os.path.split(dgrl)
    label_dir = dir_name+'_label'
    image_dir = dir_name+'_images'
    os.makedirs(label_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    with open(dgrl, 'rb') as f:
        # The first 4 bytes give the total header size, including themselves.
        header_size = _read_uint_le(f, 4)
        if header_size < 8:
            # A smaller value would make the count below negative (np.fromfile
            # then reads the whole file) or leave no room for the fields used.
            raise ValueError(f'invalid DGRL header size: {header_size}')
        header = _read_exact(f, header_size - 4)
        # Bytes per character code are stored 4..2 bytes before the header end.
        code_length = int.from_bytes(header[-4:-2].tobytes(), 'little')

        # Page record: height, width and number of text lines (4 bytes each).
        height = _read_uint_le(f, 4)
        width = _read_uint_le(f, 4)
        line_num = _read_uint_le(f, 4)
        print('图像尺寸:')
        print(height, width, line_num)

        for k in range(line_num):
            print(k+1)
            char_num = _read_uint_le(f, 4)
            print('字符数量:', char_num)

            codes = _read_exact(f, code_length * char_num).tobytes()
            # Decode each fixed-width code on its own. The zero padding mirrors
            # the old 4-byte pack, so an undecodable code still yields '\x00'
            # (dropped below) instead of an empty string.
            label = [
                codes[i * code_length:(i + 1) * code_length]
                .ljust(4, b'\x00')
                .decode('gbk', 'ignore')[0]
                for i in range(char_num)
            ]
            print('合并前:', label)
            label = ''.join(label).replace('\x00', '')
            print('合并后:', label)

            # Line record: two position fields (unused here), then height and width.
            _top = _read_uint_le(f, 4)
            _left = _read_uint_le(f, 4)
            h = _read_uint_le(f, 4)
            w = _read_uint_le(f, 4)
            bitmap = _read_exact(f, h * w).reshape(h, w)

            label_file = os.path.join(
                label_dir, base_name.replace('.dgrl', '_'+str(k)+'.txt'))
            # Explicit UTF-8 so Chinese labels are written identically on every
            # platform instead of depending on the locale's default encoding.
            with open(label_file, 'w', encoding='utf-8') as f1:
                f1.write(label)

            bitmap_file = os.path.join(
                image_dir, base_name.replace('.dgrl', '_'+str(k)+'.jpg'))
            # cv.imwrite signals failure through its return value, not an exception.
            if not cv.imwrite(bitmap_file, bitmap):
                print(f'failed to write image: {bitmap_file}')
if __name__ == '__main__':
    # Convert every entry of the HWDB2.2 training folder; the list is
    # materialised first so tqdm knows the total and can show progress.
    dgrl_paths = list(Path('dataset/HandWriteDBRecog/HWDB2.2Train').iterdir())
    for dgrl_path in tqdm(dgrl_paths):
        read_from_dgrl(dgrl_path)
相关资料