CASIA手写体数据集HWDB1.0 gnt和dgrl格式解析

最新推荐文章于 2023-04-18 08:55:54 发布

Liekkas Kono

最新推荐文章于 2023-04-18 08:55:54 发布

阅读量3.1k

点赞数 15

分类专栏： Python 文章标签： python 手写体 Gnt格式

本文链接：https://blog.csdn.net/shiwanghualuo/article/details/119648829

版权

Python 专栏收录该内容

25 篇文章 1 订阅

订阅专栏

最近用到了CASIA手写体数据集，但HWDB1.0~1.2系列的存储格式为gnt。
虽说官网也给出了读取方式，但使用起来仍有一定门槛。
于是上网搜索解析gnt格式的python代码，找到了《CASIA中文手写体字库gnt文件格式解析(python)》这篇博客。
这篇博客的代码是2014年用python2写的，有些年久失修，正好顺手修复一下，改成了python3版本。

Gnt格式解析代码

# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @File: data_process.py
# @Author: SWHL
# @Contact: liekkaskono@163.com
import struct
from pathlib import Path

from PIL import Image


def write_txt(save_path: str, content: list, mode='w'):
    """
    Write the items of a list to a UTF-8 text file, one item per line.

    @param
    save_path: path of the text file to write
    content: list of strings; a newline is appended to every item
    mode: mode passed to open(), 'w' by default
    @return: None
    """
    with open(save_path, mode, encoding='utf-8') as out_file:
        out_file.writelines(item + '\n' for item in content)


path = 'raw_data'  # input directory; every entry is expected to be a .gnt file
save_dir = 'HWDB1'  # output root: images/<gnt stem>/<n>.png plus gt.txt

# Every GNT record starts with a 10-byte little-endian header: uint32 record
# size, 2-byte tag code, uint16 width, uint16 height. The header is followed
# by width * height one-byte grayscale pixels stored row by row.
gnt_header = struct.Struct('<I2sHH')

gnt_paths = list(Path(path).iterdir())

label_list = []
for gnt_path in gnt_paths:
    count = 0  # 1-based index of the current record, used as the image name
    print(gnt_path)
    image_dir = Path(save_dir) / 'images' / gnt_path.stem
    with open(str(gnt_path), 'rb') as f:
        image_dir.mkdir(parents=True, exist_ok=True)
        while True:
            header = f.read(gnt_header.size)
            if not header:
                # A binary read returns b'' (never '') at end of file. Testing
                # for it explicitly replaces the old bare try/except, whose
                # "unpack requires a buffer of 4 bytes" error was simply EOF.
                break
            if len(header) < gnt_header.size:
                print(f'{gnt_path}: truncated record header, stop parsing')
                break
            _record_size, tag_code, width, height = gnt_header.unpack(header)

            # Read the whole bitmap in one call instead of byte by byte.
            pixels = f.read(width * height)
            if len(pixels) < width * height:
                print(f'{gnt_path}: truncated bitmap, stop parsing')
                break
            count += 1

            if width == 0 or height == 0:
                print(f'{gnt_path}: record {count} has an empty bitmap, skipped')
                continue
            try:
                tag_code = tag_code.decode('gbk').strip('\x00')
            except UnicodeDecodeError:
                # The stream is still aligned on the next record, so only this
                # record has to be dropped.
                print(f'{gnt_path}: record {count} has an undecodable tag, skipped')
                continue

            # Grayscale bytes -> RGB image with R = G = B, as before.
            im = Image.frombytes('L', (width, height), pixels).convert('RGB')
            filename = str(count) + '.png'
            im.save(str(image_dir / filename))

            label_list.append(f'{gnt_path.stem}/{filename}\t{tag_code}')

# Make sure the output root exists even when no image was extracted.
Path(save_dir).mkdir(parents=True, exist_ok=True)
write_txt(f'{save_dir}/gt.txt', label_list)

dgrl格式解析代码

# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import os
import struct
from pathlib import Path

import cv2 as cv
import numpy as np
from tqdm import tqdm


def _read_exact(f, size):
    """Read exactly ``size`` bytes from ``f``; raise ValueError on a short read."""
    data = f.read(size)
    if len(data) != size:
        raise ValueError(
            f'truncated DGRL file: expected {size} bytes, got {len(data)}')
    return data


def _read_uint(f, size):
    """Read a ``size``-byte little-endian unsigned integer from ``f``."""
    return int.from_bytes(_read_exact(f, size), 'little')


def read_from_dgrl(dgrl):
    """
    Split one DGRL page file into per-line label files and line images.

    For a file ``<dir>/<name>.dgrl`` the k-th text line (0-based) is written to
    ``<dir>_label/<name>_<k>.txt`` (UTF-8 text) and ``<dir>_images/<name>_<k>.jpg``.
    Both output directories are created when missing.

    @param
    dgrl: path (str or os.PathLike) of the .dgrl file
    @return: None; a message is printed and nothing is written when the
             file does not exist
    @raise ValueError: when the file ends before a complete field was read
    """
    if not os.path.exists(dgrl):
        print('DGRL not exis!')
        return

    dir_name, base_name = os.path.split(dgrl)
    label_dir = dir_name + '_label'
    image_dir = dir_name + '_images'
    os.makedirs(label_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    # All integers are decoded with int.from_bytes rather than by shifting
    # numpy uint8 scalars: under NumPy 2 promotion rules ``uint8 << 8`` stays
    # uint8, which silently truncates every value above 255.
    with open(dgrl, 'rb') as f:
        # The first 4 bytes hold the size of the whole header (themselves included)
        header_size = _read_uint(f, 4)
        if header_size < 8:
            raise ValueError(f'invalid DGRL header size: {header_size}')

        # Rest of the header; code_length is the 2-byte field that starts
        # 4 bytes before the end of the header
        header = _read_exact(f, header_size - 4)
        code_length = int.from_bytes(header[-4:-2], 'little')

        # Page record: height, width and number of text lines
        height = _read_uint(f, 4)
        width = _read_uint(f, 4)
        line_num = _read_uint(f, 4)
        print('图像尺寸:')
        print(height, width, line_num)

        # Read the text lines one by one
        for k in range(line_num):
            print(k+1)

            # Number of characters in this line
            char_num = _read_uint(f, 4)
            print('字符数量:', char_num)

            # Label: char_num codes of code_length bytes each, decoded as GBK.
            # A code that yields no character is kept as '\x00' so that the
            # list has one entry per code.
            codes = _read_exact(f, code_length * char_num)
            label = []
            for i in range(char_num):
                decoded = codes[i*code_length:(i+1)*code_length].decode(
                    'gbk', 'ignore')
                label.append(decoded[0] if decoded else '\x00')
            print('合并前：', label)
            # Drop the invisible \x00 characters, otherwise the saved text
            # contains unreadable content
            label = ''.join(label).replace('\x00', '')
            print('合并后：', label)

            # Geometry of the line: two 4-byte position fields that are not
            # needed here (presumably top and left - confirm against the
            # format description), then height and width of the bitmap
            _read_exact(f, 8)
            h = _read_uint(f, 4)
            w = _read_uint(f, 4)

            # Bitmap of the line: h * w bytes, one byte per pixel. copy()
            # yields a writable array, like the one np.fromfile used to return.
            bitmap = np.frombuffer(
                _read_exact(f, h*w), dtype='uint8').reshape(h, w).copy()

            # Save label and image
            label_file = os.path.join(
                label_dir, base_name.replace('.dgrl', '_'+str(k)+'.txt'))
            # Explicit UTF-8 so the Chinese text does not depend on the
            # platform default encoding
            with open(label_file, 'w', encoding='utf-8') as f1:
                f1.write(label)
            bitmap_file = os.path.join(
                image_dir, base_name.replace('.dgrl', '_'+str(k)+'.jpg'))
            cv.imwrite(bitmap_file, bitmap)


if __name__ == '__main__':
    # Materialise the directory listing first so tqdm knows the total count
    dataset_dir = Path('dataset/HandWriteDBRecog/HWDB2.2Train')
    for dgrl_path in tqdm(list(dataset_dir.iterdir())):
        read_from_dgrl(dgrl_path)