CASIA数据集格式转化代码

这篇博客介绍了如何使用Python解析汉字数据库中的GNT和DGRL文件,并进行转换。针对HWDB1.x和HWDB2.x数据集,提供了从GNT到PNG的转换代码,以及从DGRL到JPEG的转换过程。转换过程中涉及到文件头信息的读取、图像数据的解码以及标签的处理。此外,还展示了如何将标签转换为Unicode编码并保存为文本文件。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

HWDB1.x:脱机单字,1.0~1.2 三个版本,数据格式为 .gnt
OLHWDB1.x:联机单字,1.0~1.2 三个版本,数据格式为 .pot
HWDB2.x:脱机文本行,2.0~2.2 三个版本,数据格式为 .dgrl
OLHWDB2.x:联机文本行,2.0~2.2 三个版本,数据格式为 .wptt

gnt转png

import os
import numpy as np
import struct
from PIL import Image

# Location of the extracted ``.gnt`` folders. The commented-out lines are
# alternative locations kept from the original script.
#data_dir = '/home/malidong/workspace/PatternRecognition/Chinese_character_recognition-master'
data_dir = 'D:/File/datasets/Handwriting Databases/Character Sample Data/Gnt'
#train_data_dir = os.path.join(data_dir, 'HWDB1.1trn_gnt')
train_data_dir = os.path.join(data_dir, 'Gnt1.0Test')
##test_data_dir = os.path.join(data_dir, 'HWDB1.1tst_gnt')

def read_from_gnt_dir(gnt_dir=train_data_dir):
    """Yield ``(image, tagcode)`` for every sample in the ``.gnt`` files of a directory.

    Each sample starts with a 10-byte header: a 4-byte little-endian sample
    size, two tag bytes, then 2-byte little-endian width and height. The
    header is followed by ``width * height`` grayscale bytes.

    Args:
        gnt_dir: Directory to scan (not recursive). Only names ending in
            ``.gnt`` are read, in sorted order so that results are
            reproducible across platforms.

    Yields:
        Tuples ``(image, tagcode)`` where ``image`` is a writable ``uint8``
        array of shape ``(height, width)`` and ``tagcode`` is a Python int
        built from the two tag bytes in big-endian order, so that
        ``struct.pack('>H', tagcode)`` reproduces the bytes as stored.

    Reading of a file stops quietly at end of file, at a truncated
    header/image, or when the declared sample size does not equal
    ``10 + width * height``.
    """
    header_size = 10

    def one_file(f):
        while True:
            header = f.read(header_size)
            if len(header) < header_size:
                # Clean EOF or a truncated trailing header.
                break
            # Decode with struct on raw bytes: doing shifts/additions on
            # np.uint8 scalars overflows at 255 under NumPy >= 2.
            sample_size = struct.unpack_from('<I', header, 0)[0]
            tagcode = struct.unpack_from('>H', header, 4)[0]
            width, height = struct.unpack_from('<HH', header, 6)
            pixel_count = width * height
            if header_size + pixel_count != sample_size:
                break
            pixels = f.read(pixel_count)
            if len(pixels) < pixel_count:
                # Truncated image data: treat like any other corrupt record.
                break
            # frombuffer() is read-only; copy so callers get a writable array.
            image = np.frombuffer(pixels, dtype='uint8').reshape((height, width)).copy()
            yield image, tagcode

    for file_name in sorted(os.listdir(gnt_dir)):
        if file_name.endswith('.gnt'):
            file_path = os.path.join(gnt_dir, file_name)
            with open(file_path, 'rb') as f:
                for image, tagcode in one_file(f):
                    yield image, tagcode
# ---- Pass 1: collect every distinct label and assign it a class index ----
char_set = set()
for _, tagcode in read_from_gnt_dir(gnt_dir=train_data_dir):
    # Re-pack the tag code into its two bytes and decode them as GBK
    # (an earlier variant of this script used 'gb2312' here).
    #tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
    tagcode_unicode = struct.pack('>H', tagcode).decode('gbk')
    char_set.add(tagcode_unicode)
char_list = list(char_set)
# Sorting makes the character -> index mapping independent of set ordering.
char_dict = dict(zip(sorted(char_list), range(len(char_list))))
print(len(char_dict))
import pickle
# Persist the mapping so the numbered output folders can be mapped back to
# characters later; the context manager closes the file even on error.
with open('char_dict', 'wb') as f:
    pickle.dump(char_dict, f)
train_counter = 0
test_counter = 0  # only referenced by the disabled test-split code below
# Root folder for the generated PNGs (one sub-folder per class index).
#png_root = '/home/malidong/workspace/PatternRecognition/Chinese_character_recognition-master/data/train/'
png_root = 'D:/File/datasets/Handwriting Databases/Character Sample Data/png/Gnt1.0Test_png/'
# ---- Pass 2: write every sample as <png_root>/<class index>/<n>.png ----
for image, tagcode in read_from_gnt_dir(gnt_dir=train_data_dir):
    tagcode_unicode = struct.pack('>H', tagcode).decode('gbk')
    im = Image.fromarray(image)
    dir_name = png_root + '%0.5d'%char_dict[tagcode_unicode]
    # makedirs also creates a missing output root and tolerates an existing
    # class folder, where os.mkdir would raise.
    os.makedirs(dir_name, exist_ok=True)
    im.convert('RGB').save(os.path.join(dir_name, str(train_counter) + '.png'))
    train_counter += 1

# Disabled variant for a separate test split, kept as an inert string literal.
"""
for image, tagcode in read_from_gnt_dir(gnt_dir=test_data_dir):
    tagcode_unicode = struct.pack('>H', tagcode).decode('gb2312')
    im = Image.fromarray(image)
    dir_name = '/home/malidong/workspace/PatternRecognition/Chinese_character_recognition-master/data/test/' + '%0.5d'%char_dict[tagcode_unicode]
    if not os.path.exists(os.path.join(dir_name)):
        os.mkdir(os.path.join(dir_name))
    im.convert('RGB').save(os.path.join(dir_name)+'/' + str(test_counter) + '.png')
    test_counter += 1
"""

dgrl转jpg

import os
import struct
from pathlib import Path

import cv2 as cv
import numpy as np
from tqdm import tqdm


def _read_exact(f, count):
    """Read exactly ``count`` bytes from ``f``; raise ``ValueError`` on a short read."""
    if count < 0:
        raise ValueError('Corrupt DGRL file: negative field size %d' % count)
    data = f.read(count)
    if len(data) != count:
        raise ValueError('Unexpected end of DGRL file: wanted %d bytes, got %d'
                         % (count, len(data)))
    return data


def _read_uint_le(f, count):
    """Read a ``count``-byte little-endian unsigned integer from ``f``."""
    return int.from_bytes(_read_exact(f, count), 'little')


def _decode_gbk_code(raw):
    """Return the first character obtained by GBK-decoding one label code."""
    # Zero-padding to four bytes gives the decoder the same input the
    # previous struct.pack('I', code) produced on little-endian machines,
    # without depending on the host byte order.
    return raw.ljust(4, b'\x00').decode('gbk', 'ignore')[:1]


def read_from_dgrl(dgrl, label_encoding=None):
    """Split one ``.dgrl`` page file into per-line label and image files.

    For a file ``<dir>/<name>.dgrl`` the k-th text line (0-based) is written
    to ``<dir>_label/<name>_<k>.txt`` and ``<dir>_images/<name>_<k>.jpg``;
    both output directories are created when missing. Progress information
    is printed for every line.

    All multi-byte integers in the file are read as little-endian. The
    per-character code length is taken from the two bytes that precede the
    last two bytes of the file header.

    Args:
        dgrl: Path (``str`` or ``os.PathLike``) of the ``.dgrl`` file.
        label_encoding: Text encoding for the label files. ``None`` (the
            default) keeps the platform's default encoding, as before.

    Returns:
        None. When ``dgrl`` does not exist a message is printed and nothing
        is written.

    Raises:
        ValueError: If the file is truncated or its header is inconsistent.
    """
    if not os.path.exists(dgrl):
        print('DGRL not exis!')
        return

    dir_name, base_name = os.path.split(dgrl)
    # splitext() instead of str.replace('.dgrl', ...): a differently-cased
    # extension would otherwise leave the name unchanged and make every line
    # overwrite the same output file.
    stem = os.path.splitext(base_name)[0]
    label_dir = dir_name+'_label'
    image_dir = dir_name+'_images'
    os.makedirs(label_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    with open(dgrl, 'rb') as f:
        # Header size (this count includes the four size bytes themselves).
        header_size = _read_uint_le(f, 4)
        if header_size < 8:
            raise ValueError('Corrupt DGRL file: header size %d is too small'
                             % header_size)

        # Rest of the header; only the code length is needed from it.
        # Plain-int decoding avoids uint8 overflow under NumPy >= 2.
        header = _read_exact(f, header_size - 4)
        code_length = int.from_bytes(header[-4:-2], 'little')
        if code_length < 1:
            raise ValueError('Corrupt DGRL file: code length is %d' % code_length)

        # Page record: image height, width and number of text lines.
        height = _read_uint_le(f, 4)
        width = _read_uint_le(f, 4)
        line_num = _read_uint_le(f, 4)
        print('图像尺寸:')
        print(height, width, line_num)

        # One record per text line.
        for k in range(line_num):
            print(k+1)

            # Number of characters in this line.
            char_num = _read_uint_le(f, 4)
            print('字符数量:', char_num)

            # Label: char_num codes of code_length bytes each.
            raw_label = _read_exact(f, code_length*char_num)
            label = [_decode_gbk_code(raw_label[i:i+code_length])
                     for i in range(0, len(raw_label), code_length)]
            print('合并前:', label)
            # Drop NUL characters (padding / undecodable codes); left in,
            # they make the saved text appear blank in viewers.
            label = ''.join(label).replace('\x00', '')
            print('合并后:', label)

            # Position and size of the line: top, left, height, width.
            # Only the size is needed to read the bitmap.
            _top, _left, h, w = struct.unpack('<4I', _read_exact(f, 16))

            # Grayscale bitmap of the line, one byte per pixel.
            bitmap = np.frombuffer(_read_exact(f, h*w), dtype='uint8')
            bitmap = bitmap.reshape(h, w).copy()

            # Save label text and line image.
            label_file = os.path.join(label_dir, stem+'_'+str(k)+'.txt')
            with open(label_file, 'w', encoding=label_encoding) as f1:
                f1.write(label)
            bitmap_file = os.path.join(image_dir, stem+'_'+str(k)+'.jpg')
            cv.imwrite(bitmap_file, bitmap)


if __name__ == '__main__':
    # iterdir() also yields sub-directories and unrelated files; hand only
    # regular ``.dgrl`` files to the parser, in a reproducible order.
    dgrl_paths = sorted(
        p for p in Path('E:/CASIA/HWDB2.1Train').iterdir()
        if p.is_file() and p.suffix.lower() == '.dgrl'
    )
    for dgrl_path in tqdm(dgrl_paths):
        read_from_dgrl(dgrl_path)

参考:

CASIA-HWDB2.x 数据集DGRL文件解析(python)_dagongji10的博客-CSDN博客

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值