深度学习文本预处理
从 .txt 标注文件中读取内容,并按特定格式输出到新的 .txt 文件
首先,我从网上下载了 ch4_training_images(训练数据集图片)和 ch4_training_localization_transcription_gt 2(标注,即 Label)两个文件夹,其中 Label 文件的内容如下:
下面为具体代码内容:
import os
import numpy as np
import json
gt_dir = "/Users/yi/Desktop/icdar" # 文件夹目录
with open("/Users/yi/Desktop/ch4_training_images/label.txt", "w") as fo:
for filename in os.listdir(gt_dir): # 遍历文件夹
gt_path = os.path.join(gt_dir, filename)
if not os.path.isfile(gt_path):
continue
name, suffix = os.path.splitext(filename)
if suffix.lower() != '.txt': # 判断是否为.txt文件
continue
img_filename = name[3:] + '.jpg'
with open(gt_path, 'r', encoding='utf-8-sig') as fi:
annot_ppocr = []
for i, line in enumerate(fi):
arr = line.strip().split(',') # strip()为删除每行后面的换行符,然后再分割
if len(arr) < 9:
continue
print(i, line, arr)
pts = [int(x) for x in arr[:8]] # 获取前八个数
print(