import os
import json
# Generate the fileState.txt file.
# dir=r"Z:\ocr_datasets\20240109_datasets\training"
# out_dir=r"Z:\ocr_datasets\20240109_datasets\training"
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# it is this script's existing module-level setting.
dir = r"E:\20240104\training"
out_dir = r"E:\20240104\training"


def write_file_state(image_dir, state_dir):
    """Append one ``<image path><TAB>1`` line per image to ``fileState.txt``.

    Args:
        image_dir: Directory whose direct entries are scanned. Only names
            ending in ``.jpg`` or ``.bmp`` (case-sensitive) are listed.
        state_dir: Directory in which ``fileState.txt`` is appended to.

    Returns:
        The number of lines appended. When no image is found, nothing is
        written and the state file is not created.

    Raises:
        OSError: If ``image_dir`` cannot be listed or the state file cannot
            be opened.
    """
    image_paths = [
        os.path.join(image_dir, name)
        for name in os.listdir(image_dir)
        if name.endswith((".jpg", ".bmp"))
    ]
    if not image_paths:
        return 0
    # Open the state file once instead of re-opening it for every image.
    # UTF-8 matches the encoding used for the label files elsewhere in this
    # file, so non-ASCII image paths are written consistently.
    state_path = os.path.join(state_dir, "fileState.txt")
    with open(state_path, "a+", encoding="utf-8") as state_file:
        state_file.writelines(
            "{}\t1\n".format(image_path) for image_path in image_paths
        )
    return len(image_paths)


if __name__ == "__main__":
    write_file_state(dir, out_dir)

# ---------------------------------------------------------------------------
# Disabled scratch snippets, kept for reference (all commented out).
# ---------------------------------------------------------------------------
# Modify the fileState.txt file
# character=r"F:\Desktop\training20231220/fileState.txt"
# out_character=r"F:/Desktop/training20231220/fileState1.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\r\n")
#         print(line)
#         new_txt=line.replace(line.split("\\")[-2],"training")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)

# Modify the label file
# character=r"F:\Desktop\training\Label.txt"
# out_character=r"G:\training_gamma\Label.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\n").strip("\r\n")
#         print(line)
#         # new_t=line.replace(line.split("/")[-2],"training_gamma")
#         # print(new_t)
#         new_txt = line.replace(line.split("/")[-1],"a_"+str(line.split("/")[-1]))
#         # new_txt = line.replace(line.split("/")[-1], str(line.split("/")[-1].split(".")[0])+"_a.bmp")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)

# character=R"D:\mypaddle_project\PaddleOCR-release-2.7\rec_gt_new.txt"
# out_character=R"D:\mypaddle_project\PaddleOCR-release-2.7\rec_gt_new1.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\n").strip("\r\n")
#         print(line)
#         new_txt=line.replace(line.split("/")[-2],"crop_img")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)

# Generate txt files (shengchengtxt)
# out_dir=r"F:\Desktop\training_txt1"
# if not os.path.exists(out_dir):os.makedirs(out_dir)
# with open(r"F:\Desktop\training\Label.txt", "r", encoding="utf-8") as f:
#     lines = f.readlines()
#     for line in lines:
#         # line = line.decode("utf-8").strip("\n").strip("\r\n")
#         # print(line)
#         l0, l1 = line.split("\t")[0], line.split("\t")[1]
#         l00=l0.split("/")[-1].split(".")[0]+".txt"
#         with open(os.path.join(out_dir,l00),"w") as f1:
#             f1.write(l1)

# out_dir=r"F:\Desktop\training_txt1"
# with open(r"F:\Desktop\training\Label.txt", "r", encoding="utf-8") as f:
#     lines = f.readlines()
#     for line in lines:
#         l0, l1 = line.split("\t")[0], line.split("\t")[1]
#         lll1 = l1.replace('"',"'")
#         import ast
#         # Use ast.literal_eval to convert the string
#         # NOTE(review): `lll2` is never assigned above (only `lll1`) - confirm.
#         converted_list = ast.literal_eval(lll2)
#         print(converted_list)
#
# with open(r"F:\Desktop\training_txt1\0001.txt","r") as f:
#     f1=f.read()
# PPOCRLabel: split into training and validation sets (切分训练集和验证集)
# coding:utf8
import os
import shutil
import random
import argparse
# Delete an existing train/val/test split folder and recreate it empty.
def isCreateOrDeleteFolder(path, flag):
    """Return the absolute path of a freshly emptied ``path/flag`` directory.

    Any existing directory tree at ``path/flag`` is removed first, then the
    directory (and missing parents) is created again.

    Args:
        path: Parent directory.
        flag: Name of the sub-directory to reset, e.g. ``"train"``.

    Returns:
        The absolute path of the recreated directory.
    """
    flagPath = os.path.join(path, flag)
    if os.path.exists(flagPath):
        shutil.rmtree(flagPath)
    os.makedirs(flagPath)
    return os.path.abspath(flagPath)
def _resolve_args(args):
    """Return the options namespace to use for a split.

    Falls back to the module-level ``args`` (assigned by the ``__main__``
    section of this script) when no namespace is passed explicitly.
    """
    if args is None:
        args = globals().get("args")
    if args is None:
        raise ValueError(
            "no options available: pass args= or run the script entry point")
    return args


def _split_thresholds(ratioText):
    """Return the cumulative (train, val) cut-offs for a ``"a:b"`` ratio.

    Each part is expressed in tenths, so ``"6:2"`` yields ``(0.6, 0.8)``.
    """
    parts = ratioText.split(":")
    # float() replaces the original eval(): the ratio comes from the command
    # line, and arbitrary expressions must not be executed.
    trainRatio = float(parts[0]) / 10
    valRatio = trainRatio + float(parts[1]) / 10
    return trainRatio, valRatio


def _read_label_records(labelFilePath):
    """Return the records of a label file as ``(image path, label)`` pairs.

    Blank lines are skipped and line endings are stripped, so a final line
    without a newline is handled like any other record.
    """
    records = []
    with open(labelFilePath, "r", encoding="UTF-8") as labelFile:
        for line in labelFile:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            # Split on the first tab only so a label containing tabs is kept.
            imageRelativePath, sep, imageLabel = line.partition("\t")
            if not sep:
                raise ValueError(
                    "label record without a tab separator: {!r}".format(line))
            records.append((imageRelativePath, imageLabel))
    return records


def _label_path(destRootPath, imageName):
    """Return ``<split folder name><os.sep><image name>`` for a label file."""
    return os.path.basename(os.path.normpath(destRootPath)) + os.sep + imageName


def splitTrainVal(root, absTrainRootPath, absValRootPath, trainTxt, valTxt, flag, args=None):
    """Shuffle the label records under *root* and split them into train/val.

    Records are shuffled with ``random.shuffle``. The first
    ``trainValRatio[0] / 10`` share is copied to *absTrainRootPath*, the next
    ``trainValRatio[1] / 10`` share to *absValRootPath*; any remainder is
    skipped. For every copied image one ``<folder>/<image>\\t<label>`` line is
    written to the matching text stream.

    Args:
        root: Dataset directory holding the label file (and the images).
        absTrainRootPath: Destination directory for training images.
        absValRootPath: Destination directory for validation images.
        trainTxt: Writable text stream receiving the training label lines.
        valTxt: Writable text stream receiving the validation label lines.
        flag: ``"det"`` (images directly under *root*) or ``"rec"`` (images
            under ``root/<args.recImageDirName>``).
        args: Options namespace providing ``trainValRatio``,
            ``detLabelFileName``, ``recLabelFileName`` and
            ``recImageDirName``. Defaults to the module-level ``args``.

    Raises:
        ValueError: If *flag* is neither ``"det"`` nor ``"rec"``, if no
            options are available, or if a label record has no tab.
    """
    args = _resolve_args(args)
    # Split according to the preset ratio.
    dataAbsPath = os.path.abspath(root)
    print("路径根目录:", dataAbsPath)
    if flag == "det":
        labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName)
        print("检测文件标签名:", labelFilePath)
        imageDirPath = dataAbsPath
    elif flag == "rec":
        labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName)
        print("识别文件标签名:", labelFilePath)
        imageDirPath = os.path.join(dataAbsPath, args.recImageDirName)
    else:
        raise ValueError("flag must be 'det' or 'rec', got {!r}".format(flag))

    # The thresholds do not depend on the record, so compute them once.
    trainRatio, valRatio = _split_thresholds(args.trainValRatio)

    labelRecords = _read_label_records(labelFilePath)
    random.shuffle(labelRecords)
    labelRecordLen = len(labelRecords)

    for index, (imageRelativePath, imageLabel) in enumerate(labelRecords):
        imageName = os.path.basename(imageRelativePath)
        imagePath = os.path.join(imageDirPath, imageName)

        curRatio = index / labelRecordLen
        if curRatio < trainRatio:
            destRootPath, destTxt = absTrainRootPath, trainTxt
        elif curRatio < valRatio:
            destRootPath, destTxt = absValRootPath, valTxt
        else:
            # This script has no test split: the remainder is left out.
            continue

        shutil.copy(imagePath, os.path.join(destRootPath, imageName))
        # The newline is written explicitly so records never run together,
        # whatever line ending the source record had.
        destTxt.write("{}\t{}\n".format(_label_path(destRootPath, imageName), imageLabel))


# Delete the file if it exists.
def removeFile(path):
    """Remove *path* when it exists; do nothing otherwise."""
    if os.path.exists(path):
        os.remove(path)


def genDetRecTrainVal(args):
    """Reset the det/rec output folders and run the detection split.

    Recreates empty ``train`` and ``val`` folders under ``args.detRootPath``
    and ``args.recRootPath``, recreates empty ``train.txt`` / ``val.txt`` in
    both roots, then splits the detection data found in
    ``args.datasetRootPath``. The recognition split is currently disabled
    (see the commented-out section below), so the rec label files stay empty.

    Args:
        args: Parsed command-line options (see the ``__main__`` section).
    """
    # Local import keeps this block self-contained.
    from contextlib import ExitStack

    detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train")
    detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val")
    recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train")
    recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val")

    for rootPath in (args.detRootPath, args.recRootPath):
        for fileName in ("train.txt", "val.txt"):
            removeFile(os.path.join(rootPath, fileName))

    # ExitStack closes every label file, even if the split raises.
    with ExitStack() as stack:
        def openLabelFile(rootPath, fileName):
            return stack.enter_context(
                open(os.path.join(rootPath, fileName), "a", encoding="UTF-8"))

        detTrainTxt = openLabelFile(args.detRootPath, "train.txt")
        detValTxt = openLabelFile(args.detRootPath, "val.txt")
        recTrainTxt = openLabelFile(args.recRootPath, "train.txt")
        recValTxt = openLabelFile(args.recRootPath, "val.txt")

        splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath,
                      detTrainTxt, detValTxt, "det", args=args)
        print("*************************det结束******************")

        # for root, dirs, files in os.walk(args.datasetRootPath):
        #     for dir in dirs:
        #         if dir == 'crop_img':
        #             splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recTrainTxt, recValTxt, "rec")
        #         else:
        #             continue
        #     break
        # print("*************************rec结束******************")


if __name__ == "__main__":
    # Purpose: split the detection and recognition data into training and
    # validation sets.
    # Note: adjust the parameters to your own paths and needs. Image data is
    # often annotated in batches by several people, each batch in its own
    # folder labelled with PPOCRLabel, hence the need to gather several
    # annotated image folders and split them into training, validation and
    # test sets.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--trainValRatio",
        type=str,
        default="10:0",
        help="ratio of trainset:valset")
    parser.add_argument(
        "--datasetRootPath",
        type=str,
        default=r"F:\Desktop\training_data\training20231212",
        help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3...")
    parser.add_argument(
        "--detRootPath",
        type=str,
        default=r"F:\Desktop\training_data\training20231212\det",
        help="the path where the divided detection dataset is placed")
    parser.add_argument(
        "--recRootPath",
        type=str,
        default=r"F:\Desktop\training_data\training20231212\rec",
        help="the path where the divided recognition dataset is placed")
    parser.add_argument(
        "--detLabelFileName",
        type=str,
        default="Label.txt",
        help="the name of the detection annotation file")
    parser.add_argument(
        "--recLabelFileName",
        type=str,
        default="rec_gt.txt",
        help="the name of the recognition annotation file")
    parser.add_argument(
        "--recImageDirName",
        type=str,
        default="crop_img",
        help="the name of the folder where the cropped recognition dataset is located")
    args = parser.parse_args()
    genDetRecTrainVal(args)
# PPOCRLabel: split into training, validation and test sets (切分训练集、验证集和测试集)
# coding:utf8
import os
import shutil
import random
import argparse
# Delete an existing train/val/test split folder and recreate it empty.
def isCreateOrDeleteFolder(path, flag):
    """Return the absolute path of a freshly emptied ``path/flag`` directory.

    Any existing directory tree at ``path/flag`` is removed first, then the
    directory (and missing parents) is created again.

    Args:
        path: Parent directory.
        flag: Name of the sub-directory to reset, e.g. ``"train"``.

    Returns:
        The absolute path of the recreated directory.
    """
    flagPath = os.path.join(path, flag)
    if os.path.exists(flagPath):
        shutil.rmtree(flagPath)
    os.makedirs(flagPath)
    return os.path.abspath(flagPath)
def _resolve_args(args):
    """Return the options namespace to use for a split.

    Falls back to the module-level ``args`` (assigned by the ``__main__``
    section of this script) when no namespace is passed explicitly.
    """
    if args is None:
        args = globals().get("args")
    if args is None:
        raise ValueError(
            "no options available: pass args= or run the script entry point")
    return args


def _split_thresholds(ratioText):
    """Return the cumulative (train, val) cut-offs for an ``"a:b[:c]"`` ratio.

    Each part is expressed in tenths, so ``"6:2:2"`` yields ``(0.6, 0.8)``;
    only the first two parts are read.
    """
    parts = ratioText.split(":")
    # float() replaces the original eval(): the ratio comes from the command
    # line, and arbitrary expressions must not be executed.
    trainRatio = float(parts[0]) / 10
    valRatio = trainRatio + float(parts[1]) / 10
    return trainRatio, valRatio


def _read_label_records(labelFilePath):
    """Return the records of a label file as ``(image path, label)`` pairs.

    Blank lines are skipped and line endings are stripped, so a final line
    without a newline is handled like any other record.
    """
    records = []
    with open(labelFilePath, "r", encoding="UTF-8") as labelFile:
        for line in labelFile:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            # Split on the first tab only so a label containing tabs is kept.
            imageRelativePath, sep, imageLabel = line.partition("\t")
            if not sep:
                raise ValueError(
                    "label record without a tab separator: {!r}".format(line))
            records.append((imageRelativePath, imageLabel))
    return records


def _label_path(destRootPath, imageName):
    """Return ``<split folder name><os.sep><image name>`` for a label file."""
    return os.path.basename(os.path.normpath(destRootPath)) + os.sep + imageName


def splitTrainVal(root, abs_train_root_path, abs_val_root_path, abs_test_root_path,
                  train_txt, val_txt, test_txt, flag, args=None):
    """Shuffle the label records under *root* and split them three ways.

    Records are shuffled with ``random.shuffle``. The first
    ``trainValTestRatio[0] / 10`` share is copied to *abs_train_root_path*,
    the next ``trainValTestRatio[1] / 10`` share to *abs_val_root_path*, and
    everything after that to *abs_test_root_path*. For every copied image one
    ``<folder>/<image>\\t<label>`` line is written to the matching stream.

    Args:
        root: Dataset directory holding the label file (and the images).
        abs_train_root_path: Destination directory for training images.
        abs_val_root_path: Destination directory for validation images.
        abs_test_root_path: Destination directory for test images.
        train_txt: Writable text stream receiving the training label lines.
        val_txt: Writable text stream receiving the validation label lines.
        test_txt: Writable text stream receiving the test label lines.
        flag: ``"det"`` (images directly under *root*) or ``"rec"`` (images
            under ``root/<args.recImageDirName>``).
        args: Options namespace providing ``trainValTestRatio``,
            ``detLabelFileName``, ``recLabelFileName`` and
            ``recImageDirName``. Defaults to the module-level ``args``.

    Raises:
        ValueError: If *flag* is neither ``"det"`` nor ``"rec"``, if no
            options are available, or if a label record has no tab.
    """
    args = _resolve_args(args)
    data_abs_path = os.path.abspath(root)
    if flag == "det":
        label_file_name = args.detLabelFileName
        image_dir_path = data_abs_path
    elif flag == "rec":
        label_file_name = args.recLabelFileName
        image_dir_path = os.path.join(data_abs_path, args.recImageDirName)
    else:
        raise ValueError("flag must be 'det' or 'rec', got {!r}".format(flag))
    label_file_path = os.path.join(data_abs_path, label_file_name)

    # The thresholds do not depend on the record, so compute them once.
    train_ratio, val_ratio = _split_thresholds(args.trainValTestRatio)

    label_records = _read_label_records(label_file_path)
    random.shuffle(label_records)
    label_record_len = len(label_records)

    for index, (image_relative_path, image_label) in enumerate(label_records):
        image_name = os.path.basename(image_relative_path)
        image_path = os.path.join(image_dir_path, image_name)

        cur_ratio = index / label_record_len
        if cur_ratio < train_ratio:
            dest_root_path, dest_txt = abs_train_root_path, train_txt
        elif cur_ratio < val_ratio:
            dest_root_path, dest_txt = abs_val_root_path, val_txt
        else:
            dest_root_path, dest_txt = abs_test_root_path, test_txt

        shutil.copy(image_path, os.path.join(dest_root_path, image_name))
        # The newline is written explicitly so records never run together,
        # whatever line ending the source record had.
        dest_txt.write("{}\t{}\n".format(_label_path(dest_root_path, image_name), image_label))


# Delete the file if it exists.
def removeFile(path):
    """Remove *path* when it exists; do nothing otherwise."""
    if os.path.exists(path):
        os.remove(path)


def genDetRecTrainVal(args):
    """Reset the det/rec output folders and run both splits.

    Recreates empty ``train`` / ``val`` / ``test`` folders under
    ``args.detRootPath`` and ``args.recRootPath``, recreates empty
    ``train.txt`` / ``val.txt`` / ``test.txt`` in both roots, splits the
    detection data found in ``args.datasetRootPath`` and, when that directory
    directly contains the cropped-image folder, the recognition data too.

    Args:
        args: Parsed command-line options (see the ``__main__`` section).
    """
    # Local import keeps this block self-contained.
    from contextlib import ExitStack

    detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train")
    detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val")
    detAbsTestRootPath = isCreateOrDeleteFolder(args.detRootPath, "test")
    recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train")
    recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val")
    recAbsTestRootPath = isCreateOrDeleteFolder(args.recRootPath, "test")

    for rootPath in (args.detRootPath, args.recRootPath):
        for fileName in ("train.txt", "val.txt", "test.txt"):
            removeFile(os.path.join(rootPath, fileName))

    # ExitStack closes every label file, even if a split raises.
    with ExitStack() as stack:
        def openLabelFile(rootPath, fileName):
            return stack.enter_context(
                open(os.path.join(rootPath, fileName), "a", encoding="UTF-8"))

        detTrainTxt = openLabelFile(args.detRootPath, "train.txt")
        detValTxt = openLabelFile(args.detRootPath, "val.txt")
        detTestTxt = openLabelFile(args.detRootPath, "test.txt")
        recTrainTxt = openLabelFile(args.recRootPath, "train.txt")
        recValTxt = openLabelFile(args.recRootPath, "val.txt")
        recTestTxt = openLabelFile(args.recRootPath, "test.txt")

        splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath,
                      detAbsTestRootPath, detTrainTxt, detValTxt, detTestTxt,
                      "det", args=args)

        # Only the direct children of the dataset root are considered. The
        # folder name comes from --recImageDirName (default "crop_img") so it
        # matches the folder splitTrainVal reads the cropped images from.
        recImageDirPath = os.path.join(args.datasetRootPath, args.recImageDirName)
        if os.path.isdir(recImageDirPath):
            splitTrainVal(args.datasetRootPath, recAbsTrainRootPath, recAbsValRootPath,
                          recAbsTestRootPath, recTrainTxt, recValTxt, recTestTxt,
                          "rec", args=args)


if __name__ == "__main__":
    # Purpose: split the detection and recognition data into training,
    # validation and test sets.
    # Note: adjust the parameters to your own paths and needs. Image data is
    # often annotated in batches by several people, each batch in its own
    # folder labelled with PPOCRLabel, hence the need to gather several
    # annotated image folders and split them into training, validation and
    # test sets.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--trainValTestRatio",
        type=str,
        default="6:2:2",
        help="ratio of trainset:valset:testset")
    parser.add_argument(
        "--datasetRootPath",
        type=str,
        default="../train_data/",
        help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3...")
    parser.add_argument(
        "--detRootPath",
        type=str,
        default="../train_data/det",
        help="the path where the divided detection dataset is placed")
    parser.add_argument(
        "--recRootPath",
        type=str,
        default="../train_data/rec",
        help="the path where the divided recognition dataset is placed")
    parser.add_argument(
        "--detLabelFileName",
        type=str,
        default="Label.txt",
        help="the name of the detection annotation file")
    parser.add_argument(
        "--recLabelFileName",
        type=str,
        default="rec_gt.txt",
        help="the name of the recognition annotation file")
    parser.add_argument(
        "--recImageDirName",
        type=str,
        default="crop_img",
        help="the name of the folder where the cropped recognition dataset is located")
    args = parser.parse_args()
    genDetRecTrainVal(args)