PPOCRLabel自动生成filestate.txt,切分训练集和测试集

paddleocr自动生成一些坐标信息

import os
import json
# 生成fileState.txt文件
# dir=r"Z:\ocr_datasets\20240109_datasets\training"
# out_dir=r"Z:\ocr_datasets\20240109_datasets\training"
dir = r"E:\20240104\training"
out_dir = r"E:\20240104\training"

# Build one "<image path>\t1" record for every .jpg/.bmp file that sits
# directly inside `dir` (sub-folders are not scanned).
image_records = [
    os.path.join(dir, entry) + "\t" + "1" + "\n"
    for entry in os.listdir(dir)
    if entry.endswith((".jpg", ".bmp"))
]

# Append to fileState.txt; the file is left untouched (and is not created)
# when no image was found.
if image_records:
    with open(os.path.join(out_dir, "fileState.txt"), "a+") as state_file:
        state_file.writelines(image_records)



# 修改fileState.txt文件
# character=r"F:\Desktop\training20231220/fileState.txt"
# out_character=r"F:/Desktop/training20231220/fileState1.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\r\n")
#         print(line)
#         new_txt=line.replace(line.split("\\")[-2],"training")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)





#修改label文件
# character=r"F:\Desktop\training\Label.txt"
# out_character=r"G:\training_gamma\Label.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\n").strip("\r\n")
#         print(line)
#         # new_t=line.replace(line.split("/")[-2],"training_gamma")
#         # print(new_t)
#         new_txt = line.replace(line.split("/")[-1],"a_"+str(line.split("/")[-1]))
#         # new_txt = line.replace(line.split("/")[-1], str(line.split("/")[-1].split(".")[0])+"_a.bmp")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)



# character=R"D:\mypaddle_project\PaddleOCR-release-2.7\rec_gt_new.txt"
# out_character=R"D:\mypaddle_project\PaddleOCR-release-2.7\rec_gt_new1.txt"
# character_str=[]
# with open(character,"rb") as fin:
#     lines=fin.readlines()
#     for line in lines:
#         # line=line.decode("utf-8").strip("\n").strip("\r\n")
#         line = line.decode("utf-8").strip("\n").strip("\r\n")
#         print(line)
#         new_txt=line.replace(line.split("/")[-2],"crop_img")
#         print(new_txt)
#         with open(out_character, "a+") as fout:
#             fout.writelines(new_txt+"\r")
#         character_str.append(line)
#
# print(character_str)


# 生成txt(为每张图片生成同名的txt标注文件)
# out_dir=r"F:\Desktop\training_txt1"
# if not os.path.exists(out_dir):os.makedirs(out_dir)
# with open(r"F:\Desktop\training\Label.txt", "r", encoding="utf-8") as f:
#     lines = f.readlines()
#     for line in lines:
#         # line = line.decode("utf-8").strip("\n").strip("\r\n")
#         # print(line)
#         l0, l1 = line.split("\t")[0], line.split("\t")[1]
#         l00=l0.split("/")[-1].split(".")[0]+".txt"
#         with open(os.path.join(out_dir,l00),"w") as f1:
#             f1.write(l1)




# out_dir=r"F:\Desktop\training_txt1"
# with open(r"F:\Desktop\training\Label.txt", "r", encoding="utf-8") as f:
#     lines = f.readlines()
#     for line in lines:
#         l0, l1 = line.split("\t")[0], line.split("\t")[1]
#         lll1 = l1.replace('"',"'")
#         import ast
#         # 使用ast.literal_eval来转换字符串
#         converted_list = ast.literal_eval(lll1)
#         print(converted_list)
#
# with open(r"F:\Desktop\training_txt1\0001.txt","r") as f:
#     f1=f.read()




ppocrlabel切分训练集和测试集

# coding:utf8
import os
import shutil
import random
import argparse


# 删除划分的训练集、验证集、测试集文件夹,重新创建一个空的文件夹
def isCreateOrDeleteFolder(path, flag):
    """Reset the sub-folder ``flag`` of ``path`` to an empty directory.

    An existing ``path/flag`` tree is removed together with its contents,
    then the directory is created again.

    Args:
        path: Parent directory.
        flag: Name of the sub-folder to reset (e.g. "train", "val").

    Returns:
        The absolute path of the freshly created directory.
    """
    target = os.path.join(path, flag)
    if os.path.exists(target):
        # Throw away whatever a previous run left behind.
        shutil.rmtree(target)
    os.makedirs(target)
    return os.path.abspath(target)


def splitTrainVal(root, absTrainRootPath, absValRootPath, trainTxt, valTxt, flag):
    """Shuffle one label file and distribute its images into train/val sets.

    Reads the module-level ``args`` namespace (created in the ``__main__``
    block) for ``detLabelFileName``, ``recLabelFileName``,
    ``recImageDirName`` and ``trainValRatio``.

    Each non-blank line of the label file must be ``<image path>\\t<label>``.
    Only the base name of the image path is used; the image is looked up in
    ``root`` (``flag == "det"``) or in ``root/<recImageDirName>``
    (``flag == "rec"``), copied to the train or val folder, and a record
    ``<folder name><sep><image name>\\t<label>`` is written to the matching
    text stream. Records beyond the train + val share are dropped.

    Args:
        root: Directory holding the label file and the images.
        absTrainRootPath: Destination folder for training images.
        absValRootPath: Destination folder for validation images.
        trainTxt: Writable text stream receiving the training records.
        valTxt: Writable text stream receiving the validation records.
        flag: "det" or "rec".

    Raises:
        ValueError: If ``flag`` is neither "det" nor "rec", or a label line
            has no tab separator.
    """
    dataAbsPath = os.path.abspath(root)
    print("路径根目录:",dataAbsPath)
    if flag == "det":
        labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName)
        imageDir = dataAbsPath
        print("检测文件标签名:",labelFilePath)
    elif flag == "rec":
        labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName)
        imageDir = os.path.join(dataAbsPath, args.recImageDirName)
        print("识别文件标签名:",labelFilePath)
    else:
        raise ValueError("flag must be 'det' or 'rec', got {!r}".format(flag))

    # Strip line endings here and re-add them on output: after shuffling, a
    # final line without a newline would otherwise be glued to the next record.
    with open(labelFilePath, "r", encoding="UTF-8") as labelFileRead:
        labelFileContent = [line.rstrip("\r\n") for line in labelFileRead if line.strip()]
    random.shuffle(labelFileContent)
    labelRecordLen = len(labelFileContent)

    # The ratio is loop-invariant. float() replaces eval() so that the
    # command-line value is parsed as a number and never executed as code.
    trainValRatio = args.trainValRatio.split(":")
    trainRatio = float(trainValRatio[0]) / 10
    valRatio = trainRatio + float(trainValRatio[1]) / 10

    for index, labelRecordInfo in enumerate(labelFileContent):
        imageRelativePath, separator, imageLabel = labelRecordInfo.partition("\t")
        if not separator:
            raise ValueError("malformed label record (no tab): {!r}".format(labelRecordInfo))
        imageName = os.path.basename(imageRelativePath)
        curRatio = index / labelRecordLen

        if curRatio < trainRatio:
            targetRoot, targetTxt = absTrainRootPath, trainTxt
        elif curRatio < valRatio:
            targetRoot, targetTxt = absValRootPath, valTxt
        else:
            # Outside the train + val share: not part of either set.
            continue

        shutil.copy(os.path.join(imageDir, imageName), os.path.join(targetRoot, imageName))
        # "<folder>\<image>" on Windows (as before); built with os.path so it
        # no longer fails on paths that contain no backslash.
        recordPath = os.path.join(os.path.basename(os.path.normpath(targetRoot)), imageName)
        targetTxt.write("{}\t{}\n".format(recordPath, imageLabel))


# 删掉存在的文件
def removeFile(path):
    """Delete the file at ``path``; do nothing when it does not exist."""
    if not os.path.exists(path):
        return
    os.remove(path)


def genDetRecTrainVal(args):
    """Rebuild the train/val folders and label files, then split the det set.

    The ``train`` and ``val`` folders under ``args.detRootPath`` and
    ``args.recRootPath`` are emptied and recreated, existing ``train.txt`` /
    ``val.txt`` files are removed, and the detection split is run on
    ``args.datasetRootPath``. The recognition split is currently disabled
    (kept below as a comment), so the recognition label files are created
    empty.

    Args:
        args: Parsed command-line namespace providing ``detRootPath``,
            ``recRootPath`` and ``datasetRootPath``.
    """
    detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train")
    detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val")
    recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train")
    recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val")

    for rootPath in (args.detRootPath, args.recRootPath):
        for txtName in ("train.txt", "val.txt"):
            removeFile(os.path.join(rootPath, txtName))

    # Context managers make sure every label file is flushed and closed,
    # including when the split raises part-way through.
    with open(os.path.join(args.detRootPath, "train.txt"), "a", encoding="UTF-8") as detTrainTxt, \
            open(os.path.join(args.detRootPath, "val.txt"), "a", encoding="UTF-8") as detValTxt, \
            open(os.path.join(args.recRootPath, "train.txt"), "a", encoding="UTF-8") as recTrainTxt, \
            open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8") as recValTxt:
        splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detTrainTxt, detValTxt, "det")
        print("*************************det结束******************")
        # Recognition split, disabled in this version of the script:
        # for root, dirs, files in os.walk(args.datasetRootPath):
        #     for dir in dirs:
        #         if dir == 'crop_img':
        #             splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recTrainTxt, recValTxt, "rec")
        #         else:
        #             continue
        #     break
        # print("*************************rec结束******************")


if __name__ == "__main__":
    # Purpose: split the detection and recognition data into training and
    # validation sets.
    # Note: adjust the paths and options to your own setup. Images are often
    # annotated in batches by several people, one PPOCRLabel folder per batch,
    # which creates the need to gather those annotated folders and split them
    # into training / validation / test sets.
    cliOptions = [
        ("--trainValRatio", "10:0", "ratio of trainset:valset"),
        ("--datasetRootPath", r"F:\Desktop\training_data\training20231212",
         "path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."),
        ("--detRootPath", r"F:\Desktop\training_data\training20231212\det",
         "the path where the divided detection dataset is placed"),
        ("--recRootPath", r"F:\Desktop\training_data\training20231212\rec",
         "the path where the divided recognition dataset is placed"),
        ("--detLabelFileName", "Label.txt",
         "the name of the detection annotation file"),
        ("--recLabelFileName", "rec_gt.txt",
         "the name of the recognition annotation file"),
        ("--recImageDirName", "crop_img",
         "the name of the folder where the cropped recognition dataset is located"),
    ]

    parser = argparse.ArgumentParser()
    for optionName, optionDefault, optionHelp in cliOptions:
        parser.add_argument(optionName, type=str, default=optionDefault, help=optionHelp)

    # `args` has to stay a module-level name: splitTrainVal reads it as a global.
    args = parser.parse_args()
    genDetRecTrainVal(args)

ppocrlabel切分训练集验证集和测试集

# coding:utf8
import os
import shutil
import random
import argparse


# 删除划分的训练集、验证集、测试集文件夹,重新创建一个空的文件夹
def isCreateOrDeleteFolder(path, flag):
    """Return the absolute path of ``path/flag`` after emptying it.

    If ``path/flag`` already exists it is deleted recursively; the directory
    is then (re)created, so the caller always gets an empty folder.

    Args:
        path: Parent directory.
        flag: Sub-folder name such as "train", "val" or "test".

    Returns:
        Absolute path of the recreated directory.
    """
    folder = os.path.join(path, flag)
    already_there = os.path.exists(folder)
    if already_there:
        shutil.rmtree(folder)
    os.makedirs(folder)
    return os.path.abspath(folder)


def splitTrainVal(root, abs_train_root_path, abs_val_root_path, abs_test_root_path, train_txt, val_txt, test_txt, flag):
    """Shuffle one label file and distribute its images into train/val/test.

    Reads the module-level ``args`` namespace (created in the ``__main__``
    block) for ``detLabelFileName``, ``recLabelFileName``,
    ``recImageDirName`` and ``trainValTestRatio``.

    Each non-blank line of the label file must be ``<image path>\\t<label>``.
    Only the base name of the image path is used; the image is looked up in
    ``root`` (``flag == "det"``) or in ``root/<recImageDirName>``
    (``flag == "rec"``), copied into the train, val or test folder, and a
    record ``<folder name>/<image name>\\t<label>`` is written to the matching
    text stream. Everything after the train and val shares goes to test.

    Args:
        root: Directory holding the label file and the images.
        abs_train_root_path: Destination folder for training images.
        abs_val_root_path: Destination folder for validation images.
        abs_test_root_path: Destination folder for test images.
        train_txt: Writable text stream receiving the training records.
        val_txt: Writable text stream receiving the validation records.
        test_txt: Writable text stream receiving the test records.
        flag: "det" or "rec".

    Raises:
        ValueError: If ``flag`` is neither "det" nor "rec", or a label line
            has no tab separator.
    """
    data_abs_path = os.path.abspath(root)
    if flag == "det":
        label_file_name = args.detLabelFileName
        image_dir = data_abs_path
    elif flag == "rec":
        label_file_name = args.recLabelFileName
        image_dir = os.path.join(data_abs_path, args.recImageDirName)
    else:
        raise ValueError("flag must be 'det' or 'rec', got {!r}".format(flag))

    # Strip line endings here and re-add them on output: after shuffling, a
    # final line without a newline would otherwise be glued to the next record.
    with open(os.path.join(data_abs_path, label_file_name), "r", encoding="UTF-8") as label_file:
        label_file_content = [line.rstrip("\r\n") for line in label_file if line.strip()]
    random.shuffle(label_file_content)
    label_record_len = len(label_file_content)

    # The ratio is loop-invariant. float() replaces eval() so that the
    # command-line value is parsed as a number and never executed as code.
    train_val_test_ratio = args.trainValTestRatio.split(":")
    train_ratio = float(train_val_test_ratio[0]) / 10
    val_ratio = train_ratio + float(train_val_test_ratio[1]) / 10

    for index, label_record_info in enumerate(label_file_content):
        image_relative_path, separator, image_label = label_record_info.partition("\t")
        if not separator:
            raise ValueError("malformed label record (no tab): {!r}".format(label_record_info))
        image_name = os.path.basename(image_relative_path)
        cur_ratio = index / label_record_len

        if cur_ratio < train_ratio:
            target_root, target_txt = abs_train_root_path, train_txt
        elif cur_ratio < val_ratio:
            target_root, target_txt = abs_val_root_path, val_txt
        else:
            target_root, target_txt = abs_test_root_path, test_txt

        shutil.copy(os.path.join(image_dir, image_name), os.path.join(target_root, image_name))
        # Take the folder name via os.path instead of splitting on "/", which
        # fails on absolute paths that use backslashes; the record itself
        # keeps the "/" separator.
        folder_name = os.path.basename(os.path.normpath(target_root))
        target_txt.write("{}/{}\t{}\n".format(folder_name, image_name, image_label))


# 删掉存在的文件
def removeFile(path):
    """Remove ``path`` when it exists; a missing path is silently ignored."""
    path_exists = os.path.exists(path)
    if path_exists:
        os.remove(path)


def genDetRecTrainVal(args):
    """Rebuild the train/val/test folders and label files, then run the splits.

    The ``train``, ``val`` and ``test`` folders under ``args.detRootPath`` and
    ``args.recRootPath`` are emptied and recreated and existing
    ``train.txt`` / ``val.txt`` / ``test.txt`` files are removed. The
    detection split always runs on ``args.datasetRootPath``; the recognition
    split runs only when the cropped-image folder exists directly inside
    ``args.datasetRootPath``.

    Args:
        args: Parsed command-line namespace providing ``detRootPath``,
            ``recRootPath``, ``datasetRootPath`` and ``recImageDirName``.
    """
    detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train")
    detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val")
    detAbsTestRootPath = isCreateOrDeleteFolder(args.detRootPath, "test")
    recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train")
    recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val")
    recAbsTestRootPath = isCreateOrDeleteFolder(args.recRootPath, "test")

    for rootPath in (args.detRootPath, args.recRootPath):
        for txtName in ("train.txt", "val.txt", "test.txt"):
            removeFile(os.path.join(rootPath, txtName))

    # Context managers make sure every label file is flushed and closed,
    # including when a split raises part-way through.
    with open(os.path.join(args.detRootPath, "train.txt"), "a", encoding="UTF-8") as detTrainTxt, \
            open(os.path.join(args.detRootPath, "val.txt"), "a", encoding="UTF-8") as detValTxt, \
            open(os.path.join(args.detRootPath, "test.txt"), "a", encoding="UTF-8") as detTestTxt, \
            open(os.path.join(args.recRootPath, "train.txt"), "a", encoding="UTF-8") as recTrainTxt, \
            open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8") as recValTxt, \
            open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8") as recTestTxt:
        splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath,
                      detTrainTxt, detValTxt, detTestTxt, "det")

        # Only the top level of the dataset root is inspected. The folder name
        # comes from args.recImageDirName so it matches the folder that
        # splitTrainVal reads the cropped images from.
        recImageDir = os.path.join(args.datasetRootPath, args.recImageDirName)
        if os.path.isdir(recImageDir):
            splitTrainVal(args.datasetRootPath, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath,
                          recTrainTxt, recValTxt, recTestTxt, "rec")



if __name__ == "__main__":
    # Purpose: split the detection and recognition data into training,
    # validation and test sets.
    # Note: adjust the paths and options to your own setup. Images are often
    # annotated in batches by several people, one PPOCRLabel folder per batch,
    # which creates the need to gather those annotated folders and split them
    # into training / validation / test sets.
    cli_options = [
        ("--trainValTestRatio", "6:2:2", "ratio of trainset:valset:testset"),
        ("--datasetRootPath", "../train_data/",
         "path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..."),
        ("--detRootPath", "../train_data/det",
         "the path where the divided detection dataset is placed"),
        ("--recRootPath", "../train_data/rec",
         "the path where the divided recognition dataset is placed"),
        ("--detLabelFileName", "Label.txt",
         "the name of the detection annotation file"),
        ("--recLabelFileName", "rec_gt.txt",
         "the name of the recognition annotation file"),
        ("--recImageDirName", "crop_img",
         "the name of the folder where the cropped recognition dataset is located"),
    ]

    parser = argparse.ArgumentParser()
    for option_name, option_default, option_help in cli_options:
        parser.add_argument(option_name, type=str, default=option_default, help=option_help)

    # `args` has to stay a module-level name: splitTrainVal reads it as a global.
    args = parser.parse_args()
    genDetRecTrainVal(args)
  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小飞龙程序员

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值