【划分训练集、验证集和测试集(xml版本)】

import os
import random
import shutil


# ------------------------ Global path variables ------------------------ #
# Project root: two directory levels above the current working directory,
# e.g. 'I:\\hive-master'.
master_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Dataset root, e.g. 'I:\\hive-master\\datasets'.
data_root = os.path.join(master_root, "datasets")
image_full_path = os.path.join(data_root, "hive", "images")
# e.g. 'I:\\hive-master\\datasets\\hive\\labels'
label_full_path = os.path.join(data_root, "hive", "labels")

# Directory that holds the split list files; created when it is missing.
ImageSets_path = os.path.join(data_root, "hive", "ImageSets")
if not os.path.exists(ImageSets_path):
    os.makedirs(ImageSets_path)

# Locations of the train / val / test list files inside ImageSets.
traintxt_path = os.path.join(ImageSets_path, "train.txt")
valtxt_path = os.path.join(ImageSets_path, "val.txt")
testtxt_path = os.path.join(ImageSets_path, "test.txt")

# Delete list files left over from an earlier run.
for _stale_txt in (traintxt_path, valtxt_path, testtxt_path):
    if os.path.exists(_stale_txt):
        os.remove(_stale_txt)
del _stale_txt  # keep the loop variable out of the module namespace
# ------------------------ Global path variables ------------------------ #

def create_imagesets_train_val(label_full_path, traintxt_full_path, valtxt_full_path,
                               train_percent=0.8):
    """Randomly split the label files into a train list and a val list.

    Every regular file directly inside ``label_full_path`` counts as one
    sample. The file name without its extension is written, one per line, to
    either ``traintxt_full_path`` or ``valtxt_full_path``. No test set is
    produced.

    Args:
        label_full_path: Directory containing the label files (.xml or .txt).
        traintxt_full_path: Output path of the train list; overwritten.
        valtxt_full_path: Output path of the val list; overwritten.
        train_percent: Fraction of samples put into the train list. The
            train count is ``int(num_samples * train_percent)``; all
            remaining samples go to the val list.

    Raises:
        ValueError: If ``train_percent`` is outside ``[0, 1]``.
    """
    if not 0 <= train_percent <= 1:
        raise ValueError("train_percent must be between 0 and 1")

    # Only regular files are samples. Sub-directories such as the "train" and
    # "val" folders that the copy step creates inside the label directory
    # must not end up in the lists when the script is run again.
    total_label = [
        entry for entry in os.listdir(label_full_path)
        if os.path.isfile(os.path.join(label_full_path, entry))
    ]

    num = len(total_label)
    num_train = int(num * train_percent)

    # A set gives O(1) membership tests in the write loop below.
    train_indices = set(random.sample(range(num), num_train))

    with open(traintxt_full_path, 'w') as ftrain, open(valtxt_full_path, 'w') as fval:
        for i, filename in enumerate(total_label):
            # splitext drops the extension whatever its length.
            name = os.path.splitext(filename)[0] + '\n'
            if i in train_indices:
                ftrain.write(name)
            else:
                fval.write(name)

def create_imagesets_train_val_test(xml_full_path, traintxt_full_path, valtxt_full_path,
                                    testtxt_full_path, train_percent=0.6, val_percent=0.2):
    """Randomly split the label files into train, val and test lists.

    Every regular file directly inside ``xml_full_path`` counts as one sample.
    The file name without its extension is written, one per line, to exactly
    one of the three list files.

    Args:
        xml_full_path: Directory containing the label files.
        traintxt_full_path: Output path of the train list; overwritten.
        valtxt_full_path: Output path of the val list; overwritten.
        testtxt_full_path: Output path of the test list; overwritten.
        train_percent: Fraction of samples for the train list; the count is
            ``int(num_samples * train_percent)``.
        val_percent: Fraction of samples for the val list; the count is
            ``int(num_samples * val_percent)``. Whatever is left after train
            and val goes to the test list.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    if train_percent < 0 or val_percent < 0:
        raise ValueError("train_percent and val_percent must be non-negative")
    # Small tolerance so float pairs that should sum to 1 are not rejected.
    if train_percent + val_percent > 1 + 1e-9:
        raise ValueError("train_percent + val_percent must not exceed 1")

    # Only regular files are samples; sub-directories (for example "train"
    # and "val" folders created inside the label directory by the copy step)
    # are skipped.
    total_xml = [
        entry for entry in os.listdir(xml_full_path)
        if os.path.isfile(os.path.join(xml_full_path, entry))
    ]

    num = len(total_xml)
    num_train = int(num * train_percent)
    num_val = int(num * val_percent)

    # Draw train first, then val from what is left; the rest is test.
    train_indices = set(random.sample(range(num), num_train))
    remaining = [i for i in range(num) if i not in train_indices]
    val_indices = set(random.sample(remaining, num_val))

    with open(traintxt_full_path, 'w') as ftrain, \
            open(valtxt_full_path, 'w') as fval, \
            open(testtxt_full_path, 'w') as ftest:
        for i, filename in enumerate(total_xml):
            # splitext drops the extension whatever its length.
            name = os.path.splitext(filename)[0] + '\n'
            if i in train_indices:
                ftrain.write(name)
            elif i in val_indices:
                fval.write(name)
            else:
                ftest.write(name)

def split_image_label_train_val(image_full_path, label_full_path, traintxt_path, valtxt_path):
    """Copy images and labels into train/val sub-directories per the list files.

    For each name listed in ``traintxt_path`` the files ``<name>.jpg`` (from
    ``image_full_path``) and ``<name>.txt`` (from ``label_full_path``) are
    copied into the ``train`` sub-directory of their own folder; names listed
    in ``valtxt_path`` are copied into the ``val`` sub-directories. The
    source files are left in place.

    Args:
        image_full_path: Directory containing the .jpg images.
        label_full_path: Directory containing the .txt label files.
        traintxt_path: List file with one train sample name per line.
        valtxt_path: List file with one val sample name per line.

    Raises:
        FileNotFoundError: If a list file, or an image/label named in one,
            does not exist.
    """
    splits = (("train", traintxt_path), ("val", valtxt_path))

    # Create all four destination directories before copying anything.
    for split_name, _ in splits:
        os.makedirs(os.path.join(image_full_path, split_name), exist_ok=True)
        os.makedirs(os.path.join(label_full_path, split_name), exist_ok=True)

    for split_name, list_path in splits:
        image_dst = os.path.join(image_full_path, split_name)
        label_dst = os.path.join(label_full_path, split_name)
        # The with-block closes the list file even if a copy fails.
        with open(list_path, 'r') as name_file:
            for line in name_file:
                sample_name = line.rstrip('\n')
                if not sample_name:
                    # A blank line would otherwise become ".jpg" / ".txt".
                    continue
                shutil.copy(os.path.join(image_full_path, sample_name + ".jpg"), image_dst)
                shutil.copy(os.path.join(label_full_path, sample_name + '.txt'), label_dst)


if __name__ == '__main__':
    # Step 1: write train.txt and val.txt (two-way train/val split).
    create_imagesets_train_val(label_full_path, traintxt_path, valtxt_path)
    # Three-way alternative (train/val/test), left disabled:
    # create_imagesets_train_val_test(label_full_path, traintxt_path, valtxt_path, testtxt_path)

    # Step 2: following train.txt and val.txt, copy every image and its txt
    # label file into the train and val sub-directories.
    split_image_label_train_val(image_full_path, label_full_path, traintxt_path, valtxt_path)



在使用 YOLOv5 进行目标检测任务时,划分训练集、验证集和测试集是一个重要的步骤。划分数据集的目的是让模型在训练时有足够的数据进行优化,同时用验证集来评估模型性能,测试集则用于最终的模型评估。可以通过以下步骤来划分数据集: 1. 获取自己的数据集,并对数据集进行标注。标注可以使用 VOC(xml 格式)进行保存。 2. 使用提供的代码将标注文件从 xml 格式转换为 txt 格式,这可以方便后续的处理和训练。代码可以参考引用中提供的代码。 3. 根据需求,确定训练集、验证集和测试集的比例。一般来说,常见的划分比例是 70% 的数据用于训练,10% 用于验证,20% 用于测试(上面的脚本默认按 8:2 划分训练集和验证集,三分时按 6:2:2 划分)。 4. 使用代码将数据集按照比例划分为训练集、验证集和测试集。可以参考引用中提供的代码来实现。 5. 在 YOLOv5 的配置文件中,根据划分后的数据集进行相应的配置。具体可以参考引用中提到的 yaml 文件的修改方法。 划分数据集后,可以使用划分好的训练集进行模型的训练,使用验证集进行模型的调参和性能评估,最后使用测试集进行最终的模型评估。这样可以确保模型在不同的数据集上有较好的泛化能力和性能。 希望以上信息对您有所帮助。

引用:
- [1][3] [YOLOv5将自己数据集划分训练集验证集测试集](https://blog.csdn.net/qq_52763448/article/details/126622825)
- [2] [python脚本,划分训练集测试集,coco、voc格式的数据转换成yolo系列数据](https://download.csdn.net/download/qq122716072/85812629)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值