将图片和其对应标签按比例划分成对应的train和val。
划分后的文件格式如下:
- images
  - train
  - val
- labels
  - train
  - val
替换对应路径和图片格式后缀及标签后缀即可。
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
def _collect_ids(folder, suffix):
    """Return the set of file names in *folder* ending with *suffix*, with the suffix removed."""
    return {
        name[:len(name) - len(suffix)]
        for name in os.listdir(folder)
        if name.endswith(suffix)
    }


def split_and_save_dataset(images_folder, labels_folder, output_folder, validation_ratio=0.2, random_seed=None,
                           *, image_ext='.jpg', label_ext='.txt'):
    """
    Split an image dataset into training and validation sets and copy the files.

    Only samples that have both an image file and a label file with the same
    base name are used. The files are copied (the sources are left in place) to:

        output_folder/images/train, output_folder/images/val
        output_folder/labels/train, output_folder/labels/val

    Parameters:
    - images_folder: path of the folder containing the image files
    - labels_folder: path of the folder containing the label files
    - output_folder: path of the folder that receives the split dataset
    - validation_ratio: passed to train_test_split as test_size, default 0.2
    - random_seed: optional seed, passed to train_test_split as random_state
    - image_ext: file name suffix of the image files, default '.jpg'
    - label_ext: file name suffix of the label files, default '.txt'

    Raises:
    - ValueError: if no image/label pair with a common base name is found
    """
    # Base names (file name without the suffix) identify a sample
    image_ids = _collect_ids(images_folder, image_ext)
    label_ids = _collect_ids(labels_folder, label_ext)

    # Keep only samples that have both an image and a label. Sorting matters:
    # set iteration order for strings can differ between interpreter runs
    # (hash randomization), which would make the split differ from run to run
    # even with a fixed random_seed.
    common_ids = sorted(image_ids & label_ids)
    if not common_ids:
        raise ValueError(
            f"No matching image/label pairs found in {images_folder!r} ({image_ext}) "
            f"and {labels_folder!r} ({label_ext})"
        )

    # Seed NumPy's global generator as well, as this function always has;
    # train_test_split itself receives the seed through random_state.
    if random_seed is not None:
        np.random.seed(random_seed)

    # Split the sample ids
    train_ids, val_ids = train_test_split(common_ids, test_size=validation_ratio, random_state=random_seed)

    # Create the output directory
    os.makedirs(output_folder, exist_ok=True)

    # Copy the images and labels of each split into its own directory
    for split_name, split_ids in (('train', train_ids), ('val', val_ids)):
        images_out = os.path.join(output_folder, 'images', split_name)
        labels_out = os.path.join(output_folder, 'labels', split_name)
        os.makedirs(images_out, exist_ok=True)
        os.makedirs(labels_out, exist_ok=True)

        for data_id in split_ids:
            image_name = f"{data_id}{image_ext}"
            label_name = f"{data_id}{label_ext}"
            shutil.copy(os.path.join(images_folder, image_name), os.path.join(images_out, image_name))
            shutil.copy(os.path.join(labels_folder, label_name), os.path.join(labels_out, label_name))
# Usage example
# Assumes there are two folders, one with images and one with labels;
# replace the placeholder paths below with real ones.
# Guarded so that importing this module does not trigger a dataset split.
if __name__ == "__main__":
    images_folder = "/path/to/images"
    labels_folder = "/path/to/labels"
    output_folder = "/path/to/output"
    split_and_save_dataset(images_folder, labels_folder, output_folder, validation_ratio=0.2, random_seed=42)