import os
import random
from shutil import copy2
def split_dataset(dataset_dir, output_dir, split_ratio=(0.7, 0.3)):
    """Split a flat image/label dataset into train and val subsets.

    Images (``.jpg``/``.jpeg``/``.png``, case-insensitive) are listed from
    *dataset_dir*, shuffled, and copied into
    ``output_dir/{train,val}/{images,labels}``.  Each image's label is
    assumed to be a ``.txt`` file with the same stem in *dataset_dir*
    (YOLO-style layout); a missing label file is skipped rather than
    aborting the whole run.

    :param dataset_dir: directory containing the images and label files
    :param output_dir: destination root for the split dataset
    :param split_ratio: e.g. ``(0.7, 0.3)`` -> 70% train, 30% val; only the
        first element is used (the remainder goes to val)
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    # Create output_dir/{train,val}/{images,labels} up front.
    for subset in ('train', 'val'):
        for folder in ('images', 'labels'):
            os.makedirs(os.path.join(output_dir, subset, folder), exist_ok=True)

    # Select images by real extension (splitext avoids false positives such
    # as "datajpg" that a bare endswith('jpg') would match) and tolerate
    # uppercase extensions.
    image_files = [
        f for f in os.listdir(dataset_dir)
        if os.path.splitext(f)[1].lower() in image_exts
    ]

    # Shuffle the file list itself so the train/val assignment is random.
    # (Shuffling an index list but branching on the index *value* would make
    # the split deterministic by listing order.)
    random.shuffle(image_files)
    n_train = int(len(image_files) * split_ratio[0])

    train_count = 0
    val_count = 0
    for position, img_file in enumerate(image_files):
        # Derive the label name from the stem; chained .replace() calls can
        # corrupt names that contain an extension substring elsewhere.
        label_file = os.path.splitext(img_file)[0] + '.txt'
        if position < n_train:
            subset = 'train'
            train_count += 1
        else:
            subset = 'val'
            val_count += 1

        copy2(os.path.join(dataset_dir, img_file),
              os.path.join(output_dir, subset, 'images', img_file))

        # Copy the label only if it exists, so one missing annotation does
        # not abort the run and leave a half-populated output tree.
        src_label_path = os.path.join(dataset_dir, label_file)
        if os.path.isfile(src_label_path):
            copy2(src_label_path,
                  os.path.join(output_dir, subset, 'labels', label_file))

    print(f"数据集划分完成!训练集: {train_count} 张图片, 验证集: {val_count} 张图片")
# Example usage — guarded so importing this module does not trigger the
# copy (the original ran at import time with hard-coded paths).
if __name__ == "__main__":
    dataset_directory = r'E:\path'    # directory containing the raw dataset
    output_directory = r'E:\path1'    # destination for the split dataset
    split_dataset(dataset_directory, output_directory)