# Dataset layout (input):
#   data
#   |--- Images
#   |--- Labels
#
# Layout after splitting (output):
#   data
#   |--- train
#   |    |--- Images
#   |    |--- Labels
#   |--- val
#   |    |--- Images
#   |    |--- Labels
#   |--- test
#        |--- Images
#        |--- Labels
# Split the images and labels of the dataset into training, validation and test sets according to preset ratios.
import os
import random
import shutil
# Root folder of the dataset; every source and destination folder sits below it.
data_root = r'C:\Users\Administrator\Desktop\ght\Yolov7\data'

# Source folders. Each value ends with two backslash characters; the copy
# steps of this script build a file path as `folder + file_name`.
image_path = data_root + r'\Images' + '\\' * 2  # source image folder
label_path = data_root + r'\Labels' + '\\' * 2  # source label folder

# Destination folders: one Images/Labels pair per subset.
train_images = data_root + r'\train\Images'
train_labels = data_root + r'\train\Labels'
val_images = data_root + r'\val\Images'
val_labels = data_root + r'\val\Labels'
test_images = data_root + r'\test\Images'
test_labels = data_root + r'\test\Labels'
# Create every destination folder that does not exist yet.
# exist_ok=True turns the call into a no-op for folders that are already
# present, so no separate (race-prone) existence check is needed.
for folder in (train_images, train_labels, val_images, val_labels,
               test_images, test_labels):
    os.makedirs(folder, exist_ok=True)
# Preset ratios for the training, validation and test subsets.
train_rate, val_rate, test_rate = 0.6, 0.2, 0.2
# Build the training subset.
pathDir = os.listdir(image_path)  # names of all entries in the source image folder
print(pathDir)
print('数据集总共有图片:', len(pathDir))
print(
    '划分比例如下:训练集:{},验证集:{},测试集:{}'.format(int(len(pathDir) * train_rate), int(len(pathDir) * val_rate),
                                                      int(len(pathDir) * test_rate)))

# Number of training images, derived from the preset ratio (truncated to int).
picknumber = int(len(pathDir) * train_rate)
# Randomly pick that many image names without repetition.
train_sample = random.sample(pathDir, picknumber)

# Matching label names: swap only the file extension for '.txt'
# (use '.xml' here instead if the annotations are stored as XML).
# splitext touches just the extension, so names that contain "jpg" elsewhere
# and images with other extensions (.png, .jpeg, .JPG) map correctly.
train_sample_labels = [os.path.splitext(name)[0] + '.txt' for name in train_sample]

# Copy the training images and their labels.
for name in train_sample:
    shutil.copy(image_path + name, train_images + "\\" + name)
for name in train_sample_labels:
    shutil.copy(label_path + name, train_labels + "\\" + name)
# Collect the images and labels that were NOT picked for the training subset.
# Membership is tested against sets built once, which keeps this linear in the
# number of files instead of scanning a list for every file.
train_image_names = set(train_sample)
all_images = os.listdir(image_path)
remaining_image = [file for file in all_images if file not in train_image_names]

# Note: the remaining labels are taken from the label folder listing, not
# derived from the remaining images.
train_label_names = set(train_sample_labels)
all_labels = os.listdir(label_path)
remaining_label = [file for file in all_labels if file not in train_label_names]
# Build the validation subset from the images left after the training pick.
# The share is val_rate relative to (val_rate + test_rate), truncated to int.
picknumber2 = int(len(remaining_image) * val_rate / (val_rate + test_rate))
val_sample = random.sample(remaining_image, picknumber2)

# Matching label names: swap only the file extension for '.txt'
# (use '.xml' here instead if the annotations are stored as XML).
# splitext touches just the extension, so names that contain "jpg" elsewhere
# and images with other extensions (.png, .jpeg, .JPG) map correctly.
val_sample_labels = [os.path.splitext(name)[0] + '.txt' for name in val_sample]

# Copy the validation images and their labels.
for name in val_sample:
    shutil.copy(image_path + name, val_images + "\\" + name)
for name in val_sample_labels:
    shutil.copy(label_path + name, val_labels + "\\" + name)
# Whatever is left after removing the validation pick becomes the test subset.
# Sets are built once so each membership test is O(1) instead of a list scan.
val_image_names = set(val_sample)
rest_image = [file for file in remaining_image if file not in val_image_names]

val_label_names = set(val_sample_labels)
rest_label = [file for file in remaining_label if file not in val_label_names]
# Copy the test subset: all images first, then all labels.
for src_dir, dst_dir, names in (
    (image_path, test_images, rest_image),
    (label_path, test_labels, rest_label),
):
    for name in names:
        shutil.copy(src_dir + name, dst_dir + "\\" + name)