参考 PASCAL VOC数据集训练集、验证集、测试集的划分和提取 - 云+社区 - 腾讯云
1、训练集、验证集、测试集按比例精确划分
#数据集划分
import os
import random
# Split a VOC-style dataset into trainval / train / val / test lists.
# One text file per split is written under ImageSets/Main; each line holds an
# annotation file name with its extension removed.
root_dir = './park_voc/VOC2007/'

# trainval_percent is the share of ALL samples assigned to trainval (the rest
# become test). train_percent is the share of TRAINVAL assigned to train, so
# with the values below the effective split is 0.56 train / 0.24 val /
# 0.20 test. For an overall 0.7 / 0.1 / 0.2 split use train_percent = 0.875.
trainval_percent = 0.8
train_percent = 0.7
xmlfilepath = root_dir + 'Annotations'
txtsavepath = root_dir + 'ImageSets/Main'
total_xml = os.listdir(xmlfilepath)

num = len(total_xml)  # e.g. 100
tv = int(num * trainval_percent)  # e.g. 80 when num is 100
tr = int(tv * train_percent)  # e.g. 56 when tv is 80

# Sample indices rather than names so each output file keeps listing order.
# The index range is deliberately NOT bound to the name `list`: doing so
# shadows the builtin and breaks any later `list(...)` call in the same run.
indices = range(num)
trainval = random.sample(indices, tv)
train = random.sample(trainval, tr)

# Sets give O(1) membership tests inside the loop below.
trainval_set = set(trainval)
train_set = set(train)

# Create the output directory up front so open() does not fail on a fresh
# dataset tree.
if not os.path.isdir(txtsavepath):
    os.makedirs(txtsavepath)

# `with` closes all four files even if a write raises part-way through.
with open(os.path.join(txtsavepath, 'trainval.txt'), 'w') as ftrainval, \
        open(os.path.join(txtsavepath, 'test.txt'), 'w') as ftest, \
        open(os.path.join(txtsavepath, 'train.txt'), 'w') as ftrain, \
        open(os.path.join(txtsavepath, 'val.txt'), 'w') as fval:
    for i in indices:
        # splitext drops the extension whatever its length, unlike [:-4].
        name = os.path.splitext(total_xml[i])[0] + '\n'
        if i in trainval_set:
            ftrainval.write(name)
            # Every trainval sample goes to exactly one of train / val.
            if i in train_set:
                ftrain.write(name)
            else:
                fval.write(name)
        else:
            ftest.write(name)
2、训练集、验证集和测试集提取(只给出train文件的提取方法)
# -*- coding:UTF-8 -*-
import shutil
import os  # used for path handling below; harmless if already imported

# Copy every image named in an image-set list file into a separate folder.
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\d" or "\V".
list_path = r'D:\dataset\VOCdevkit\split\VOC2007\ImageSets\Main\trainval.txt'
f_train = r'D:\dataset\VOCdevkit\VOC2007\train'
image_dir = r'D:\dataset\VOCdevkit\VOC2007\JPEGImages'

# NOTE(review): the list being read is trainval.txt while the destination
# folder is named "train" -- confirm this pairing is intended.

# shutil.copy only copies INTO dst when dst is an existing directory;
# otherwise dst is used as the target file name and every image would
# overwrite the same file. Make sure the directory exists first.
if not os.path.isdir(f_train):
    os.makedirs(f_train)

# Iterate the file directly (no list(...) copy) and let `with` close it.
with open(list_path, 'r') as f_txt:
    for line in f_txt:
        # strip() removes the trailing newline for names of any length,
        # instead of assuming every name is exactly six characters long.
        imagename = line.strip()
        if not imagename:
            continue  # skip blank lines
        imagename = imagename + '.jpg'
        imagepath = os.path.join(image_dir, imagename)
        shutil.copy(imagepath, f_train)
        # Optional: remove the source image after copying the train and val
        # sets, so that the images left in JPEGImages form the test set.
        # os.remove(imagepath)

# Annotations are handled the same way: replace '.jpg' with '.xml'.