我们拿到的数据集通常是一个尚未划分的整体,而训练模型时往往需要将其拆分为训练集与测试集,拆分代码如下。该代码从原始数据集中随机抽取一部分样本(图像及其对应的标签文件),并将它们移动(而非复制)到测试集文件夹中,剩下的数据就可以作为训练集使用。当然,也可以在此代码的基础上稍作修改,用同样的方式划分出训练集。
import os
import shutil
import random
'''
image_path:原始数据集图像文件夹路径
label_path:原始数据集标签文件夹路径
test_image_path:测试集图像文件夹路径
test_label_path:测试集标签文件夹路径
test_percent:测试集数据占比,取值范围0~1,默认0.2(即测试集与训练集按2:8拆分)
'''
def split_train_test_data(image_path, label_path, test_image_path, test_label_path,
                          test_percent=0.2, image_ext='.jpg', label_ext='.txt'):
    """Move a random subset of image/label pairs into test-set folders.

    Files are paired by name: a label ``<stem><label_ext>`` in ``label_path``
    belongs to the image ``<stem><image_ext>`` in ``image_path``. Only
    complete pairs (both files exist) are considered, so stray files,
    sub-folders, or labels without an image are left untouched instead of
    aborting the split half-way. Whatever is not moved remains in the
    original folders and can be used as the training set.

    Args:
        image_path: Folder containing the original images.
        label_path: Folder containing the original label files.
        test_image_path: Destination folder for test images (created if
            missing).
        test_label_path: Destination folder for test labels (created if
            missing).
        test_percent: Fraction of the complete pairs to move, in ``[0, 1]``.
            Defaults to 0.2, i.e. a 2:8 test/train split.
        image_ext: Image file extension, including the leading dot.
        label_ext: Label file extension, including the leading dot.

    Raises:
        ValueError: If ``test_percent`` is outside ``[0, 1]``.
        shutil.Error: If a file with the same name already exists in a
            destination folder.
    """
    # Validate before touching the filesystem; previously a value above 1
    # surfaced as an IndexError in the middle of the move loop.
    if not 0 <= test_percent <= 1:
        raise ValueError('test_percent must be between 0 and 1, got %r' % (test_percent,))

    os.makedirs(test_image_path, exist_ok=True)
    os.makedirs(test_label_path, exist_ok=True)

    # Collect only complete image/label pairs. os.listdir order is arbitrary,
    # so sort first to make the shuffle reproducible under random.seed().
    pairs = []
    for entry in sorted(os.listdir(label_path)):
        stem, ext = os.path.splitext(entry)
        if ext != label_ext:
            # Skips unrelated files and directories, including a test folder
            # that happens to live inside label_path.
            continue
        label_file = os.path.join(label_path, entry)
        image_file = os.path.join(image_path, stem + image_ext)
        if os.path.isfile(label_file) and os.path.isfile(image_file):
            pairs.append((image_file, label_file))

    random.shuffle(pairs)

    # The test-set size is based on usable pairs, not on raw directory entries.
    test_count = int(test_percent * len(pairs))

    for image_file, label_file in pairs[:test_count]:
        shutil.move(label_file, test_label_path)
        shutil.move(image_file, test_image_path)

    print('split complete,test_image number=%d' % test_count)