一、数据集可以拆分为三个子集:训练集、验证集、测试集。
- 在训练集上进行训练,然后使用测试集对模型进行测试并观察其指标,根据在测试集上获得的效果调整模型。
- 模型调整包括:更改学习速率、添加或移除特征、设计新模型等一系列的手工调整。
- 训练集参与模型的学习,不能反映模型的真实能力。
- 测试集参与手工调参,也不能反映模型的真实能力。
- 所以需要增加验证集专门用于手工调参,把测试集留到最后,只用来考察模型的真实能力。
- 如果只有训练集和测试集,反复根据测试集的结果调整模型,就会出现对测试集过拟合的现象。
二、将图片和txt格式标签划分为train、val、test三部分
文件夹格式为:图片和标签各一个文件夹
每个目标文件夹下再分为train、val、test三个子文件夹
import os
import random
import shutil
from shutil import copy2
def data_take_split(data_folder, take_data_folder, label_folder, label_take_folder,
                    train_scales=0.8, val_scales=0.1, test_scales=0.1):
    """Randomly split images and their label files into train/val/test folders.

    Every image in ``data_folder`` is paired with the label file in
    ``label_folder`` that has the same file stem (``a.jpg`` <-> ``a.txt``).
    The pairs are shuffled and copied into ``train``, ``val`` and ``test``
    sub-folders of ``take_data_folder`` (images) and ``label_take_folder``
    (labels), so an image and its label always land in the same split.

    Args:
        data_folder: Folder containing the source images.
        take_data_folder: Destination root for the split images.
        label_folder: Folder containing the source label files.
        label_take_folder: Destination root for the split labels.
        train_scales: Fraction of the pairs copied to ``train``.
        val_scales: Fraction of the pairs copied to ``val``.
        test_scales: Kept for interface compatibility; the test split always
            receives whatever remains after ``train`` and ``val``.

    Images that have no label with a matching stem are skipped and their
    count is printed. The function prints the size of each split and
    returns ``None``.
    """
    print('数据集划分开始')
    split_names = ('train', 'val', 'test')

    # makedirs(exist_ok=True) also creates a missing destination root and
    # tolerates folders left over from a previous run.
    for target_root in (take_data_folder, label_take_folder):
        for split_name in split_names:
            os.makedirs(os.path.join(target_root, split_name), exist_ok=True)

    # Index the labels by stem. os.listdir returns entries in arbitrary order,
    # so pairing images and labels by list position is not reliable.
    label_by_stem = {}
    for label_name in sorted(os.listdir(label_folder)):
        if os.path.isfile(os.path.join(label_folder, label_name)):
            stem = os.path.splitext(label_name)[0]
            label_by_stem.setdefault(stem, label_name)

    pairs = []
    skipped_count = 0
    for picture_name in sorted(os.listdir(data_folder)):
        if not os.path.isfile(os.path.join(data_folder, picture_name)):
            continue  # sub-directories cannot be copied with copy2
        label_name = label_by_stem.get(os.path.splitext(picture_name)[0])
        if label_name is None:
            skipped_count += 1
            continue
        pairs.append((picture_name, label_name))
    if skipped_count:
        print('跳过无对应标签的图片', skipped_count)

    random.shuffle(pairs)

    # Integer cut-offs used as half-open slices give exactly
    # round(n * train_scales) training items; the previous "<=" comparison
    # put one extra item into train and could leave test empty.
    total = len(pairs)
    train_end = round(total * train_scales)
    val_end = max(train_end, round(total * (train_scales + val_scales)))
    pairs_by_split = {
        'train': pairs[:train_end],
        'val': pairs[train_end:val_end],
        'test': pairs[val_end:],
    }

    for split_name in split_names:
        image_target = os.path.join(take_data_folder, split_name)
        label_target = os.path.join(label_take_folder, split_name)
        for picture_name, label_name in pairs_by_split[split_name]:
            copy2(os.path.join(data_folder, picture_name), image_target)
            copy2(os.path.join(label_folder, label_name), label_target)

    print('训练集', len(pairs_by_split['train']))
    print('验证集', len(pairs_by_split['val']))
    print('测试集', len(pairs_by_split['test']))
if __name__ == '__main__':
    # Source and destination folders, keyed by the parameter each one feeds.
    split_paths = {
        'data_folder': 'C:\\Users\\kongx\\Desktop\\train\\images',       # image source folder
        'take_data_folder': 'C:\\Users\\kongx\\Desktop\\train\\image',   # image destination root
        'label_folder': 'C:\\Users\\kongx\\Desktop\\train\\labels',      # label source folder
        'label_take_folder': 'C:\\Users\\kongx\\Desktop\\train\\label',  # label destination root
    }
    # Run the split with the default 0.8 / 0.1 / 0.1 ratios.
    data_take_split(**split_paths)