将数据集(位置:D:/Code/Data/centerlinedata/tem_voc/JPEGImages/)下的621张图片按照划分比例(如 训练集(train):验证集(val):测试集(test)=6:2:2)进行拆分,复制到新的文件夹(D:/Code/Data/GREENTdata/),并在该文件夹下创建train、val、test三个文件夹
步骤一:
使用random.shuffle(current_data_index_list)打乱索引list的顺序
copy2()函数用来复制图片到另一个位置
import os
import random
from shutil import copy2
# Step one: randomly split the images in `file_path` into train/val/test
# sub-folders of `new_file_path` according to `split_rate`, copying each image
# exactly once.

# Folder holding the original images
file_path = r"D:/Code/Data/centerlinedata/tem_voc/JPEGImages/"
# Destination root; the train/val/test folders are created beneath it
new_file_path = r"D:/Code/Data/GREENTdata/"
# Split ratio train : val : test = 6 : 2 : 2
split_rate = [0.6, 0.2, 0.2]
class_names = os.listdir(file_path)
# Names of the folders created under the destination root
split_names = ['train', 'val', 'test']
print(class_names)  # ['00000.jpg', '00001.jpg', '00002.jpg'... ]

# Create the destination root and its train/val/test folders when missing.
for split_name in split_names:
    split_path = os.path.join(new_file_path, split_name)
    print(split_path)  # D:/Code/Data/GREENTdata/train, val, test
    os.makedirs(split_path, exist_ok=True)

train_path = os.path.join(new_file_path, 'train')  # D:/Code/Data/GREENTdata/train
val_path = os.path.join(new_file_path, 'val')  # D:/Code/Data/GREENTdata/val
test_path = os.path.join(new_file_path, 'test')  # D:/Code/Data/GREENTdata/test

# Only regular files are split; sub-directories cannot be copied with copy2.
current_all_data = [
    name for name in class_names
    if os.path.isfile(os.path.join(file_path, name))
]
current_data_length = len(current_all_data)  # number of images to split

# Shuffle once so that every image lands in exactly one split.
random.shuffle(current_all_data)

# Exclusive end positions of the train and val slices in the shuffled list.
# round() keeps float noise in the ratios from shifting a boundary by one.
train_stop_flag = round(current_data_length * split_rate[0])
val_stop_flag = round(current_data_length * (split_rate[0] + split_rate[1]))

train_num = 0
val_num = 0
test_num = 0
# Copy every image into the folder of the split its position falls in.
for current_idx, img_name in enumerate(current_all_data):
    src_img_path = os.path.join(file_path, img_name)
    if current_idx < train_stop_flag:
        copy2(src_img_path, train_path)
        train_num += 1
    elif current_idx < val_stop_flag:
        copy2(src_img_path, val_path)
        val_num += 1
    else:
        copy2(src_img_path, test_path)
        test_num += 1
print("Done!", train_num, val_num, test_num)
步骤二:
将与图片对应的标签文件放入相应的标签文件夹(如test_labels)中(目前需手动更改match_file_path路径,无需手动更改的完整版本见后文“总结”部分),代码如下:
import os
import random
from shutil import copy2
# Step two: copy the label files whose base name matches an image of one
# already-split image folder into the label folder of that same split.

# Folder holding all original label files
file_path = r"D:/Code/Data/centerlinedata/tem_voc/SegmentationClassPNG/"
# Destination root; the *_labels folders are created beneath it
new_file_path = r"D:/Code/Data/GREENTdata/"
# Split image folder whose images are matched against the labels
match_file_path = r"D:/Code/Data/GREENTdata/test/"
class_names = os.listdir(file_path)
match_names = os.listdir(match_file_path)
# Names of the label folders created under the destination root
label_names = ['train_labels', 'val_labels', 'test_labels']
print(class_names)  # file names found in the label folder

# Create the destination root and the three label folders when missing.
for label_name in label_names:
    os.makedirs(os.path.join(new_file_path, label_name), exist_ok=True)

# Derive the destination from the matched split folder ('test' ->
# 'test_labels') so that changing match_file_path also changes the target.
split_name = os.path.basename(os.path.normpath(match_file_path))
split_path = os.path.join(new_file_path, split_name + '_labels')
os.makedirs(split_path, exist_ok=True)

# Base names (without extension) of the images in the matched split; a set
# gives constant-time membership tests instead of a nested loop.
match_nums = {os.path.splitext(match_name)[0] for match_name in match_names}

# Copy every label file whose base name belongs to the matched split.
for class_name in class_names:
    class_num = os.path.splitext(class_name)[0]
    if class_num in match_nums:
        src_img_path = os.path.join(file_path, class_name)
        copy2(src_img_path, split_path)
print("Done!")
总结:代码整合!智能划分,根据文件路径需要自行修改
import os
import random
from shutil import copy2
# 步骤一未作改动,需要带入自己的文件路径file_path 、new_file_path
def splitimg(file_path=r"D:\Code\Data\UAS_Dataset\Sun\Sun_img",
             new_file_path=r"D:\Code\Data\UAS_Dataset\Sun\Sun_img",
             split_rate=(0.8, 0.2, 0)):
    """Randomly move the images of a folder into train/val/test sub-folders.

    The regular files found directly in ``file_path`` are shuffled once and
    then moved (``os.rename``, not copied) into ``train``, ``val`` and
    ``test`` folders created under ``new_file_path``.

    Args:
        file_path: Folder containing the images to split.
        new_file_path: Folder under which ``train``, ``val`` and ``test`` are
            created. It may be the same folder as ``file_path``.
        split_rate: Three fractions ``(train, val, test)``. Only the first two
            determine the boundaries; whatever remains goes to ``test``.

    Raises:
        ValueError: If ``split_rate`` does not hold exactly three
            non-negative numbers.
    """
    if len(split_rate) != 3 or any(rate < 0 for rate in split_rate):
        raise ValueError(
            "split_rate must hold three non-negative fractions (train, val, test)"
        )

    class_names = os.listdir(file_path)
    print(class_names)  # ['00000.jpg', '00001.jpg', '00002.jpg'... ]

    # Only regular files are split: when the function is run again on the same
    # folder, the train/val/test folders from the previous run are listed too
    # and must not be moved into themselves.
    current_all_data = [
        name for name in class_names
        if os.path.isfile(os.path.join(file_path, name))
    ]

    # Create the destination root and its train/val/test folders when missing.
    split_names = ['train', 'val', 'test']
    for split_name in split_names:
        os.makedirs(os.path.join(new_file_path, split_name), exist_ok=True)

    # Shuffle once so that every image is assigned to exactly one split.
    random.shuffle(current_all_data)

    # Exclusive end positions of the train and val slices in the shuffled
    # list; round() keeps float noise in the ratios from shifting a boundary.
    current_data_length = len(current_all_data)
    train_stop_flag = round(current_data_length * split_rate[0])
    val_stop_flag = round(current_data_length * (split_rate[0] + split_rate[1]))

    moved = {'train': 0, 'val': 0, 'test': 0}
    for current_idx, img_name in enumerate(current_all_data):
        if current_idx < train_stop_flag:
            target = 'train'
        elif current_idx < val_stop_flag:
            target = 'val'
        else:
            target = 'test'
        src_img_path = os.path.join(file_path, img_name)
        newpath = os.path.join(new_file_path, target, img_name)
        os.rename(src_img_path, newpath)
        moved[target] += 1
    print("Done!", moved['train'], moved['val'], moved['test'])
# 步骤二升级,仅需要带入文件路径file_path 、new_file_path 、 match_file_paths
def split_label_img(file_path=r"D:\Code\Data\UAS_Dataset\Sun\Sun_txt",
                    new_file_path=r"D:\Code\Data\UAS_Dataset\Sun\Sun_txt",
                    match_file_paths=r"D:\Code\Data\UAS_Dataset\Sun\Sun_img"):
    """Move each ``.txt`` label into the label folder of its image's split.

    Every sub-folder of ``match_file_paths`` (for example ``train``, ``val``,
    ``test``) is treated as one split. For each file in such a folder, the
    label ``<base name>.txt`` is moved (``os.rename``) from ``file_path`` into
    ``<new_file_path>/<split>_labels``.

    Args:
        file_path: Folder containing all ``.txt`` label files.
        new_file_path: Folder under which the ``<split>_labels`` folders are
            created.
        match_file_paths: Folder containing the already-split image folders.
    """
    os.makedirs(new_file_path, exist_ok=True)

    for match_name in os.listdir(match_file_paths):
        # Path of one split's image folder, e.g. ...\Sun_img\train
        match_img = os.path.join(match_file_paths, match_name)
        if not os.path.isdir(match_img):
            # Stray files next to the split folders are not splits.
            continue

        # Label folder for this split: train_labels, val_labels, test_labels
        split_path = os.path.join(new_file_path, match_name + '_labels')
        os.makedirs(split_path, exist_ok=True)

        missing = 0
        for img in os.listdir(match_img):
            # splitext handles extensions of any length (.jpg, .jpeg, ...).
            txtname = os.path.splitext(img)[0] + '.txt'
            txtpath = os.path.join(file_path, txtname)
            newpath = os.path.join(split_path, txtname)
            try:
                os.rename(txtpath, newpath)
            except FileNotFoundError:
                # An image without a label file (or one already moved by an
                # earlier run) must not abort the remaining moves.
                missing += 1
        if missing:
            print("Warning:", missing, "image(s) in", match_name, "have no label file")
    print("Done!")
if __name__ == '__main__':
    # Split the images first: split_label_img() reads the train/val/test
    # folders that splitimg() creates.
    splitimg()
    split_label_img()
到这一步就大功告成啦,如果有用的话,麻烦看到的朋友点个赞呗^_^
补充:
关于txt等文本内容的文件读取方式:Python 读取文件夹名字(不包括后缀)并保存为txt文件_python获取文件名不含后缀名_小蛙的博客的博客-CSDN博客