import glob
import math
import os
import shutil
# For each label file found in label_from, locate the matching image and label,
# then sort them into "images" and "labels" folders.
import time
# Image file extension used by the dataset; change the index to pick ".png".
_IMAGE_FORMAT_CHOICES = (".jpg", ".png")
IMAGE_FORMAT = _IMAGE_FORMAT_CHOICES[0]

# Train/validation split ratio (fraction of samples that goes to training).
# Typical splits are 7:3 or 8:2.
split_rate = 0.8

# Where the data comes from (directories).
label_from = r"D:\Dataset\Annotation_temp"  # folder where labelimg saves the label files
image_from = r"D:\Dataset\Images_temp"  # folder where labelimg saves the image files

# Where the data goes (directory). The current Unix time, rounded to whole
# seconds, is appended to the folder name.
target_dir = r"D:\Dataset\QQ_SafeVerify"
target_dir += "_" + str(round(time.time()))
def mk_directory(target_dir):
    """Create the dataset directory tree and return its leaf folders.

    The following layout is created below ``target_dir``::

        target_dir/
            train/images
            train/labels
            val/images
            val/labels
            test

    Args:
        target_dir: Root directory of the dataset. It is created together
            with any missing parent directories.

    Returns:
        A dict with the keys ``"train_images"``, ``"train_labels"``,
        ``"val_images"`` and ``"val_labels"``, each mapped to the path of
        the corresponding folder. The ``test`` folder is created but is not
        part of the returned mapping.

    Raises:
        OSError: If a directory cannot be created (for example because a
            regular file already occupies one of the paths).
    """
    print("创建目录结构")
    target_dir_info = {}
    for subset in ("train", "val"):
        for kind in ("images", "labels"):
            key = subset + "_" + kind
            target_dir_info[key] = os.path.join(target_dir, subset, kind)

    leaf_dirs = list(target_dir_info.values())
    leaf_dirs.append(os.path.join(target_dir, "test"))
    for path in leaf_dirs:
        # makedirs creates missing parents (os.mkdir would fail if the parent
        # of target_dir did not exist) and exist_ok avoids the racy
        # "check, then create" pattern.
        os.makedirs(path, exist_ok=True)
    return target_dir_info
def split_samples(label_from, image_from, target_dir_info,
                  train_ratio=None, image_format=None):
    """Move label/image pairs into the train and val folders.

    Every ``*.txt`` file in ``label_from`` other than ``classes.txt`` is
    treated as a label whose image is the file with the same stem and the
    extension ``image_format`` in ``image_from``. The first
    ``ceil(total * train_ratio)`` pairs (in the order returned by ``glob``)
    are moved to the train folders, the remaining pairs to the val folders::

        train/images, train/labels (+ classes.txt)
        val/images,   val/labels   (+ classes.txt)

    Side effects:
        * ``classes.txt`` is copied to the train labels folder and moved to
          the val labels folder.
        * A label file without a matching image is deleted from
          ``label_from`` and excluded from the split.
        * All other label and image files are moved, not copied.

    Args:
        label_from: Directory holding the label ``.txt`` files.
        image_from: Directory holding the image files.
        target_dir_info: Mapping with the keys ``"train_images"``,
            ``"train_labels"``, ``"val_images"`` and ``"val_labels"``
            pointing to existing directories.
        train_ratio: Fraction of the samples assigned to the training set.
            Defaults to the module-level ``split_rate``.
        image_format: Image file extension including the leading dot.
            Defaults to the module-level ``IMAGE_FORMAT``.
    """
    if train_ratio is None:
        train_ratio = split_rate
    if image_format is None:
        image_format = IMAGE_FORMAT

    print("分割数据集train_val")
    candidates = glob.glob(os.path.join(label_from, "*.txt"))

    # Validate the annotations and build the work list in a single pass, so
    # that labels deleted for lacking an image are never scheduled for a move.
    label_names = []
    for label_path in candidates:
        name = os.path.basename(label_path)
        # Compare the exact file name; a suffix test would also match
        # unrelated files such as "myclasses.txt".
        if name == "classes.txt":
            shutil.copy(label_path, target_dir_info["train_labels"])
            shutil.move(label_path, target_dir_info["val_labels"])
            continue
        print(name)
        image_path = os.path.join(
            image_from, os.path.splitext(name)[0] + image_format)
        if not os.path.exists(image_path):
            os.remove(label_path)
            print("文件不存在:", image_path)
            continue
        label_names.append(name)

    total = len(label_names)
    train_num = math.ceil(total * train_ratio)
    valid_num = total - train_num
    print("总体样本数量total:", total)
    print("训练集样本数量train_num:", train_num)
    print("验证集样本数量valid_num:", valid_num)

    for index, name in enumerate(label_names):
        subset = "train" if index < train_num else "val"
        to_image_dir = target_dir_info[subset + "_images"]
        to_label_dir = target_dir_info[subset + "_labels"]
        print("to_image_dir", to_image_dir)
        print("to_label_dir", to_label_dir)
        print(name)
        # Move the label file first.
        shutil.move(os.path.join(label_from, name),
                    os.path.join(to_label_dir, name))
        # Then move the image. Only the file name's extension is swapped, so
        # a ".txt" inside a directory name cannot corrupt the path.
        image_name = os.path.splitext(name)[0] + image_format
        shutil.move(os.path.join(image_from, image_name),
                    os.path.join(to_image_dir, image_name))
def mk_Annotation(label_dir=None, image_dir=None, image_format=None, limit=10):
    """Generate placeholder annotation files for simulation runs.

    Does nothing if ``label_dir`` already contains at least one ``*.txt``
    file. Otherwise it writes ``classes.txt`` containing ``"1"`` and creates
    one empty ``<stem>.txt`` label for each of the first ``limit`` images
    found in ``image_dir``.

    Args:
        label_dir: Directory that receives the generated files. Defaults to
            the module-level ``label_from``.
        image_dir: Directory scanned for images. Defaults to the
            module-level ``image_from``.
        image_format: Image file extension including the leading dot.
            Defaults to the module-level ``IMAGE_FORMAT``.
        limit: Maximum number of images to create labels for.
    """
    if label_dir is None:
        label_dir = label_from
    if image_dir is None:
        image_dir = image_from
    if image_format is None:
        image_format = IMAGE_FORMAT

    print("模拟生成Annotation.txt文件")
    # Labels already exist: leave the folder untouched.
    if glob.glob(os.path.join(label_dir, "*.txt")):
        return

    with open(os.path.join(label_dir, "classes.txt"), "w") as f:
        f.write("1")

    image_paths = glob.glob(os.path.join(image_dir, "*" + image_format))
    for image_path in image_paths[:limit]:
        print(image_path)  # e.g. C:\Users\999\Desktop\QQ_SafeVerify\sample\1672192768483.jpg
        file_name = os.path.basename(image_path)
        # Strip exactly the configured extension rather than a fixed four
        # characters, so extensions such as ".jpeg" work as well.
        stem = file_name[:len(file_name) - len(image_format)]
        print(stem)
        # Create an empty label file.
        with open(os.path.join(label_dir, stem + ".txt"), "w"):
            pass
if __name__ == "__main__":
    # ---- Step 1: create the directory structure ----
    dir_info = mk_directory(target_dir)
    print(dir_info)
    # Example of the mapping printed above:
    # {'train_images': 'C:\\Users\\999\\Desktop\\QQ_SafeVerify\\dataset\\train\\images',
    #  'train_labels': 'C:\\Users\\999\\Desktop\\QQ_SafeVerify\\dataset\\train\\labels',
    #  'val_images': 'C:\\Users\\999\\Desktop\\QQ_SafeVerify\\dataset\\val\\images',
    #  'val_labels': 'C:\\Users\\999\\Desktop\\QQ_SafeVerify\\dataset\\val\\labels'}

    # ---- Step 2: simulated environment ----
    # Generates up to 10 .txt files in label_from that correspond to images.
    mk_Annotation()

    # ---- Step 3: split the samples into train and val ----
    split_samples(label_from, image_from, dir_info)
    print("分割完成:", target_dir)
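# NOTE: stray trailing text ("08-09", "649", "12-13", "6587") was removed here;
# it was not part of the script and is not valid Python.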