Python 生成无标签数据集格式
对于无监督任务,数据集不需要类别标签,标注文件中每行只需写一个图片文件名。
1. 假设整个数据集文件夹的结构如下:
data_root
├── meta
│   ├── test.txt    # 测试数据集的标注文件
│   ├── train.txt   # 训练数据集的标注文件
│   └── val.txt     # 验证数据集的标注文件
├── train
│   ├── 123.png
│   ├── folder_1
│   │   ├── xxx.png
│   │   └── xxy.png
│   └── nsdf3.png
├── test
└── val
示例代码:
import math
import os
import random
import shutil

from sklearn.model_selection import train_test_split
# File extensions (lower-case, with the leading dot) recognised as images.
_IMAGE_SUFFIXES = (".jpg", ".bmp", ".png", ".jpeg")

# Fraction of the images that goes into the validation split.
_VAL_RATIO = 0.2


def _list_images(image_folder):
    """Return the sorted names of the image files directly inside *image_folder*."""
    return sorted(
        name
        for name in os.listdir(image_folder)
        # Compare the extension case-insensitively so "A.JPG" is picked up too.
        if os.path.splitext(name)[1].lower() in _IMAGE_SUFFIXES
        # Skip directories that merely look like images (e.g. "backup.png/").
        and os.path.isfile(os.path.join(image_folder, name))
    )


def _export_split(names, list_path, image_folder, split_dir):
    """Write *names* one per line to *list_path* and copy each image into *split_dir*."""
    with open(list_path, "w", encoding="utf-8") as f:
        for name in names:
            f.write(f"{name}\n")
            shutil.copy(
                os.path.join(image_folder, name),
                os.path.join(split_dir, name),
            )


def gen_unlabel_dataset(image_folder, data_root):
    """Build an unlabeled train/val/test dataset layout under *data_root*.

    The images found directly inside ``image_folder`` (sub-folders are not
    scanned) are shuffled and split 80/20 into a training and a validation
    set. Every image is *copied* into ``data_root/train`` or
    ``data_root/val``; the source folder is left untouched. The file names
    of each split are written, one per line, to ``data_root/meta/train.txt``
    and ``data_root/meta/val.txt``. ``data_root/test`` and
    ``data_root/meta/test.txt`` are created empty.

    Args:
        image_folder: Directory that holds the raw images.
        data_root: Root directory of the dataset to generate. It is created
            if it does not exist.

    Raises:
        OSError: If ``image_folder`` cannot be listed or a file cannot be
            written or copied.
    """
    # Create the meta directory that holds the list files.
    meta_dir = os.path.join(data_root, "meta")
    os.makedirs(meta_dir, exist_ok=True)

    # Create the train, test and val image directories.
    train_dir = os.path.join(data_root, "train")
    test_dir = os.path.join(data_root, "test")
    val_dir = os.path.join(data_root, "val")
    for split_dir in (train_dir, test_dir, val_dir):
        os.makedirs(split_dir, exist_ok=True)

    # Collect the images in a stable order, then shuffle them. Sorting first
    # makes the split reproducible when the caller seeds ``random``.
    image_files = _list_images(image_folder)
    random.shuffle(image_files)

    # Validation gets ceil(20%) of the images, but the training split is never
    # left empty while at least one image exists (a single image goes to
    # train instead of raising an error).
    n_val = math.ceil(len(image_files) * _VAL_RATIO)
    n_val = min(n_val, max(len(image_files) - 1, 0))
    val_files = image_files[:n_val]
    train_files = image_files[n_val:]

    # Write train.txt / val.txt and copy the images into their split folder.
    _export_split(
        train_files, os.path.join(meta_dir, "train.txt"), image_folder, train_dir
    )
    _export_split(
        val_files, os.path.join(meta_dir, "val.txt"), image_folder, val_dir
    )

    # Create an empty test.txt; no images are assigned to the test split.
    test_file = os.path.join(meta_dir, "test.txt")
    with open(test_file, "w", encoding="utf-8"):
        pass
if __name__ == "__main__":
    # Placeholder: set this to the folder that holds the raw, unlabeled images.
    image_folder = r''
    if not image_folder:
        # An empty path cannot be listed, so stop early with a clear message
        # instead of creating an empty dataset folder in the working directory.
        raise SystemExit(
            "Please set image_folder to the directory that contains the raw images."
        )

    # The generated dataset is placed inside the image folder itself.
    data_root = os.path.join(image_folder, "unlabeled_dataset")
    os.makedirs(data_root, exist_ok=True)

    # Actually build the dataset; without this call the script only creates
    # an empty directory.
    gen_unlabel_dataset(image_folder, data_root)