ImageNet数据集。
默认环境已经配置完成,图片也在各自对应文件夹,文件夹层级如下。
color_data
---black
    --- black_1.jpg
        black_2.jpg
        ...
        black_n.jpg
---blue
    --- blue_1.jpg
        blue_2.jpg
        ...
        blue_m.jpg
---white
    --- white_1.jpg
        white_2.jpg
        ...
        white_p.jpg
...
本例制作的数据集适用于yolov8分类和mmpretrain分类(博主只做过这两个框架的分类训练)
完整代码以及使用如下:
# 制作imagenet数据集
import os
import shutil
import random
from tqdm import tqdm
import numpy as np
# Class names. A class's position in this list is the integer label that is
# written next to each image path in the annotation files.
class_name = ['black', 'blue', 'white', 'green', 'red', 'yellow']

# Source folder (one sub-folder per class) and output dataset folder.
data_dir = '/home/user/mydata/images'
target_dir = '/home/user/mydata/dataset'

# Fraction of every class that goes into each split.
train_split_ratio = 0.8
val_split_ratio = 0.2
# test_split_ratio = 0.1

# Create the output folder and its sub-folders
# (add 'test' to the tuple below when a test split is wanted).
os.makedirs(target_dir, exist_ok=True)
for sub_dir in ('meta', 'train', 'val'):
    os.makedirs(os.path.join(target_dir, sub_dir), exist_ok=True)
# Collect the class folders found directly under data_dir. Stray files
# (e.g. .DS_Store) are ignored, and the list is sorted so that runs process
# the classes in a reproducible order.
categories = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)

# Fail before copying anything if a folder has no label in class_name;
# otherwise the mismatch only surfaces later, when labels are looked up.
unknown = [name for name in categories if name not in class_name]
if unknown:
    raise ValueError(
        f'Folders under {data_dir} are missing from class_name: {unknown}'
    )

# When the two ratios cover the whole class, the validation split takes
# every file left after the training split. Truncating both counts with
# int() would otherwise silently drop files (9 files -> 7 train + 1 val).
# If the ratios sum to less than 1, the leftover files are not copied; they
# are the ones an optional test split would use.
use_all_files = train_split_ratio + val_split_ratio >= 1.0 - 1e-9

# NOTE: files already present in target_dir are not removed, so re-running
# with a different shuffle can leave the same image in both train and val.
# Clear target_dir before re-running.
for category in categories:
    category_dir = os.path.join(data_dir, category)

    # Only regular files can be copied with shutil.copy; skip sub-folders.
    files = [
        name for name in os.listdir(category_dir)
        if os.path.isfile(os.path.join(category_dir, name))
    ]
    random.shuffle(files)

    # Slice boundaries of the shuffled list for this class.
    total_files = len(files)
    train_split = int(train_split_ratio * total_files)
    if use_all_files:
        val_end = total_files
    else:
        val_end = train_split + int(val_split_ratio * total_files)

    # (split folder, word used in the progress bar, files of that split)
    splits = (
        ('train', 'train', files[:train_split]),
        ('val', 'validation', files[train_split:val_end]),
    )
    for split, label, split_files in splits:
        # Create the class folder once per split, even when the split is
        # empty, so later steps can list it without FileNotFoundError.
        dst = os.path.join(target_dir, split, category)
        os.makedirs(dst, exist_ok=True)
        for file in tqdm(split_files, desc=f'Copying {label} data for {category}'):
            shutil.copy(os.path.join(category_dir, file), os.path.join(dst, file))
def _write_annotation(split):
    """Write meta/<split>.txt for the images under <target_dir>/<split>.

    Each line has the form '<category>/<file> <label>' (joined with the
    platform path separator), where <label> is the category's position in
    class_name.

    Raises:
        ValueError: if a category with a split folder is not in class_name.
    """
    ann_path = os.path.join(target_dir, 'meta', f'{split}.txt')
    # Explicit UTF-8 so non-ASCII file names are written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(ann_path, 'w', encoding='utf-8') as ann_txt:
        for category in categories:
            split_dir = os.path.join(target_dir, split, category)
            # A class with too few images may have no folder for this split.
            if not os.path.isdir(split_dir):
                continue
            # The label is the same for every file of the class: look it up once.
            index = class_name.index(category)
            # Sorted so the annotation file is identical across runs.
            ann_txt.writelines(
                f'{os.path.join(category, file)} {index}\n'
                for file in sorted(os.listdir(split_dir))
            )


# Create the annotation files (train.txt, val.txt); add 'test' to the tuple
# when a test split has been produced.
for split in ('train', 'val'):
    _write_annotation(split)

print("数据集划分完成!")