参考:
【LogoDetection 数据集处理】(1)将数据集切分为训练集和验证集
【LogoDetection 数据集处理】(2)画出训练集图片的标注框
原始的数据集是将所有类别的图片都放在一个文件夹中,不方便查看。所以有必要将训练集按照类别划分为多个文件夹,这里数据集中有50个类别,所以我们划分为50个文件夹,每个类别的图片放在一个文件夹中。
代码如下:
"""
将数据集按照json标注文件划分为50个classes,每个class的图片放到一个文件夹中。
"""
import os
import json
import shutil
from tqdm import tqdm
# The category names in the JSON annotations contain slashes ("/") and Chinese
# characters, which cause trouble in paths and file names, so the categories
# are renamed here. Ids are assigned in order, starting from 1.
categories_list = [
    {"id": category_id, "name": category_name}
    for category_id, category_name in enumerate(
        (
            "bingdundun",
            "sanyo",
            "Eifini",
            "PSALTER",
            "Beaster",
            "ON",
            "BYREDO",
            "Ubras",
            "Eternelle",
            "PerfectDiary",
            "huaxizi",
            "Clarins",
            "Loccitane",
            "Versace",
            "Mizuno",
            "Lining",
            "DoubleStar",
            "YONEX",
            "ToryBurch",
            "Gucci",
            "LouisVuitton",
            "CARTELO",
            "JORDAN",
            "KENZO",
            "UNDEFEATED",
            "BoyLondon",
            "TREYO",
            "carhartt",
            "jierou",
            "Blancpain",
            "GXG",
            "ledin",
            "Diadora",
            "TUCANO",
            "Loewe",
            "GraniteGear",
            "DESCENTE",
            "OSPREY",
            "Swatch",
            "erke",
            "MassimoDutti",
            "PINKO",
            "PALLADIUM",
            "origins",
            "Trendiano",
            "yiner",
            "MonsterGuardians",
            "fuerjia",
            "IPSA",
            "Schwarzkopf",
        ),
        start=1,
    )
]
# Location of the raw training data, its image folder and its annotation file.
data_path = "dataset/fewshotlogodetection_round1_train_202204/train"
images_path = os.path.join(data_path, "images")
annoations_path = os.path.join(data_path, "annotations/instances_train2017.json")

# Parse the annotation JSON, then pull out the image and annotation records.
with open(annoations_path, mode="r", encoding="utf-8") as annotation_file:
    annoations_dict = json.loads(annotation_file.read())
images_list = annoations_dict["images"]
annotations_list = annoations_dict["annotations"]
# Create the top-level output folder; nothing happens if it already exists.
trainset_dir = "TrainSet_50Classes"
os.makedirs(trainset_dir, exist_ok=True)

# Create one sub-folder per category and build the mapping
# category id -> folder path.
# The mapping is keyed by each category's declared "id" rather than by its
# position in categories_list, so it stays correct if the list is reordered
# or its ids are not contiguous.
class_dict = {}
for category in categories_list:
    dir_name_path = os.path.join(trainset_dir, category["name"])
    class_dict[category["id"]] = dir_name_path
    # Start from an empty folder so images left over from a previous run
    # do not linger in the result.
    if os.path.exists(dir_name_path):
        shutil.rmtree(dir_name_path)
    os.makedirs(dir_name_path, exist_ok=True)
# Copy every image into the folder of each category it is annotated with.
# Images are copied, not moved, so the original dataset stays intact; an image
# annotated with several categories ends up in several folders.
#
# The annotations are indexed once up front so that each file on disk is
# resolved with dictionary lookups instead of rescanning images_list and
# annotations_list for every file.

# file name -> ids of the image records carrying that file name
image_ids_by_file_name = {}
for image_info in images_list:
    image_ids_by_file_name.setdefault(image_info["file_name"], []).append(
        image_info["id"]
    )

# image id -> distinct category ids, in first-seen order
category_ids_by_image_id = {}
for annotation in annotations_list:
    category_ids = category_ids_by_image_id.setdefault(annotation["image_id"], [])
    # An image may carry several boxes of the same category; one copy per
    # (image, category) pair is enough.
    if annotation["category_id"] not in category_ids:
        category_ids.append(annotation["category_id"])

images_name = os.listdir(images_path)
for image_name in tqdm(images_name, desc="process"):
    image_path = os.path.join(images_path, image_name)
    # Files without an image record, and images without annotations, are skipped.
    for image_id in image_ids_by_file_name.get(image_name, ()):
        for cls_id in category_ids_by_image_id.get(image_id, ()):
            shutil.copy(image_path, class_dict[cls_id])
划分结果如下: