数据集的常见结构及其生成对应JSON文件的脚本

2301_80416780

于 2024-08-10 14:37:10 发布

阅读量2.3k

点赞数 23

文章标签： json 人工智能 计算机视觉

本文链接：https://blog.csdn.net/2301_80416780/article/details/140865979

版权

1. 按类别组织

常见的结构，适用于分类任务。

dataset/

├── class1/

│ ├── image1.jpg

│ ├── image2.jpg

├── class2/

│ ├── image1.jpg

│ ├── image2.jpg

import os
import json

def create_class_index_json(data_dir, json_file):
    """Write a JSON file mapping class indices to class directory names.

    Every immediate subdirectory of ``data_dir`` is treated as one class;
    plain files are ignored. Indices are assigned in sorted name order so
    the mapping is reproducible: ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the index of a class
    depend on the filesystem.

    Args:
        data_dir: Dataset root whose subdirectories are the classes.
        json_file: Path of the JSON file to write.
    """
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )

    # Keys are the stringified indices, values are the class names.
    class_dict = {str(index): name for index, name in enumerate(class_names)}

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(class_dict, f, indent=4)

    print(f"Class index JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/dataset'  # path to the dataset root
json_output_file = 'class_index.json'      # name of the output JSON file
create_class_index_json(data_directory, json_output_file)

2. 按训练、验证和测试集组织

适用于需要划分数据集以进行验证和测试的情况

dataset/

├── train/

│ ├── class1/

│ ├── class2/

├── val/

│ ├── class1/

│ ├── class2/

├── test/

│ ├── class1/

│ ├── class2/

def create_train_val_test_json(data_dir, json_file):
    """Write a JSON file with an index-to-class mapping for each split.

    Looks for ``train``, ``val`` and ``test`` directories under
    ``data_dir``. For every split that exists, each immediate subdirectory
    is treated as one class. Splits that do not exist are left out of the
    output.

    Class names are sorted before indices are assigned. ``os.listdir``
    returns entries in arbitrary order, so without sorting the same class
    could receive different indices in different splits.

    Args:
        data_dir: Dataset root containing the split directories.
        json_file: Path of the JSON file to write.
    """
    dataset_dict = {}

    for split in ['train', 'val', 'test']:
        split_path = os.path.join(data_dir, split)
        if not os.path.isdir(split_path):
            continue

        class_names = sorted(
            name for name in os.listdir(split_path)
            if os.path.isdir(os.path.join(split_path, name))
        )
        dataset_dict[split] = {
            str(index): name for index, name in enumerate(class_names)
        }

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(dataset_dict, f, indent=4)

    print(f"Train/Val/Test JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/dataset'  # path to the dataset root
json_output_file = 'train_val_test.json'   # name of the output JSON file
create_train_val_test_json(data_directory, json_output_file)

3. 单一目录，文件名包含标签

所有图像都在同一个目录中，文件名包含类别信息。

dataset/

├── image_class1_01.jpg

├── image_class1_02.jpg

├── image_class2_01.jpg

├── image_class2_02.jpg

def create_filenames_with_labels_json(data_dir, json_file):
    """Write a JSON file mapping image filenames to the class in their name.

    Filenames are expected to look like ``<prefix>_<class>_<index>.<ext>``,
    for example ``image_class1_01.jpg`` -> ``class1``. The class part may
    itself contain underscores (``image_big_cat_01.jpg`` -> ``big_cat``).
    Only ``.jpg`` and ``.png`` files are considered; files whose name does
    not have at least three underscore-separated parts are skipped because
    no class can be extracted from them.

    Args:
        data_dir: Directory that directly contains the image files.
        json_file: Path of the JSON file to write.
    """
    file_dict = {}

    # Sorted so the output is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(('.jpg', '.png')):  # extend with more formats as needed
            continue

        parts = filename.split('_')
        if len(parts) < 3:
            continue

        # Drop the leading prefix and the trailing "<index>.<ext>" part;
        # whatever remains in the middle is the class label.
        file_dict[filename] = "_".join(parts[1:-1])

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(file_dict, f, indent=4)

    print(f"Filenames with labels JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/dataset'  # path to the dataset root
json_output_file = 'filenames_with_labels.json'   # name of the output JSON file
create_filenames_with_labels_json(data_directory, json_output_file)

4. 多级目录

可以根据更细粒度的信息组织文件，如子类或属性。

dataset/

├── class1/

│ ├── subclass1/

│ │ ├── image1.jpg

│ │ ├── image2.jpg

│ ├── subclass2/

│ │ ├── image1.jpg

│ │ ├── image2.jpg

├── class2/

│ ├── subclass1/

│ ├── subclass2/

def create_multilevel_json(data_dir, json_file):
    """Write a JSON file describing a class/subclass directory hierarchy.

    The output maps each class directory under ``data_dir`` to a mapping of
    its subclass directories, and each subclass to the list of entry names
    found directly inside it. Non-directory entries at the class and
    subclass levels are ignored.

    Args:
        data_dir: Dataset root whose subdirectories are the classes.
        json_file: Path of the JSON file to write.
    """
    hierarchy = {}

    for top_entry in os.listdir(data_dir):
        top_path = os.path.join(data_dir, top_entry)
        if not os.path.isdir(top_path):
            continue

        # One entry per subclass directory, holding everything listed in it.
        hierarchy[top_entry] = {
            child: os.listdir(os.path.join(top_path, child))
            for child in os.listdir(top_path)
            if os.path.isdir(os.path.join(top_path, child))
        }

    with open(json_file, 'w') as out:
        json.dump(hierarchy, out, indent=4)

    print(f"Multilevel JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/dataset'  # path to the dataset root
json_output_file = 'multilevel.json'        # name of the output JSON file
create_multilevel_json(data_directory, json_output_file)

5. 图像和标签分开

图像文件和标签文件分别存储，标签可以是单独的文本文件或CSV文件。

dataset/

├── images/

│ ├── image1.jpg

│ ├── image2.jpg

├── labels/

│ ├── image1.txt 或 .csv

│ ├── image2.txt 或 .csv

def create_image_label_json(images_dir, labels_dir, json_file):
    """Write a JSON file pairing each image with its label file.

    For every ``.jpg``/``.png`` file in ``images_dir`` the label file is
    expected to be ``<image stem>.txt`` inside ``labels_dir``. The label
    filename is recorded only if that file exists; otherwise ``None`` is
    stored so that images without a label are visible in the index rather
    than pointing at a file that is not there.

    Args:
        images_dir: Directory containing the image files.
        labels_dir: Directory containing the ``.txt`` label files.
        json_file: Path of the JSON file to write.
    """
    dataset_dict = {}

    # Sorted so the output is reproducible; os.listdir order is arbitrary.
    for image_filename in sorted(os.listdir(images_dir)):
        if not image_filename.endswith(('.jpg', '.png')):  # extend with more formats as needed
            continue

        # Labels are assumed to be plain-text files named after the image.
        label_filename = os.path.splitext(image_filename)[0] + '.txt'
        label_found = os.path.isfile(os.path.join(labels_dir, label_filename))

        dataset_dict[image_filename] = {
            'label_file': label_filename if label_found else None
        }

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(dataset_dict, f, indent=4)

    print(f"Image-Label JSON file created at: {json_file}")

# Usage example
images_directory = '/path/to/your/images'  # path to the image folder
labels_directory = '/path/to/your/labels'   # path to the label folder
json_output_file = 'image_label.json'        # name of the output JSON file
create_image_label_json(images_directory, labels_directory, json_output_file)

6. 多模态数据

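同时包含图像、文本和其他类型的数据。（注意：本节目录结构之后给出的脚本 create_video_class_index_json 实际对应第 7 节的视频数据集；多模态数据集的脚本 create_multimodal_json 见第 11 节。）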

dataset/

├── images/

│ ├── image1.jpg

│ ├── image2.jpg

├── texts/

│ ├── image1.txt

│ ├── image2.txt

├── labels/

│ ├── image1_label.txt

│ ├── image2_label.txt

def create_video_class_index_json(data_dir, json_file):
    """Write a JSON file indexing video classes and their video files.

    Every immediate subdirectory of ``data_dir`` is treated as one class.
    Each class gets a stringified index that maps to its name and the list
    of ``.mp4``/``.avi`` files found directly inside it.

    Classes and video lists are sorted by name. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the index assigned to a
    class would not be reproducible.

    Args:
        data_dir: Dataset root whose subdirectories are the classes.
        json_file: Path of the JSON file to write.
    """
    class_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )

    video_dict = {}
    for class_index, class_name in enumerate(class_names):
        class_path = os.path.join(data_dir, class_name)
        video_dict[str(class_index)] = {
            'class_name': class_name,
            'videos': sorted(
                f for f in os.listdir(class_path)
                if f.endswith(('.mp4', '.avi'))  # extend with more formats as needed
            ),
        }

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(video_dict, f, indent=4)

    print(f"Video class index JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/video_dataset'  # path to the video dataset
json_output_file = 'video_class_index.json'      # name of the output JSON file
create_video_class_index_json(data_directory, json_output_file)

7. 视频数据集

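对视频数据的组织，通常按类别或视频编号。（注意：本节目录结构之后给出的脚本 create_time_series_json 实际对应第 10 节的时间序列数据；视频数据集的脚本 create_video_class_index_json 位于第 6 节的目录结构之后。）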

dataset/

├── class1/

│ ├── video1.mp4

│ ├── video2.mp4

├── class2/

│ ├── video1.mp4

│ ├── video2.mp4

def create_time_series_json(data_dir, json_file):
    """Write a JSON file listing the image files of each date folder.

    Every immediate subdirectory of ``data_dir`` is treated as one date
    folder and mapped to the ``.jpg``/``.png`` files directly inside it.

    Folders and files are emitted in sorted (lexicographic) order, which
    is chronological for ISO-style folder names such as ``2023-01-01``.
    ``os.listdir`` alone returns entries in arbitrary order, which would
    scramble the time ordering of the output.

    Args:
        data_dir: Dataset root whose subdirectories are the date folders.
        json_file: Path of the JSON file to write.
    """
    time_series_dict = {}

    for date_folder in sorted(os.listdir(data_dir)):
        date_path = os.path.join(data_dir, date_folder)
        if not os.path.isdir(date_path):
            continue

        time_series_dict[date_folder] = sorted(
            file_name for file_name in os.listdir(date_path)
            if file_name.endswith(('.jpg', '.png'))  # extend with more formats as needed
        )

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(time_series_dict, f, indent=4)

    print(f"Time series JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/time_series_dataset'  # path to the time-series dataset
json_output_file = 'time_series.json'                  # name of the output JSON file
create_time_series_json(data_directory, json_output_file)

8. 样本不均衡

在分类任务中，某些类别的样本数量远多于其他类别。

dataset/

├── class1/ # 有大量样本

│ ├── image1.jpg

│ ├── image2.jpg

│ ├── ...

├── class2/ # 有很少样本

│ ├── image1.jpg

def create_imbalanced_class_json(data_dir, json_file):
    """Write a JSON file listing the image files of each class.

    Every immediate subdirectory of ``data_dir`` is treated as one class
    and mapped to the ``.jpg``/``.png`` files directly inside it, so the
    length of each list gives the sample count of that class.

    Classes and files are sorted by name so the output is reproducible;
    ``os.listdir`` returns entries in arbitrary order.

    Args:
        data_dir: Dataset root whose subdirectories are the classes.
        json_file: Path of the JSON file to write.
    """
    class_dict = {}

    for class_name in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        class_dict[class_name] = sorted(
            file_name for file_name in os.listdir(class_path)
            if file_name.endswith(('.jpg', '.png'))  # extend with more formats as needed
        )

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(class_dict, f, indent=4)

    # The original message was misspelled "Iimbalanced".
    print(f"Imbalanced class JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/imbalanced_dataset'  # path to the imbalanced dataset
json_output_file = 'imbalanced_class.json'           # name of the output JSON file
create_imbalanced_class_json(data_directory, json_output_file)

9. 图像分割

对于图像分割任务，通常会将原始图像和对应的分割掩码分开存储。

dataset/

├── images/

│ ├── image1.jpg

│ ├── image2.jpg

├── masks/

│ ├── image1_mask.png

│ ├── image2_mask.png

def create_image_segmentation_json(images_dir, masks_dir, json_file):
    """Write a JSON file pairing each image with its segmentation mask.

    For every ``.jpg``/``.png`` file in ``images_dir`` the mask is expected
    to be ``<image stem>_mask.png`` inside ``masks_dir``. The mask filename
    is recorded only if that file exists; otherwise ``None`` is stored so
    that images without a mask are visible in the index rather than
    pointing at a file that is not there.

    Args:
        images_dir: Directory containing the image files.
        masks_dir: Directory containing the ``*_mask.png`` mask files.
        json_file: Path of the JSON file to write.
    """
    segmentation_dict = {}

    # Sorted so the output is reproducible; os.listdir order is arbitrary.
    for image_filename in sorted(os.listdir(images_dir)):
        if not image_filename.endswith(('.jpg', '.png')):  # extend with more formats as needed
            continue

        # Masks are assumed to be named after the image with a "_mask" suffix.
        mask_filename = os.path.splitext(image_filename)[0] + '_mask.png'
        mask_found = os.path.isfile(os.path.join(masks_dir, mask_filename))

        segmentation_dict[image_filename] = {
            'mask_file': mask_filename if mask_found else None
        }

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(segmentation_dict, f, indent=4)

    print(f"Image segmentation JSON file created at: {json_file}")

# Usage example
images_directory = '/path/to/your/images'  # path to the image folder
masks_directory = '/path/to/your/masks'      # path to the mask folder
json_output_file = 'image_segmentation.json'  # name of the output JSON file
create_image_segmentation_json(images_directory, masks_directory, json_output_file)

10. 时间序列数据

对于时间序列数据集，通常按时间戳组织。

dataset/

├── 2023-01-01/

│ ├── image1.jpg

│ ├── image2.jpg

├── 2023-01-02/

│ ├── image1.jpg

import os
import json

def create_time_series_json(data_dir, json_file):
    """Write a JSON file listing the image files of each date folder.

    Every immediate subdirectory of ``data_dir`` is treated as one date
    folder and mapped to the ``.jpg``/``.png`` files directly inside it.

    Folders and files are emitted in sorted (lexicographic) order, which
    is chronological for ISO-style folder names such as ``2023-01-01``.
    ``os.listdir`` alone returns entries in arbitrary order, which would
    scramble the time ordering of the output.

    Args:
        data_dir: Dataset root whose subdirectories are the date folders.
        json_file: Path of the JSON file to write.
    """
    time_series_dict = {}

    # Walk the date folders in sorted order.
    for date_folder in sorted(os.listdir(data_dir)):
        date_path = os.path.join(data_dir, date_folder)

        # Only directories count as date folders.
        if not os.path.isdir(date_path):
            continue

        time_series_dict[date_folder] = sorted(
            file_name for file_name in os.listdir(date_path)
            if file_name.endswith(('.jpg', '.png'))  # extend with more formats as needed
        )

    # Persist the mapping as JSON.
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(time_series_dict, f, indent=4)

    print(f"Time series JSON file created at: {json_file}")

# Usage example
data_directory = '/path/to/your/time_series_data'  # path to the time-series dataset
json_output_file = 'time_series.json'               # name of the output JSON file
create_time_series_json(data_directory, json_output_file)

11. 多模态数据集

def create_multimodal_json(images_dir, texts_dir, labels_dir, json_file):
    """Write a JSON file linking each image to its text and label files.

    For every ``.jpg``/``.png`` file in ``images_dir`` the companion files
    are looked up as ``<stem>.txt`` in ``texts_dir`` and
    ``<stem>_label.txt`` in ``labels_dir``. Each entry stores the joined
    path of a companion file when it exists and ``None`` when it does not.

    Args:
        images_dir: Directory containing the image files.
        texts_dir: Directory containing the per-image text files.
        labels_dir: Directory containing the per-image label files.
        json_file: Path of the JSON file to write.
    """
    def path_or_none(candidate):
        # Keep the path only when something actually exists at it.
        if os.path.exists(candidate):
            return candidate
        return None

    index = {}

    for entry in os.listdir(images_dir):
        if not entry.endswith(('.jpg', '.png')):  # extend with more formats as needed
            continue

        stem, _ = os.path.splitext(entry)
        index[entry] = {
            'text_file': path_or_none(os.path.join(texts_dir, stem + '.txt')),
            'label_file': path_or_none(os.path.join(labels_dir, stem + '_label.txt')),
        }

    with open(json_file, 'w') as out:
        json.dump(index, out, indent=4)

    print(f"Multimodal JSON file created at: {json_file}")

# Usage example
images_directory = '/path/to/your/images'  # path to the image folder
texts_directory = '/path/to/your/texts'     # path to the text folder
labels_directory = '/path/to/your/labels'    # path to the label folder
json_output_file = 'multimodal.json'         # name of the output JSON file
create_multimodal_json(images_directory, texts_directory, labels_directory, json_output_file)