json_to_dataset 详解（写成注释的形式）（添加了可以过滤掉无效 json 文件的代码）

Holy_cow

已于 2023-09-28 19:11:49 修改

阅读量440

点赞数

文章标签： json python windows pytorch 深度学习

于 2023-09-28 10:20:29 首次发布

本文链接：https://blog.csdn.net/Holy_cow/article/details/133377288

版权

一、包含过滤掉无用json文件的代码：

def filter_json_files(json_dir):
    """
    Split the JSON files of a directory into valid and invalid ones.

    A file is valid when it can be parsed as UTF-8 encoded JSON, has a
    ``shapes`` entry, and every shape in it has at least 3 points. Files that
    cannot be parsed, or that lack ``shapes`` / ``points``, are reported as
    invalid instead of aborting the whole scan.

    Args:
        json_dir (str): Path to the directory containing the JSON files.

    Returns:
        tuple: A tuple containing two lists:
            - valid_json_files (list): A list of valid JSON file paths.
            - invalid_json_files (list): A list of invalid JSON file paths.
    """
    valid_json_files = []
    invalid_json_files = []
    # Sort the listing so the result order does not depend on the file system.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.json'):
            continue
        json_path = os.path.join(json_dir, filename)
        try:
            # Decode explicitly as UTF-8 so non-ASCII (e.g. Chinese) labels are
            # read the same way regardless of the platform default encoding.
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            is_valid = all(len(shape['points']) >= 3 for shape in data['shapes'])
        except (ValueError, KeyError, TypeError):
            # ValueError covers malformed JSON and undecodable bytes;
            # KeyError/TypeError cover an unexpected document structure.
            is_valid = False
        if is_valid:
            valid_json_files.append(json_path)
        else:
            invalid_json_files.append(json_path)
    return valid_json_files, invalid_json_files

二、全部代码：

import base64  # base64模块提供了大量函数用来把二进制数据编码为可打印的ASCII字符，以及将其解码为二进制数据
import json  # 主要用于将python对象编码为json格式输出或存储，以及将json格式对象解码为python对象。
import os  # os就是“operating system”的缩写，顾名思义，os模块提供的就是各种 Python 程序与操作系统进行交互的接口。
import os.path as osp

import numpy as np
import PIL.Image
from labelme import utils

'''
制作自己的语义分割数据集需要注意以下几点：
1、我使用的labelme版本是3.16.7，建议使用该版本的labelme，有些版本的labelme会发生错误，
   具体错误为：Too many dimensions: 3 > 2
   安装方式为命令行pip install labelme==3.16.7
2、此处生成的标签图是8位彩色图，与视频中看起来的数据集格式不太一样。
   虽然看起来是彩图，但事实上只有8位，此时每个像素点的值就是这个像素点所属的种类。
   所以其实和视频中VOC数据集的格式一样。因此这样制作出来的数据集是可以正常使用的。也是正常的。（引用的bubbliiiing的代码）
'''


def filter_json_files(json_dir):
    """
    Split the JSON files of a directory into valid and invalid ones.

    A file is valid when it can be parsed as UTF-8 encoded JSON, has a
    ``shapes`` entry, and every shape in it has at least 3 points. Files that
    cannot be parsed, or that lack ``shapes`` / ``points``, are reported as
    invalid instead of aborting the whole scan.

    Args:
        json_dir (str): Path to the directory containing the JSON files.

    Returns:
        tuple: A tuple containing two lists:
            - valid_json_files (list): A list of valid JSON file paths.
            - invalid_json_files (list): A list of invalid JSON file paths.
    """
    valid_json_files = []
    invalid_json_files = []
    # Sort the listing so the result order does not depend on the file system.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith('.json'):
            continue
        json_path = os.path.join(json_dir, filename)
        try:
            # Decode explicitly as UTF-8 so non-ASCII (e.g. Chinese) labels are
            # read the same way regardless of the platform default encoding.
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            is_valid = all(len(shape['points']) >= 3 for shape in data['shapes'])
        except (ValueError, KeyError, TypeError):
            # ValueError covers malformed JSON and undecodable bytes;
            # KeyError/TypeError cover an unexpected document structure.
            is_valid = False
        if is_valid:
            valid_json_files.append(json_path)
        else:
            invalid_json_files.append(json_path)
    return valid_json_files, invalid_json_files


if __name__ == '__main__':
    # Convert every valid labelme JSON file in `json_dir` into a JPEG image
    # plus a palette PNG label image whose pixel values are indices in `classes`.
    jpgs_path = "datasets/JPEGImages"  # output directory for the original images
    pngs_path = "datasets/SegmentationClass"  # output directory for the label images
    classes = ["_background_", "3c"]
    # Example with several classes: classes = ["_background_", "cat", "dog"]

    # Saving fails when the target directory is missing, so create both up front.
    os.makedirs(jpgs_path, exist_ok=True)
    os.makedirs(pngs_path, exist_ok=True)

    # Filter out invalid JSON files
    json_dir = './datasets/before/'
    valid_json_files, invalid_json_files = filter_json_files(json_dir)

    # Print valid and invalid JSON files
    print(f'Valid JSON files: {valid_json_files}')
    print(f'Invalid JSON files: {invalid_json_files}')

    # Convert only the files that passed the filter; files with degenerate
    # shapes are skipped instead of aborting the whole run.
    for json_path in valid_json_files:
        # splitext keeps dots that are part of the name (e.g. "clip.v2.json" -> "clip.v2").
        stem = osp.splitext(osp.basename(json_path))[0]

        # UTF-8 is required when the labels were written in Chinese.
        with open(json_path, encoding='UTF-8') as f:
            data = json.load(f)

        if data.get('imageData'):
            # The image is embedded in the JSON file as base64 text.
            imageData = data['imageData']
        else:
            # Otherwise load the image referenced by 'imagePath', resolved
            # relative to the directory of the JSON file, and base64-encode it.
            imagePath = osp.join(osp.dirname(json_path), data['imagePath'])
            with open(imagePath, 'rb') as f:
                imageData = base64.b64encode(f.read()).decode('utf-8')

        # Decode the base64 image data into an array.
        img = utils.img_b64_to_arr(imageData)

        # Number the labels in order of first appearance; background is always 0.
        label_name_to_value = {'_background_': 0}
        for shape in data['shapes']:
            label_name = shape['label']
            if label_name not in label_name_to_value:
                label_name_to_value[label_name] = len(label_name_to_value)

        # label_values must be dense
        label_values, label_names = [], []
        for ln, lv in sorted(label_name_to_value.items(), key=lambda x: x[1]):
            label_values.append(lv)
            label_names.append(ln)
        assert label_values == list(range(len(label_values)))

        # Rasterize the shapes into a mask with the same height/width as img;
        # each pixel holds the per-file label value from label_name_to_value.
        lbl = utils.shapes_to_label(img.shape, data['shapes'], label_name_to_value)

        # Save the original image.
        PIL.Image.fromarray(img).save(osp.join(jpgs_path, stem + '.jpg'))

        # Remap the per-file label values (which depend on the order in which
        # labels appear in this JSON file) to their indices in `classes`, so
        # every label image uses the same pixel value for the same class.
        new = np.zeros([np.shape(img)[0], np.shape(img)[1]])
        for index_json, name in enumerate(label_names):
            if name not in classes:
                raise ValueError(
                    f"{json_path}: label {name!r} is not listed in classes {classes}"
                )
            index_all = classes.index(name)
            new = new + index_all * (np.array(lbl) == index_json)

        # Save the label image.
        utils.lblsave(osp.join(pngs_path, stem + '.png'), new)
        print('Saved ' + stem + '.jpg and ' + stem + '.png')

三、检测标签图的像素值

在上述代码中，生成标签图时，标签图的像素值对应于该类别在 classes 中的索引。例如做二分类时，背景 `_background_` 的索引为 0，目标类别的索引为 1，那么目标类别在标签图中的像素值就是 1，背景的像素值是 0。可以运行如下代码进行检测：

from PIL import Image
import numpy as np

# Path of the label image to inspect; replace it with your own label image.
label_path = 'VOCdevkit/VOC2007/SegmentationClass/video(1).png'
# Collect the distinct pixel values that occur in the label image and print them.
pixel_values = np.unique(np.array(Image.open(label_path)))
print(pixel_values)