Json转yolo格式并划分数据集

Heisenberg0071
已于 2024-07-23 15:09:07 修改
阅读量280
点赞数 4
文章标签： json YOLO python
于 2024-07-23 15:04:32 首次发布
本文链接：https://blog.csdn.net/qq_42394540/article/details/140636684
版权
import os
import json
import argparse
import colorama
import random
import shutil


# 将img和label放在同级目录下
# 新生成的txt文件在new label

def parse_args():
    parser = argparse.ArgumentParser(description="json(LabelMe) to txt(YOLOv8)")

    parser.add_argument("--new_dir", required=False, type=str, default=r"D:\Yolo\od\Json2Txt\data\new_label")
    parser.add_argument("--json_dir", required=False, type=str, default=r"D:\Yolo\od\Json2Txt\data\label")

    parser.add_argument("--labels", required=False, type=str,
                        default=r"D:\Yolo\od\Json2Txt\label.txt",
                        help="txt file that hold indexes and labels, one label per line, for example: face 0")
    parser.add_argument("--val_size", default=0.2, type=float,
                        help="the proportion of the validation set to the overall dataset:[0., 0.5]")
    parser.add_argument("--name", default='data1', required=False, type=str, help="the name of the dataset")
    args = parser.parse_args()
    return args


def get_labels_index(name):
    labels = {}  # key,value
    with open(name, "r") as file:
        for line in file:
            # print("line:", line)
            key_value = []
            for v in line.split(" "):
                # print("v:", v)
                key_value.append(v.replace("\n", ""))  # remove line breaks(\n) at the end of the line
            if len(key_value) != 2:
                print(colorama.Fore.RED + "Error: each line should have only two values(key value):", len(key_value))
                continue
            labels[key_value[0]] = key_value[1]
    with open(name, "r") as file:
        line_num = len(file.readlines())
    if line_num != len(labels):
        print(colorama.Fore.RED + "Error: there may be duplicate lables:", line_num, len(labels))
    return labels


def get_json_files(dir):
    jsons = []
    for x in os.listdir(dir):
        if x.endswith(".json"):
            jsons.append(x)
    return jsons


def parse_json(name):
    with open(name, "r") as file:
        data = json.load(file)
    if not data["labeled"]:
        return 0, 0, None
    width = data["size"]["width"]
    height = data["size"]["height"]
    # print(f"width: {width}; height: {height}")
    objects = []
    for output in data["outputs"]["object"]:
        object = []
        object.append(output["name"])
        object.append(output["bndbox"])
        objects.append(object)
    return width, height, objects  # [[], [], ..., []]


def get_box_width_height(box):
    dist = lambda val: max(val) - min(val)
    x = [pt[0] for pt in box]
    y = [pt[1] for pt in box]
    return min(x), min(y), dist(x), dist(y)


def bounding_box_normalization(width, height, objects, labels):
    # 精灵标注助手数据格式： xmin, ymin, xmax, ymax("xmin":968,"ymin":376,"xmax":1067,"ymax":622)
    # yolo数据格式: class [x_center, y_center, w, h] 边界框的中心坐标和宽度、高度，这些值都是相对于整张图片的比例，小于1。

    boxes = []
    for object in objects:
        box = []  # class x_center y_center width height
        box.append(labels[object[0]])  # label
        # print("point:", object[1])
        # x_min, y_min, box_w, box_h = get_box_width_height(object[1])
        locations = object[1]
        x_min, y_min, x_max, y_max = locations["xmin"], locations["ymin"], locations["xmax"], locations["ymax"]
        box_w, box_h = x_max - x_min, y_max - y_min
        box.append(round((float(x_min + box_w / 2.0) / width), 6))
        box.append(round((float(y_min + box_h / 2.0) / height), 6))
        box.append(round(float(box_w / width), 6))
        box.append(round(float(box_h / height), 6))
        boxes.append(box)
    return boxes


def write_to_txt(dir, json, width, height, objects, labels):
    boxes = bounding_box_normalization(width, height, objects, labels)
    # print("boxes:", boxes)
    name = json[:-len(".json")] + ".txt"
    # print("name:", name)
    with open(dir + "/" + name, "w") as file:
        for item in boxes:
            # print("item:", item)
            if len(item) != 5:
                print(colorama.Fore.RED + "Error: the length must be 5:", len(item))
                continue
            string = item[0] + " " + str(item[1]) + " " + str(item[2]) + " " + str(item[3]) + " " + str(item[4]) + "\r"
            file.write(string)


def json_to_txt(dir, jsons, labels):
    for json in jsons:  # list
        name = dir + "/" + json
        # print("name:", name)
        width, height, objects = parse_json(name)  # Any, Any, [[], [], ..., []]
        # print(f"width: {width}; height: {height}; objects: {objects}")
        if not objects:  # background
            image_name = json[:-len(".json")]
            with open(os.path.join(dir, image_name + ".txt"), 'w') as f:
                pass
            continue
        write_to_txt(dir, json, width, height, objects, labels)


def is_in_range(value, a, b):
    return a <= value <= b


def get_random_sequence(length, val_size):
    numbers = list(range(0, length))
    val_sequence = random.sample(numbers, int(length * val_size))
    # print("val_sequence:", val_sequence)
    train_sequence = [x for x in numbers if x not in val_sequence]
    # print("train_sequence:", train_sequence)
    return train_sequence, val_sequence


def get_files_number(dir):
    count = 0
    for file in os.listdir(dir):
        if os.path.isfile(os.path.join(dir, file)):
            count += 1
    return count


def split_train_val(dir, jsons, name, val_size):
    if is_in_range(val_size, 0., 0.5) is False:
        print(colorama.Fore.RED + "Error: the interval for val_size should be:[0., 0.5]:", val_size)
        raise
    dst_dir_images_train = "dataset/" + name + "/images/train"
    dst_dir_images_val = "dataset/" + name + "/images/val"
    dst_dir_labels_train = "dataset/" + name + "/labels/train"
    dst_dir_labels_val = "dataset/" + name + "/labels/val"
    try:
        os.makedirs(dst_dir_images_train)  # , exist_ok=True
        os.makedirs(dst_dir_images_val)
        os.makedirs(dst_dir_labels_train)
        os.makedirs(dst_dir_labels_val)
    except OSError as e:
        print(colorama.Fore.RED + "Error: cannot create directory:", e.strerror)
        raise
    # supported image formats
    img_formats = (".bmp", ".jpeg", ".jpg", ".png", ".webp")
    # print("jsons:", jsons)
    train_sequence, val_sequence = get_random_sequence(len(jsons), val_size)
    for index in train_sequence:
        for format in img_formats:
            file = dir + "/" + jsons[index][:-len(".json")] + format
            # print("file:", file)
            if os.path.isfile(file):
                shutil.copy(file, dst_dir_images_train)
                break
        file = dir + "/" + jsons[index][:-len(".json")] + ".txt"
        if os.path.isfile(file):
            shutil.copy(file, dst_dir_labels_train)
    for index in val_sequence:
        for format in img_formats:
            file = dir + "/" + jsons[index][:-len(".json")] + format
            if os.path.isfile(file):
                shutil.copy(file, dst_dir_images_val)
                break
        file = dir + "/" + jsons[index][:-len(".json")] + ".txt"
        if os.path.isfile(file):
            shutil.copy(file, dst_dir_labels_val)
    num_images_train = get_files_number(dst_dir_images_train)
    num_images_val = get_files_number(dst_dir_images_val)
    num_labels_train = get_files_number(dst_dir_labels_train)
    num_labels_val = get_files_number(dst_dir_labels_val)
    if num_images_train + num_images_val != len(jsons) or num_labels_train + num_labels_val != len(jsons):
        print(colorama.Fore.RED + "Error: the number of files is inconsistent:", num_images_train, num_images_val,
              num_labels_train, num_labels_val, len(jsons))
        raise


def generate_yaml_file(labels, name):
    path = os.path.join("dataset", name, name + ".yaml")
    # print("path:", path)
    with open(path, "w") as file:
        file.write("path: ../dataset/%s # dataset root dir\n" % name)
        file.write("train: images/train # train images (relative to 'path')\n")
        file.write("val: images/val  # val images (relative to 'path')\n")
        file.write("test: # test images (optional)\n\n")
        file.write("# Classes\n")
        file.write("names:\n")
        for key, value in labels.items():
            # print(f"key: {key}; value: {value}")
            file.write("  %d: %s\n" % (int(value), key))


def trans(label_dir, new_label_dir):
    # 确保目标文件夹存在，如果不存在则创建它
    if not os.path.exists(new_label_dir):
        os.makedirs(new_label_dir)
    # 遍历label文件夹中的所有文件
    for filename in os.listdir(label_dir):
        # 如果文件是以.txt结尾
        if filename.endswith('.txt'):
            # 构建源文件路径和目标文件路径
            src_path = os.path.join(label_dir, filename)
            dest_path = os.path.join(new_label_dir, filename)
            # 移动文件
            shutil.move(src_path, dest_path)


if __name__ == "__main__":
    colorama.init()
    args = parse_args()
    # 1. parse JSON file and write it to a TXT file
    labels = get_labels_index(args.labels)  # return dirt
    # print("labels:", labels)
    jsons = get_json_files(args.json_dir)  # return list
    # print("jsons:", jsons)
    json_to_txt(args.json_dir, jsons, labels)
    trans(args.json_dir, args.new_dir)  # 从label转移到new



    # 2. split the dataset
    # split_train_val(args.dir, jsons, args.name, args.val_size)
    # # 3. generate a YAML file
    # generate_yaml_file(labels, args.name)
    print(colorama.Fore.GREEN + "====== execution completed ======")

# command: python json2txt.py --dir dataset/GNGT/images_jsons --labels dataset/GNGT/label.txt --val_size 0.2 --name GNGT
# note: remove background images that do not contain the target
# https://www.jb51.net/python/321443ml2.htm