有关数据集处理方面的实用“工具”-CSDN博客

本文链接：https://blog.csdn.net/weixin_49176452/article/details/147470374

1.划分数据集

import os
import random
from shutil import copy2


# 步骤一未作改动，需要带入自己的文件路径file_path 、new_file_path
def splitimg():
    # 图片文件夹路径
    file_path = r"/mnt/e/datasets/Water-FOD/images"
    # 新文件存放路径
    new_file_path = r"/mnt/e/datasets/Water_FOD/images"
    # 划分数据比例6:2:2
    split_rate = [0.6, 0.2, 0.2]
    class_names = os.listdir(file_path)
    # 目标文件夹下创建文件夹
    split_names = ['train', 'val', 'test']
    print(class_names)  # ['00000.jpg', '00001.jpg', '00002.jpg'... ]
    current_all_data = os.listdir(file_path)

    # 判断是否存在目标文件夹，不存在则创建---->创建train\val\test文件夹
    if os.path.isdir(new_file_path):
        pass
    else:
        os.makedirs(new_file_path)

    for split_name in split_names:
        split_path = os.path.join(new_file_path, split_name)
        # D:/Code/Data/GREENTdata/train, val, test
        if os.path.isdir(split_path):
            pass
        else:
            os.makedirs(split_path)

        # 按照比例划分数据集，并进行数据图片的复制
        for class_name in class_names:
            current_data_path = file_path  # D:/Code/Data/centerlinedata/tem_voc/JPEGImages/
            current_data_length = len(class_names)  # 文件夹下的图片个数
            current_data_index_list = list(range(current_data_length))
            random.shuffle(current_data_index_list)

            train_stop_flag = current_data_length * split_rate[0]
            val_stop_flag = current_data_length * (split_rate[0] + split_rate[1])

    current_idx = 0
    train_num = 0
    val_num = 0
    test_num = 0
    # 图片复制到文件夹中
    for i in current_data_index_list:
        src_img_path = os.path.join(current_data_path, current_all_data[i])
        if current_idx <= train_stop_flag:
            newpath = os.path.join(os.path.join(new_file_path, 'train'), current_all_data[i])
            os.rename(src_img_path, newpath)
            train_num += 1
        elif (current_idx > train_stop_flag) and (current_idx <= val_stop_flag):
            newpath = os.path.join(os.path.join(new_file_path, 'val'), current_all_data[i])
            os.rename(src_img_path, newpath)
            # copy2(src_img_path, newpath)
            val_num += 1
        else:
            newpath = os.path.join(os.path.join(new_file_path, 'test'), current_all_data[i])
            os.rename(src_img_path, newpath)
            # copy2(src_img_path, newpath)
            test_num += 1
        current_idx += 1
    print("Done!", train_num, val_num, test_num)


# 步骤二升级，仅需要带入文件路径file_path 、new_file_path 、 match_file_paths
def split_label_img():
    # 标签文件夹所在路径
    file_path = r"/mnt/e/datasets/Water-FOD/annotations/xml"
    # 新文件路径
    new_file_path = r"/mnt/e/datasets/Water_FOD/annotations"
    # 匹配对应划分过的img图片路径
    match_file_paths = r"/mnt/e/datasets/Water_FOD/images"

    # class_names = os.listdir(file_path)
    match_names = os.listdir(match_file_paths)  # 获取文件名称['train', 'val', 'test']

    # 判断是否存在目标文件夹，不存在则创建---->创建train_label\val_label\test_label文件夹
    if os.path.isdir(new_file_path):
        pass
    else:
        os.makedirs(new_file_path)
    for match_name in match_names:
        split_path = os.path.join(new_file_path,
                                  match_name + '_labels')  # 目标文件夹下创建文件夹 ['train_labels', 'val_labels', 'test_labels']
        # print(split_path)   # D:\Code\Data\UAS_Dataset\Rain\Rain_txt\ +[train_label,  val_label,  test_label]
        if os.path.isdir(split_path):
            pass
        else:
            os.makedirs(split_path)
        match_img = os.path.join(match_file_paths,
                                 match_name)  # img文件夹的路径，如D:\Code\Data\UAS_Dataset\Rain\Rain_img\train
        imgs = os.listdir(match_img)  # img图片名称
        for img in imgs:
            firstname = img[0:-4]  # img图片名称（不带后缀）
            txtname = firstname + '.xml'  # 图片对应的label名称
            txtpath = os.path.join(file_path, txtname)  # 图片对应label的位置
            newpath = os.path.join(split_path, txtname)
            os.rename(txtpath, newpath)

    # 按照比例划分数据集，并进行数据图片的复制
    # for class_name in class_names:
    #     transF = os.path.splitext(class_name)
    #     class_num = transF[0]
    #     for match_name in match_names:
    #         transF2 = os.path.splitext(match_name)
    #         match_num = transF2[0]
    #         if match_num == class_num:
    #             src_img_path = os.path.join(file_path, class_name)
    #             copy2(src_img_path, split_path)

    print("Done!")


if __name__ == '__main__':
    splitimg()
    split_label_img()

主要的改动有：
标注文件格式改为自己需要划分的后缀名。

2.随机删除指定数量的图片和对应的标注，做之前记得备份原数据集，避免无法找回。

import os
import random
import argparse


def delete_files(num, img_dir, xml_dir):
    # 获取所有共同的基文件名
    img_bases = set()
    for f in os.listdir(img_dir):
        if f.endswith('.jpg'):
            base = os.path.splitext(f)[0]
            img_bases.add(base)

    xml_bases = set()
    for f in os.listdir(xml_dir):
        if f.endswith('.xml'):
            base = os.path.splitext(f)[0]
            xml_bases.add(base)

    common_bases = img_bases & xml_bases

    # 检查可删除数量
    if len(common_bases) < num:
        print(f"错误: 仅有 {len(common_bases)} 对文件存在，无法删除 {num} 对")
        return

    # 随机选择要删除的文件
    to_delete = random.sample(common_bases, num)

    # 确认操作
    proceed = input(f"即将删除 {num} 对文件，确认吗？(y/n): ")
    if proceed.lower() != 'y':
        print("操作已取消")
        return

    # 执行删除
    deleted_count = 0
    for base in to_delete:
        img_path = os.path.join(img_dir, f"{base}.jpg")
        xml_path = os.path.join(xml_dir, f"{base}.xml")

        try:
            os.remove(img_path)
            os.remove(xml_path)
            deleted_count += 1
        except Exception as e:
            print(f"删除 {base} 失败: {str(e)}")

    print(f"成功删除 {deleted_count}/{num} 对文件")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="随机删除指定数量的图片和XML标注对")
    parser.add_argument('--num', type=int, default=2,required=True, help='要删除的文件对数量')
    parser.add_argument('--img_dir', default='/mnt/e/datasets/SODA-D/Images/Images', help='图片目录路径（默认：images）')
    parser.add_argument('--xml_dir', default='/mnt/e/datasets/SODA-D/lables/val', help='XML目录路径（默认：annotations）')

    args = parser.parse_args()

    # 验证目录存在
    if not os.path.isdir(args.img_dir):
        print(f"图片目录不存在: {args.img_dir}")
        exit(1)
    if not os.path.isdir(args.xml_dir):
        print(f"XML目录不存在: {args.xml_dir}")
        exit(1)

    delete_files(args.num, args.img_dir, args.xml_dir)

代码中改为自己数据集的图片格式和标注文件格式。

--------------------------------------------------------------------------------------------------------------------------------

通用目标检测格式

COCO格式：

结构：使用单个JSON文件存储所有图像的标注信息，包括类别、边界框（bbox）、分割掩码等。

适用场景：适用于大规模目标检测和实例分割任务，支持多标签和复杂标注。

VOC格式

结构：每张图像对应一个XML文件，包含物体类别、边界框（左上右下坐标）及图像元数据

适用场景：传统目标检测任务，兼容性强

YOLO格式

结构：每个图像对应一个TXT文件，标注信息为归一化的中心坐标及宽高（例如 类别 x_center y_center width height）

特点：轻量化，适合实时检测模型训练

DOTA格式

结构：每张图像对应一个TXT文件，支持旋转框（OBB）标注，格式为多边形顶点坐标或旋转框参数

适用场景：航空图像或复杂场景下的倾斜目标检测

--------------------------------------------------------------------------------------------------------------------------------

3.json转xml

适用场景：适用于大规模目标检测和实例分割任务，支持多标签和复杂标注ml

import json
import xml.etree.ElementTree as ET
from xml.dom import minidom
import os


def json_to_voc_xml(json_path, output_dir):
    # 创建输出目录
    os.makedirs(output_dir, exist_ok=True)

    # 加载JSON数据
    with open(json_path, 'r') as f:
        data = json.load(f)

    # 创建映射关系
    images = {img["id"]: img for img in data["images"]}
    categories = {cat["id"]: cat["name"] for cat in data["categories"]}

    # 按图像ID分组标注
    annotations = {}
    for ann in data["annotations"]:
        img_id = ann["image_id"]
        if img_id not in annotations:
            annotations[img_id] = []
        annotations[img_id].append(ann)

    # 处理每个图像
    for img_id, img_info in images.items():
        # 创建XML根节点
        root = ET.Element("annotation")

        # 添加基本信息
        ET.SubElement(root, "folder").text = "images"
        ET.SubElement(root, "filename").text = img_info["file_name"]
        ET.SubElement(root, "path").text = ""  # 可根据需要填写实际路径

        # 图像尺寸
        size = ET.SubElement(root, "size")
        ET.SubElement(size, "width").text = str(img_info["width"])
        ET.SubElement(size, "height").text = str(img_info["height"])
        ET.SubElement(size, "depth").text = "3"  # 假设为RGB图像

        # 添加每个对象的标注
        for ann in annotations.get(img_id, []):
            obj = ET.SubElement(root, "object")
            ET.SubElement(obj, "name").text = categories[ann["category_id"]]
            ET.SubElement(obj, "pose").text = "Unspecified"
            ET.SubElement(obj, "truncated").text = "0"
            ET.SubElement(obj, "difficult").text = "0"

            # 转换bbox格式 (x,y,w,h -> xmin,ymin,xmax,ymax)
            bbox = ann["bbox"]
            xmin = bbox[0]
            ymin = bbox[1]
            xmax = bbox[0] + bbox[2]
            ymax = bbox[1] + bbox[3]

            bndbox = ET.SubElement(obj, "bndbox")
            ET.SubElement(bndbox, "xmin").text = str(int(xmin))
            ET.SubElement(bndbox, "ymin").text = str(int(ymin))
            ET.SubElement(bndbox, "xmax").text = str(int(xmax))
            ET.SubElement(bndbox, "ymax").text = str(int(ymax))

        # 格式化输出XML
        xml_str = ET.tostring(root, encoding="utf-8")
        dom = minidom.parseString(xml_str)
        pretty_xml = dom.toprettyxml(indent="  ", encoding="utf-8").decode("utf-8")

        # 保存XML文件
        xml_filename = os.path.splitext(img_info["file_name"])[0] + ".xml"
        output_path = os.path.join(output_dir, xml_filename)
        with open(output_path, "w") as f:
            f.write(pretty_xml)


if __name__ == "__main__":
    # 使用示例 - 根据需要修改路径
    json_to_voc_xml(
        json_path="/mnt/e/datasets/SODA-D/Annotations/val.json",  # 输入JSON文件路径
        output_dir="/mnt/e/datasets/SODA-D/val"  # 输出目录路径
    )

4.根据图像文件移动对应的XML文件

import os
import shutil

# 定义数据集路径
dataset_path = "/mnt/e/datasets/water_trash"
images_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash/images")
annotations_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash/Annotations")
train_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/images/train")
val_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/images/val")
test_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/images/test")
train_anno_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/Annotations/train")
val_anno_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/Annotations/val")
test_anno_path = os.path.join(dataset_path, "/mnt/e/datasets/water_trash1/Annotations/test")

# 创建目标目录
os.makedirs(train_anno_path, exist_ok=True)
os.makedirs(val_anno_path, exist_ok=True)
os.makedirs(test_anno_path, exist_ok=True)

def move_annotations(image_folder, target_annotation_folder):
    """根据图像文件移动对应的XML标注文件"""
    for img_file in os.listdir(image_folder):
        if img_file.endswith(".jpg") or img_file.endswith(".png"):
            xml_file = os.path.splitext(img_file)[0] + ".xml"
            xml_source_path = os.path.join(annotations_path, xml_file)
            xml_target_path = os.path.join(target_annotation_folder, xml_file)
            if os.path.exists(xml_source_path):
                shutil.move(xml_source_path, xml_target_path)
            else:
                print(f"Warning: Annotation file {xml_file} not found!")

# 进行XML文件的划分
move_annotations(train_path, train_anno_path)
move_annotations(val_path, val_anno_path)
move_annotations(test_path, test_anno_path)

print("XML文件划分完成！")

5.xml转json

from tqdm import tqdm
import os
import json
import xml.etree.ElementTree as ET


def read_xml(xml_root):
    '''
    :param xml_root: .xml文件
    :return: dict('cat':['cat1',...],'bboxes':[[x1,y1,x2,y2],...],'whd':[w ,h,d])
    '''
    dict_info = {'cat': [], 'bboxes': [], 'box_wh': [], 'img_whd': []}
    if os.path.splitext(xml_root)[-1] == '.xml':
        tree = ET.parse(xml_root)  # ET是一个xml文件解析库，ET.parse（）打开xml文件。parse--"解析"
        root = tree.getroot()  # 获取根节点
        whd = root.find('size')
        whd = [whd.find('width').text, whd.find('height').text, whd.find('depth').text]
        dict_info['img_whd'] = whd
        for obj in root.findall('object'):  # 找到根节点下所有“object”节点
            cat = str(obj.find('name').text)  # 找到object节点下name子节点的值（字符串）
            bbox = obj.find('bndbox')
            x1, y1, x2, y2 = [int(bbox.find('xmin').text),
                              int(bbox.find('ymin').text),
                              int(bbox.find('xmax').text),
                              int(bbox.find('ymax').text)]
            b_w = x2 - x1 + 1
            b_h = y2 - y1 + 1

            dict_info['cat'].append(cat)
            dict_info['bboxes'].append([x1, y1, x2, y2])
            dict_info['box_wh'].append([b_w, b_h])

    else:
        pass
        # print('[inexistence]:{} suffix is not xml '.format(xml_root))
    return dict_info


def get_path_name(file_path, format='.jpg'):
    obj_path_lst = [os.path.join(root, file) for root, _, files in os.walk(file_path) for file in files if
                    file.endswith(format)]

    obj_name_lst = [os.path.basename(p) for p in obj_path_lst]

    return obj_path_lst, obj_name_lst


def xml2cocojson(xml_root, out_dir=None, assign_label=None, json_name=None, img_root=None):
    '''
    :param xml_root: xml文件所在路径，可以总路径
    :param out_dir:json文件保存地址
    :param assign_label: 提供训练列表，如['pedes', 'bus']，若为None则从xml中搜寻并自动给出
    :param json_name:保存json文件的名字
    :param img_root: 和xml_root格式一样，提供图片路径，用于获取高与宽
    :return:返回coco json 格式
    '''
    xml_root_lst, xml_names_lst = get_path_name(xml_root, format='.xml')

    json_name = json_name if json_name is not None else 'coco_data_format.json'
    out_dir = out_dir if out_dir else 'out_dir'
    os.makedirs(out_dir, exist_ok=True)
    out_dir_json = os.path.join(out_dir, json_name)

    # 若提供img_root获得路径与名称
    img_root_lst, img_name_lst = get_path_name(xml_root, format='.jpg') if img_root else None, None

    json_dict = {"images": [], "type": "instances", "annotations": [], "categories": []}

    image_id = 10000000
    anation_id = 10000000
    label_lst = assign_label if assign_label else []

    info = {'vaild_img': 0, 'invaild_img': 0}
    for i, xml_path in tqdm(enumerate(xml_root_lst)):
        xml_info = read_xml(xml_path)
        cat_lst = xml_info['cat']  # 类别是数字，从0 1 2 等
        img_w, img_h = int(xml_info['img_whd'][0]), int(xml_info['img_whd'][1])
        img_name = xml_names_lst[i][:-3] + 'jpg'

        if img_name_lst:  # 从图像中获取图像尺寸，高与宽
            import cv2
            j = list(img_name_lst).index(img_name)
            img_name = img_name_lst[j]
            img = cv2.imread(img_root_lst[int(j)])
            img_w, img_h = img.shape[:2]
        if len(cat_lst) < 1: continue

        image_id += 1
        image = {'file_name': img_name, 'height': img_h, 'width': img_w, 'id': image_id}

        boxes_lst = xml_info['bboxes']
        for j, cat in tqdm(enumerate(cat_lst)):

            if not assign_label:  # 未指定，添加类
                if cat not in label_lst:
                    label_lst.append(cat)

            b = boxes_lst[j]
            obj_width, obj_height = b[2] - b[0], b[3] - b[1]
            xmin, ymin = b[0], b[1]

            category_id = int(label_lst.index(cat) + 1)  # 我使用类别数字从1开始，满足coco格式，当然也可以从0开始

            if image not in json_dict['images']:
                json_dict['images'].append(image)  # 将图像信息添加到json中

            anation_id = anation_id + 1

            ann = {'area': obj_width * obj_height, 'iscrowd': 0, 'image_id': image_id,
                   'bbox': [xmin, ymin, obj_width, obj_height],
                   'category_id': category_id, 'id': anation_id, 'ignore': 0,
                   'segmentation': []}
            json_dict['annotations'].append(ann)

    for cid, cate in enumerate(label_lst):  # 我这里使用1开始的，当然也可以使用0开始
        cat = {'supercategory': 'FWW', 'id': cid + 1, 'name': cate}
        json_dict['categories'].append(cat)

    with open(out_dir_json, 'w') as f:
        json.dump(json_dict, f, indent=4)  # indent表示间隔长度

    print('saving json path:{}\n info:{}\ncategory list: {}'.format(out_dir_json, info, label_lst))


if __name__ == '__main__':
    root = '/mnt/e/datasets/SODA-D/lables/test'
    cat_lst = None
    xml2cocojson(root, assign_label=cat_lst)

6.xml转txt（yolo）

# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET
import os, cv2
import numpy as np


classes = []


def convert(size, box):
    dw = 1. / (size[0])
    dh = 1. / (size[1])
    x = (box[0] + box[1]) / 2.0 - 1
    y = (box[2] + box[3]) / 2.0 - 1
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return (x, y, w, h)


def convert_annotation(xmlpath, xmlname):
    with open(xmlpath, "r", encoding='utf-8') as in_file:
        txtname = xmlname[:-4] + '.txt'
        txtfile = os.path.join(txtpath, txtname)
        tree = ET.parse(in_file)
        root = tree.getroot()
        filename = root.find('filename')
        img = cv2.imdecode(np.fromfile('{}/{}.{}'.format(imgpath, xmlname[:-4], postfix), np.uint8), cv2.IMREAD_COLOR)
        h, w = img.shape[:2]
        res = []
        for obj in root.iter('object'):
            cls = obj.find('name').text
            if cls not in classes:
                classes.append(cls)
            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text),
                 float(xmlbox.find('ymax').text))
            bb = convert((w, h), b)
            res.append(str(cls_id) + " " + " ".join([str(a) for a in bb]))
        if len(res) != 0:
            with open(txtfile, 'w+') as f:
                f.write('\n'.join(res))


if __name__ == "__main__":
    postfix = 'jpg'  # 图像后缀
    imgpath = r'datasets/COCO2017-mini/images/val2017'  # 图像文件路径
    xmlpath = r'datasets/COCO2017-mini/annotations/val2017'  # xml文件文件路径
    txtpath = r'datasets/COCO2017-mini/annotations/val_txt'  # 生成的txt文件路径

    if not os.path.exists(txtpath):
        os.makedirs(txtpath, exist_ok=True)

    list = os.listdir(xmlpath)
    error_file_list = []
    for i in range(0, len(list)):
        try:
            path = os.path.join(xmlpath, list[i])
            if ('.xml' in path) or ('.XML' in path):
                convert_annotation(path, list[i])
                print(f'file {list[i]} convert success.')
            else:
                print(f'file {list[i]} is not xml format.')
        except Exception as e:
            print(f'file {list[i]} convert error.')
            print(f'error message:\n{e}')
            error_file_list.append(list[i])
    print(f'this file convert failure\n{error_file_list}')
    print(f'Dataset Classes:{classes}')