删除voc数据集中指定的标签信息,并拷贝到新的输出文件夹中

1、删除voc数据集中指定的标签信息,并拷贝到新的输出文件夹中

 本博客记录一个比较常用的功能:在标注数据集时可能会出现错误,例如标注者在输入标签时多输入了一个空格或者小数点,或者输入标签1时不小心多按了一个1(变成了11),又或者标注者理解错了需求,导致整个数据集中某一类标签全部标错。针对以上情况,可以用下面几种方式清洗数据。

2、筛查标注文件中所有的类别是否和场景所需要的标签一致

  假设现在的标签为allowed_labels = ['1', '2', '3', '0'],以下代码会找出不在设定范围内的标签值,输出含有错误标签的xml文件路径及对应的错误标签,并生成包含所有已出现标签的labels.txt标签文件。如果不清洗这些错误标签,训练时可能会出现RuntimeError: CUDA error报错(点我进入解决问题)。

import os
import xml.etree.ElementTree as ET

from tqdm import tqdm


def getClsTxt(xmlDir, cls_txt, allowed_labels):
    """
    Scan the VOC XML annotations in a directory, report every label that is
    not in the allowed list, and write all labels encountered to a text file.

    xmlDir        : XML directory path
    cls_txt       : Output cls file path (one label per line, sorted)
    allowed_labels: List of allowed labels

    Returns the set of all label strings found in the annotations.
    """

    # Kept local so the function also works when imported; it no longer
    # depends on a module-level set created by the script entry point.
    found_labels = set()
    invalid_label_paths = []  # (XML path, offending label) pairs

    for name in tqdm(os.listdir(xmlDir)):
        # Skip anything that is not an annotation file instead of failing to parse it.
        if not name.endswith('.xml'):
            continue

        xmlFile = os.path.join(xmlDir, name)
        # Parsing by path only needs read access to the annotation file.
        root = ET.parse(xmlFile).getroot()

        for obj in root.iter('object'):
            cls_element = obj.find('name')
            if cls_element is None:
                continue

            cls = cls_element.text
            # An empty <name/> has text None: it is reported as invalid below
            # but cannot be written to the label file as a line.
            if cls is not None:
                found_labels.add(cls)
            if cls not in allowed_labels:
                invalid_label_paths.append((xmlFile, cls))

    if invalid_label_paths:
        print("Invalid labels found in the following XML files:")
        for path, invalid_label in invalid_label_paths:
            print(f"{path}, Error category is: {invalid_label}")
    else:
        print("No invalid labels found.")

    # Sorted output keeps the label file identical from run to run.
    with open(cls_txt, "w", encoding='utf-8') as ft:
        ft.writelines(label + "\n" for label in sorted(found_labels))

    return found_labels


if __name__ == "__main__":
    # Module-level label set; kept because getClsTxt may accumulate the
    # labels it finds into a global of this name.
    set_cls = set()

    allowed_labels = ['1', '2', '3', '0']
    xmlDir = "output/VOC-1205/Annotations"
    cls_txt = "output/VOC-1205/labels.txt"

    getClsTxt(xmlDir=xmlDir, cls_txt=cls_txt, allowed_labels=allowed_labels)

3、删除完全标错的某一个标签,不改变正确的标签

  假设现在标签1和3是标错的(labels_to_remove = ['1', '3']),场景中本不应该有这两个标签,需要将其删除,代码如下:

import os
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm


def process_xml(xml_path, output_folder, labels_to_remove):
    """
    Write a copy of one VOC annotation file with the unwanted objects removed.

    xml_path        : Path of the source XML annotation
    output_folder   : Root of the output dataset; the result is written to its
                      'Annotations' sub-folder under the same file name
    labels_to_remove: Labels whose <object> entries are dropped

    Returns the path of the XML file that was written.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # findall() returns a list, so removing from root while looping is safe.
    for obj in root.findall('object'):
        name_element = obj.find('name')
        # An <object> without a <name> cannot match a label; keep it untouched.
        if name_element is None:
            continue
        if name_element.text in labels_to_remove:
            root.remove(obj)

    annotations_folder = os.path.join(output_folder, 'Annotations')
    # Create the target folder so the function also works when called on its own.
    os.makedirs(annotations_folder, exist_ok=True)

    new_xml_path = os.path.join(annotations_folder, os.path.basename(xml_path))
    # UTF-8 keeps non-ASCII text readable instead of turning it into
    # numeric character references.
    tree.write(new_xml_path, encoding='utf-8')
    return new_xml_path


def process_dataset(input_folder, output_folder, labels_to_remove):
    """
    Filter a whole VOC dataset: for every annotation that has a matching
    .jpg image, write the filtered XML and copy the image to the output.

    input_folder    : Dataset root containing 'Annotations' and 'JPEGImages'
    output_folder   : Output root; the same two sub-folders are created in it
    labels_to_remove: Labels passed on to process_xml for removal
    """
    src_ann_dir = os.path.join(input_folder, 'Annotations')
    src_img_dir = os.path.join(input_folder, 'JPEGImages')
    dst_img_dir = os.path.join(output_folder, 'JPEGImages')
    dst_ann_dir = os.path.join(output_folder, 'Annotations')

    for folder in (dst_img_dir, dst_ann_dir):
        os.makedirs(folder, exist_ok=True)

    annotation_names = []
    for entry in os.listdir(src_ann_dir):
        if entry.endswith('.xml'):
            annotation_names.append(entry)

    for annotation_name in tqdm(annotation_names, desc='Processing XML files'):
        stem, _ = os.path.splitext(annotation_name)
        annotation_path = os.path.join(src_ann_dir, annotation_name)
        image_path = os.path.join(src_img_dir, stem + '.jpg')

        # Only complete annotation/image pairs are carried over.
        if not (os.path.exists(annotation_path) and os.path.exists(image_path)):
            continue

        process_xml(annotation_path, output_folder, labels_to_remove)
        shutil.copy(image_path, dst_img_dir)


if __name__ == "__main__":
    # Labels that were annotated by mistake and must be stripped out.
    labels_to_remove = ['1', '3']

    input_folder = './output/VOC-1205'
    output_folder = './output/VOC-1206'

    process_dataset(
        input_folder=input_folder,
        output_folder=output_folder,
        labels_to_remove=labels_to_remove,
    )

4、删除.xml文件中没有一条bbox信息的文件以及图片

  经过上述第3节删除一些标签后,某些xml文件中可能已经没有任何bbox框,此时可以将该xml文件及其对应的图片一并剔除(即不再拷贝到输出文件夹),代码如下:

import os
import shutil
import xml.etree.ElementTree as ET


def copy_non_empty_bndbox_files(input_folder, output_folder):
    """
    Copy every image/XML pair whose annotation still has a bounding box.

    input_folder : Folder to scan. Either a flat folder holding the .xml files
                   next to their .jpg images, or a dataset root with
                   'Annotations' and 'JPEGImages' sub-folders. The sub-folder
                   layout is used only when the folder itself has no .xml files.
    output_folder: Output root; pairs are copied into its 'JPEGImages' and
                   'Annotations' sub-folders.

    A pair is skipped when the XML has no <bndbox> with child elements or
    when the matching .jpg does not exist.
    """
    # Create the output folders
    output_images_folder = os.path.join(output_folder, 'JPEGImages')
    output_annotations_folder = os.path.join(output_folder, 'Annotations')

    os.makedirs(output_images_folder, exist_ok=True)
    os.makedirs(output_annotations_folder, exist_ok=True)

    # Default: annotations and images live side by side in input_folder.
    xml_folder = input_folder
    image_folder = input_folder
    xml_names = [f for f in os.listdir(input_folder) if f.endswith('.xml')]

    # Fallback: a dataset root split into Annotations/ and JPEGImages/.
    voc_annotations = os.path.join(input_folder, 'Annotations')
    if not xml_names and os.path.isdir(voc_annotations):
        xml_folder = voc_annotations
        image_folder = os.path.join(input_folder, 'JPEGImages')
        xml_names = [f for f in os.listdir(xml_folder) if f.endswith('.xml')]

    for filename in xml_names:
        xml_path = os.path.join(xml_folder, filename)
        image_filename = os.path.splitext(filename)[0] + '.jpg'
        image_path = os.path.join(image_folder, image_filename)

        # Skip incomplete pairs instead of aborting the whole run.
        if not os.path.isfile(image_path):
            continue

        # Parse the XML file
        root = ET.parse(xml_path).getroot()

        # Look for a bndbox that actually has child elements. The explicit
        # child count avoids truth-testing Element objects, which is deprecated.
        has_box = any(len(bndbox) > 0 for bndbox in root.findall('.//bndbox'))
        if has_box:
            # Copy the image file
            shutil.copy(image_path, os.path.join(output_images_folder, image_filename))
            # Copy the XML file
            shutil.copy(xml_path, os.path.join(output_annotations_folder, filename))


if __name__ == "__main__":
    # Source dataset to scan and destination root for the kept pairs.
    output_folder = './output/VOC-1206'
    input_folder = './origin_dataset/VOC'

    copy_non_empty_bndbox_files(
        input_folder=input_folder,
        output_folder=output_folder,
    )

5、划分voc数据集以及转换为coco数据集格式

划分voc数据集以及转换为coco数据集格式

  • 9
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

锦鲤AI幸运

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值