目录结构
|-- data
    |-- 1.xml
    |-- 1.jpg
    ...
    |-- n.xml
    |-- n.jpg
'''
Author: 悠悠青青.
Date: 2023-10-07 15:50:27
LastEditors: Please set LastEditors
LastEditTime: 2023-10-07 19:56:39
'''
import os
import xml.etree.ElementTree as ET
def need_to_remove(classes, xml_path):
    """Prune unwanted objects from an annotation file and report if it is now useless.

    Every ``<object>`` element directly under the XML root whose ``<name>``
    text is not in ``classes`` is dropped. An object that has no ``<name>``
    child is treated as unwanted instead of aborting the whole file. The file
    is rewritten in place only when at least one wanted object remains and at
    least one object was actually removed.

    Args:
        classes: Container of class names to keep (tested with ``in``).
        xml_path: Path of the annotation XML file to inspect and prune.

    Returns:
        True when the file holds no object of a wanted class, or when it could
        not be read or parsed; False otherwise.
    """
    try:
        tree = ET.parse(xml_path)
    except Exception:
        # Best-effort: an unreadable annotation is reported as having nothing
        # worth keeping. Unlike a bare ``except``, KeyboardInterrupt and
        # SystemExit still propagate, so Ctrl-C never marks a file removable.
        return True

    root = tree.getroot()
    kept = 0
    dropped = 0
    # findall() returns a list, so removing children while looping is safe.
    for obj in root.findall('object'):
        name_node = obj.find('name')
        label = None if name_node is None else name_node.text
        if label is not None and label in classes:
            kept += 1
        else:
            root.remove(obj)
            dropped += 1

    # Leave the file untouched when nothing changed, and skip the write when
    # nothing is kept (the caller is told the file is removable).
    if kept and dropped:
        try:
            tree.write(xml_path)
        except OSError:
            # A failed write does not change the verdict: wanted objects exist.
            pass
    return kept == 0
def main(classes, root):
    """Filter every annotation in ``root`` and delete pairs with no wanted class.

    Each ``*.xml`` file directly inside ``root`` is passed to
    ``need_to_remove``. When that call returns a truthy value, the XML file is
    deleted, together with the ``.jpg`` of the same base name if it exists.
    A single-line progress counter is printed while processing.

    Args:
        classes: Class names to keep; forwarded to ``need_to_remove``.
        root: Directory holding the ``.xml`` annotations and ``.jpg`` images.
    """
    xml_files = [
        os.path.join(root, name)
        for name in os.listdir(root)
        if name.endswith('.xml')
    ]
    total = len(xml_files)
    for idx, xml_path in enumerate(xml_files, start=1):
        useless = need_to_remove(classes, xml_path)
        print('\rprocessing {} / {}'.format(idx, total), end="")
        if not useless:
            continue
        # Swap only the final extension. str.replace would also rewrite any
        # '.xml' that appears in a directory name or earlier in the file name.
        img_path = os.path.splitext(xml_path)[0] + '.jpg'
        if os.path.exists(img_path):
            os.remove(img_path)
        os.remove(xml_path)
    print()
    print("FINISHED")
if __name__ == "__main__":
    # Example configuration: the dataset directory to clean and the class
    # names whose objects should be kept.
    dataset_root = "/home/shares/datasets/data1"
    wanted_classes = ['class1', 'class2', 'class3', 'class4']
    main(wanted_classes, dataset_root)
参数说明
- root:待处理数据集路径,其下文件为jpg图片与同名xml文件
- classes:需要保留的类别
- 代码执行完毕后会移除xml中不需要的类别;若某个xml文件中不存在任何需要的类别,则该xml文件与对应的jpg图片会一并被删除