1. 解析xml
注意 UTF-8 编码
from lxml import etree
import os
def parse_xml_to_dict(xml):
    """Recursively convert an XML element into nested dictionaries.

    Modelled on TensorFlow's ``recursive_parse_xml_to_dict``.

    Args:
        xml: An element obtained by parsing XML contents with ``lxml.etree``.
            Only ``len()``, iteration over children, ``.tag`` and ``.text``
            are used.

    Returns:
        A one-key dictionary ``{xml.tag: value}``. For an element without
        children, ``value`` is the element's text (``None`` when it has
        none). Otherwise ``value`` is a dictionary of the children keyed by
        tag; children tagged ``object`` are gathered into a list because an
        annotation may contain several of them, while any other repeated tag
        keeps only its last occurrence.
    """
    if len(xml) == 0:  # leaf element: return its text directly
        return {xml.tag: xml.text}

    result = {}
    for child in xml:
        # Comments and processing instructions are yielded as children whose
        # ``tag`` is a callable rather than a string; they carry no
        # annotation data, so skip them instead of storing them as keys.
        if not isinstance(child.tag, str):
            continue
        child_result = parse_xml_to_dict(child)  # recurse into the subtree
        if child.tag != 'object':
            result[child.tag] = child_result[child.tag]
        else:
            # There may be several <object> entries, so keep them in a list.
            result.setdefault(child.tag, []).append(child_result[child.tag])
    return {xml.tag: result}
# Collect every distinct object class name found in a folder of annotation
# XML files and print the resulting list.
xml_folder = '/Data02/decoded_det_seg/det_img/xmls'
name_list = []  # distinct class names, in first-seen order
for file in os.listdir(xml_folder):
    if not file.endswith('xml'):
        continue
    file_path = os.path.join(xml_folder, file)
    print(file_path)
    # Read the raw bytes and let lxml decode them according to the XML
    # declaration. Opening in text mode would decode with the locale's
    # default encoding, which corrupts or rejects UTF-8 files on systems
    # whose default is something else.
    with open(file_path, 'rb') as fid:
        xml_str = fid.read()
    xml = etree.fromstring(xml_str)
    data = parse_xml_to_dict(xml)["annotation"]
    # An annotation without any <object> simply contributes no names.
    for obj in data.get("object", []):
        name = obj['name']
        if name not in name_list:
            name_list.append(name)
print(name_list)
2. xml和jpg名字不同
jpeg的文件名如下
截取的xml文件名如下
如上所示,一些jpg中的图片,并没有相应的标注文件,这是不利于训练过程的。
2.1 几行程序搞定匹配
将匹配成功的xml和jpg保存到新的路径下
import os
import shutil
# Copy every annotation/image pair that shares a file-name stem into new
# folders, leaving out images without an annotation and vice versa.

# Source directories
xml_path = r"F:\BaiduNetdiskDownload\VOCdevkit\VOC2007\Annotations"
jpg_path = r"F:\BaiduNetdiskDownload\VOCdevkit\VOC2007\JPEGImages"
# Destination directories
save_xml = r"F:\BaiduNetdiskDownload\VOCdevkit\VOC2007\labels"
save_jpg = r"F:\BaiduNetdiskDownload\VOCdevkit\VOC2007\images"

if not os.path.exists(save_xml):
    print("保存路径不存在,创建路径")
# makedirs also creates missing parent directories and tolerates an
# already-existing target.
os.makedirs(save_xml, exist_ok=True)
os.makedirs(save_jpg, exist_ok=True)

# Directory listings (file names only)
xml_folder = os.listdir(xml_path)
jpg_folder = os.listdir(jpg_path)

# Map each stem to its real file name, keeping only files with the expected
# extension. splitext keeps dots inside the stem intact ("a.b.xml" -> "a.b"),
# and the extension filter prevents unrelated files from producing stems
# whose .xml/.jpg counterpart does not exist.
xml_names = {}
for entry in xml_folder:
    stem, ext = os.path.splitext(entry)
    if ext.lower() == '.xml':
        xml_names[stem] = entry
jpg_names = {}
for entry in jpg_folder:
    stem, ext = os.path.splitext(entry)
    if ext.lower() == '.jpg':
        jpg_names[stem] = entry

# Stems of all xml and jpg files, each element a str
xml_all = list(xml_names)
jpg_all = list(jpg_names)

# Stems present in both folders (dict lookup keeps this linear)
common_all = [stem for stem in xml_all if stem in jpg_names]

# Copy the matched files into their respective destination folders
for file in common_all:
    xml = os.path.join(xml_path, xml_names[file])
    jpg = os.path.join(jpg_path, jpg_names[file])
    shutil.copy(xml, save_xml)
    shutil.copy(jpg, save_jpg)
最终结果
新的xml文件目录
新的jpg文件目录