数据预处理篇---获取xml的标签数量,提取某个标签的图片名字及对应的图片
数据集的格式
- 图片标签
- 类别标签
- xml 数据结构
获取xml的标签数量
# -- coding: utf-8 --
import pickle
import os
import glob
from os import listdir, getcwd
from os.path import join
import xml.etree.ElementTree as ET # 导入xml模块
# Index files listing the annotation ids (one per line) of each dataset split.
test_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/test.txt'
train_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/train.txt'
trainval_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/trainval.txt'
xml_dir = 'D:/dataset/second_project/error_dataset2021_1/Annotations/'  # directory holding the xml files


def read_image_ids(index_file):
    """Return the ids listed in *index_file*, one per non-blank line.

    Surrounding whitespace (including a stray ``\\r`` from Windows line
    endings) is removed and blank lines are dropped, so every returned id
    can be turned into ``<id>.xml`` directly.
    """
    with open(index_file) as handle:
        return [line.strip() for line in handle if line.strip()]


def count_labels(image_ids, xml_dir, label):
    """Count ``object`` elements in the annotation files of *image_ids*.

    Args:
        image_ids: iterable of annotation file names without the ``.xml``
            suffix.
        xml_dir: directory that contains the ``<id>.xml`` files.
        label: text of the ``name`` child to count.

    Returns:
        A ``(label_count, total_count)`` tuple: the number of objects whose
        ``name`` equals *label*, and the number of all objects.

    Raises:
        FileNotFoundError: if an annotation file does not exist.
        xml.etree.ElementTree.ParseError: if an annotation file is not
            well-formed XML.
    """
    label_count = 0
    total_count = 0
    for image_id in image_ids:
        xml_path = os.path.join(xml_dir, image_id + '.xml')
        # The context manager closes the file even when parsing fails.
        with open(xml_path, encoding="utf-8") as xml_file:
            tree = ET.parse(xml_file)
        for obj in tree.iter('object'):
            total_count += 1
            # findtext() yields None for an object without <name>, so such
            # an object is counted in the total but never crashes the run.
            if obj.findtext('name') == label:
                label_count += 1
    return label_count, total_count


if __name__ == "__main__":
    xml_index = read_image_ids(train_dir)
    print(len(xml_index))  # number of annotation files listed in the train split
    c, nums = count_labels(xml_index, xml_dir, 'Bicycle')
    print("标签为Bicycle的个数:", c)
    print("\n总标签数目:", nums)
注意:如果把下面几段脚本放在同一个文件或同一个会话中运行,变量名不要重复(例如 save_file、path、img_path 在不同脚本中含义不同),否则后面的赋值会覆盖前面的值,导致结果出错。
结果如下:
提取包含某个标签的图片名字
# -- coding: utf-8 --
import xml.etree.ElementTree as ET # 导入xml模块
import pickle
import os
import glob
from os import listdir, getcwd
from os.path import join
# Index files listing the annotation ids (one per line) of each dataset split.
test_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/test.txt'
train_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/train.txt'
trainval_dir = 'D:/dataset/second_project/error_dataset2021_1/ImageSets/Main/trainval.txt'
xml_dir = 'D:/dataset/second_project/error_dataset2021_1/Annotations/'  # directory holding the xml files
save_file = "D:/dataset/second_project/data_augmentation"  # output directory for img_path.txt


def read_image_ids(index_file):
    """Return the ids listed in *index_file*, one per non-blank line.

    Surrounding whitespace (including a stray ``\\r`` from Windows line
    endings) is removed and blank lines are dropped, so every returned id
    can be turned into ``<id>.xml`` directly.
    """
    with open(index_file) as handle:
        return [line.strip() for line in handle if line.strip()]


def find_images_with_label(image_ids, xml_dir, label):
    """Return image file names whose annotation has an object named *label*.

    For every id, ``<xml_dir>/<id>.xml`` is parsed. When any ``object``
    inside an ``annotation`` element has a ``name`` equal to *label*, the
    last component of that annotation's ``path`` text is collected.

    Args:
        image_ids: iterable of annotation file names without ``.xml``.
        xml_dir: directory that contains the annotation files.
        label: object name to look for.

    Returns:
        A list of file names in the order of *image_ids*. Each annotation
        contributes at most one entry, even if it holds several matching
        objects. Annotations without a ``path`` text are skipped.

    Raises:
        FileNotFoundError: if an annotation file does not exist.
        xml.etree.ElementTree.ParseError: if an annotation file is not
            well-formed XML.
    """
    names = []
    for image_id in image_ids:
        xml_path = os.path.join(xml_dir, image_id + '.xml')
        with open(xml_path, encoding="utf-8") as xml_file:
            tree = ET.parse(xml_file)
        for annotation in tree.iter('annotation'):
            has_label = any(
                obj.findtext('name') == label
                for obj in annotation.iter('object')
            )
            if not has_label:
                continue
            raw_path = annotation.findtext('path')
            if not raw_path:
                # No usable <path>; there is no file name to report.
                continue
            # Accept both Windows and POSIX separators in the stored path.
            names.append(raw_path.replace('\\', '/').rsplit('/', 1)[-1])
    return names


if __name__ == "__main__":
    xml_index = read_image_ids(train_dir)
    print(len(xml_index))  # number of annotation files listed in the train split
    os.makedirs(save_file, exist_ok=True)
    # The with-block guarantees the list is flushed and closed on exit.
    with open(save_file + "/img_path.txt", "w", encoding="utf-8") as fsave:
        for img_path in find_images_with_label(xml_index, xml_dir, 'mailbox'):
            fsave.write(img_path + '\n')
    print("extract label success")
结果如下:保存在img_path.txt的文件中
找到对应名字的原始图片
# -- coding: utf-8 --
import os, shutil
path = 'D:/dataset/second_project/error_dataset2021/JPEGImages/'  # source image directory
save_img_path = 'D:/dataset/second_project/data_augmentation/mail_box_img/'  # destination directory
save_file = 'D:/dataset/second_project/data_augmentation/img_path.txt'  # one image name per line


def copy_listed_images(list_file, src_dir, dst_dir):
    """Copy the images named in *list_file* from *src_dir* to *dst_dir*.

    A listed name and a file in *src_dir* match when they are equal after
    removing the final extension, so ``a.jpg`` in the list selects
    ``a.jpg`` or ``a.png`` but not ``a.b.jpg``.

    Args:
        list_file: UTF-8 text file with one image name per line. Blank
            lines and repeated names are ignored.
        src_dir: directory that holds the original images.
        dst_dir: directory to copy into; created if it does not exist.

    Returns:
        The list of copied file names, in the order they were copied.

    Raises:
        FileNotFoundError: if *list_file* or *src_dir* does not exist.
    """
    # Index the source files by stem once, instead of rescanning the whole
    # directory listing for every line of the list file.
    files_by_stem = {}
    for img in sorted(os.listdir(src_dir)):
        if os.path.isfile(os.path.join(src_dir, img)):
            files_by_stem.setdefault(os.path.splitext(img)[0], []).append(img)

    os.makedirs(dst_dir, exist_ok=True)

    copied = []
    seen = set()
    with open(list_file, "r", encoding="utf-8") as f:
        for line in f:
            stem = os.path.splitext(line.strip())[0]
            # An empty stem (blank line) must never be used as a key: it
            # would select unrelated files instead of nothing.
            if not stem or stem in seen:
                continue
            seen.add(stem)
            for img in files_by_stem.get(stem, []):
                shutil.copy(os.path.join(src_dir, img), os.path.join(dst_dir, img))
                copied.append(img)
    return copied


if __name__ == "__main__":
    img_path = os.listdir(path)
    print(img_path)
    for img in copy_listed_images(save_file, path, save_img_path):
        print(img)
生成新的文件夹,保存相对应的图片