整理数据集 VOC格式
根据图片删除多余xml
import os
# Remove every annotation (.xml) in `xml_dir` that has no image with the same
# base name in `images_dir`. Paths are relative to the current working
# directory.
xml_dir = 'Annotations'
images_dir = 'JPEGImages'

# Base names (file name without its extension) of all images.
# os.path.splitext is used instead of split('.')[0] so that a name containing
# extra dots, e.g. 'img.v2.jpg', keeps its full stem 'img.v2'.
image_names = [os.path.splitext(image_file)[0]
               for image_file in os.listdir(images_dir)]
print(image_names)

# Set copy of the stems so the membership test below is O(1) per file.
image_stems = set(image_names)

# Walk the annotation folder and delete the annotations without an image.
for xml_name in os.listdir(xml_dir):
    stem, ext = os.path.splitext(xml_name)
    # Only annotation files are candidates; anything else is left untouched.
    if ext.lower() != '.xml':
        continue
    if stem in image_stems:
        continue
    print(xml_name)
    # Delete the directory entry that was actually listed rather than a name
    # rebuilt from the stem, which may not exist on disk.
    os.remove(os.path.join(xml_dir, xml_name))
根据xml删除多余图片
import os
# Remove every image in `images_dir` that has no annotation (.xml) with the
# same base name in `xml_dir`. Paths are relative to the current working
# directory.
images_dir = 'JPEGImages'
xml_dir = 'Annotations'

# Base names of all annotation files (i.e. the names of the labelled images).
# os.path.splitext('0001.xml') gives ('0001', '.xml') and, unlike
# split('.')[0], keeps stems that themselves contain dots intact.
xmls = [os.path.splitext(xml_file)[0] for xml_file in os.listdir(xml_dir)]
print(xmls)

# Set copy of the stems so the membership test below is O(1) per file.
annotated_stems = set(xmls)

# Walk the image folder and delete the images without an annotation.
for image_name in os.listdir(images_dir):
    image_path = os.path.join(images_dir, image_name)
    # Sub-directories cannot be removed with os.remove; skip them.
    if not os.path.isfile(image_path):
        continue
    if os.path.splitext(image_name)[0] in annotated_stems:
        continue
    print(image_name)
    # Delete the file that was actually listed. Rebuilding the name as
    # stem + '.jpg' raised FileNotFoundError for any other extension
    # ('.png', '.JPG', '.jpeg', ...).
    os.remove(image_path)
删除大小小于指定大小的xml,例如size<1kb
def get_path(file_path):
    """Walk *file_path* recursively and pass every file found to ``del_file``.

    Args:
        file_path: Root directory to traverse with ``os.walk``.
    """
    for folder, _subdirs, names in os.walk(file_path):
        for name in names:
            del_file(os.path.join(folder, name))
def del_file(filename, min_size=1024):
    """Delete *filename* when it is smaller than *min_size* bytes.

    Args:
        filename: Path of the file to check.
        min_size: Size threshold in bytes. Files strictly smaller than this
            are removed. Defaults to 1024 (1 KB), the value that used to be
            hard-coded.

    Raises:
        OSError: If the file cannot be inspected or removed.
    """
    if os.path.getsize(filename) < min_size:
        print("remove", filename)
        os.remove(filename)
if __name__ == "__main__":
    # Script entry point: run the size-based clean-up on the "Annotations"
    # folder (relative to the current working directory).
    annotations_root = "Annotations"
    get_path(annotations_root)
删除标注框边界值超出图像边界的xml
import os
import xml.etree.ElementTree as ET
# Folder that holds the VOC annotation (.xml) files.
annotation_folder = 'Annotations/'
# Entries directly inside the annotation folder. Bound to a descriptive name:
# assigning the listing to `list` would shadow the builtin for the rest of
# the module.
annotation_files = os.listdir(annotation_folder)
def file_name(file_dir):
    """Return the paths of all ``.xml`` files found below *file_dir*.

    The directory tree is traversed with ``os.walk``; a file is kept when
    its extension is exactly ``.xml`` (case-sensitive).

    Args:
        file_dir: Root directory to search.

    Returns:
        A list of paths, each built by joining the containing folder with
        the file name.
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(file_dir)
        for entry in entries
        if os.path.splitext(entry)[1] == '.xml'
    ]
# Delete every annotation file that contains at least one bounding box lying
# outside the image described by its <size> element, and report how many
# files were removed.
count = 0
xml_dirs = file_name(annotation_folder)
for xml_path in xml_dirs:
    # ET.parse opens the file itself, honours the encoding declared in the
    # XML prolog and closes the handle again, so no descriptor is still open
    # when the file is removed below.
    root = ET.parse(xml_path).getroot()

    # Image width and height as recorded in the annotation.
    size = root.find('size')
    if size is None:
        # Without the image size there is nothing to compare against.
        continue
    label_width = int(size.find('width').text)
    label_height = int(size.find('height').text)

    # Check the corner coordinates of every annotated box.
    for bbox in root.iterfind('object/bndbox'):
        label_xmin = int(bbox.find('xmin').text)
        label_ymin = int(bbox.find('ymin').text)
        label_xmax = int(bbox.find('xmax').text)
        label_ymax = int(bbox.find('ymax').text)
        if (label_xmin <= 0 or label_xmax > label_width
                or label_ymin <= 0 or label_ymax > label_height):
            print('--'*30)
            print(xml_path)  # path of the offending xml
            print("width:",label_width)
            print("height:",label_height)
            print(label_xmin,label_ymin,label_xmax,label_ymax)
            print('--'*30)
            count = count+1
            os.remove(xml_path)
            # One bad box is enough: stop here so the file is counted and
            # removed exactly once. This replaces the `label_temp` filename
            # comparison, which was read before it was ever assigned.
            break
print("================================")
print(count)
删除框左上角坐标大于右下角坐标的xml
import os
import xml.etree.ElementTree as ET
# Folder that holds the VOC annotation (.xml) files.
annotation_folder = 'Annotations'
# Entries directly inside the annotation folder. Bound to a descriptive name:
# assigning the listing to `list` would shadow the builtin for the rest of
# the module.
annotation_files = os.listdir(annotation_folder)
def file_name(file_dir):
    """Collect the paths of all ``.xml`` files located under *file_dir*.

    Args:
        file_dir: Root directory, traversed recursively with ``os.walk``.

    Returns:
        A list with one joined ``folder/name`` path per file whose extension
        is exactly ``.xml`` (case-sensitive).
    """
    found = []
    for folder, _subdirs, names in os.walk(file_dir):
        found.extend(
            os.path.join(folder, name)
            for name in names
            if os.path.splitext(name)[1] == '.xml'
        )
    return found
# Report every annotation file that contains at least one bounding box whose
# top-left corner is not strictly above/left of its bottom-right corner, and
# print how many such files were found.
# NOTE(review): this script only reports the offending files; it never
# deletes them. Add an os.remove(xml_path) before the `break` if deletion is
# intended.
count = 0
xml_dirs = file_name(annotation_folder)
for xml_path in xml_dirs:
    # ET.parse opens the file itself, honours the encoding declared in the
    # XML prolog and closes the handle again.
    root = ET.parse(xml_path).getroot()

    # Check the corner coordinates of every annotated box.
    for bbox in root.iterfind('object/bndbox'):
        label_xmin = int(bbox.find('xmin').text)
        label_ymin = int(bbox.find('ymin').text)
        label_xmax = int(bbox.find('xmax').text)
        label_ymax = int(bbox.find('ymax').text)
        if label_xmin >= label_xmax or label_ymin >= label_ymax:
            print('--'*30)
            print(xml_path)  # path of the offending xml
            print("label_xmin:", label_xmin, " label_xmax:", label_xmax)
            print("label_ymin:", label_ymin, ' label_ymax:', label_ymax)
            print('--'*30)
            count = count+1
            # One bad box is enough: stop here so the file is counted exactly
            # once. This replaces the `label_temp` filename comparison, which
            # was read before it was ever assigned.
            break
print("================================")
print(count)
重新排序命名图片和对应的xml
from xml.etree.ElementTree import ElementTree
from os import walk, path
import cv2
import os
def read_xml(in_path):
    """Parse the XML file at *in_path* and return it as an ``ElementTree``.

    Args:
        in_path: Path of the XML file to load.

    Returns:
        The populated ``ElementTree`` instance.
    """
    document = ElementTree()
    document.parse(in_path)
    return document
def write_xml(tree, out_path):
    """Serialise *tree* to *out_path* as UTF-8 with an XML declaration.

    Args:
        tree: The ``ElementTree`` to write.
        out_path: Destination file path.
    """
    write_options = {"encoding": "utf-8", "xml_declaration": True}
    tree.write(out_path, **write_options)
def get_path_prex(rootdir):
    """Find all ``.xml`` files below *rootdir*.

    Args:
        rootdir: Root directory, traversed top-down with ``os.walk``.

    Returns:
        A ``(data_path, prefixs)`` tuple of two parallel lists: the joined
        path of every ``.xml`` file, and that file's name without its
        extension.
    """
    matches = [
        (path.join(folder, fname), path.splitext(fname)[0])
        for folder, _subdirs, fnames in walk(rootdir, topdown=True)
        for fname in fnames
        if path.splitext(fname)[1] == ".xml"
    ]
    data_path = [xml_path for xml_path, _ in matches]
    prefixs = [stem for _, stem in matches]
    return data_path, prefixs
if __name__ == "__main__":
    # Renumber every annotation in "Annotations" and its matching .jpg in
    # "JPEGImages" as 0001, 0002, ... and write the results to the
    # "Annotations_" and "JPEGImages_" folders. The input folders are left
    # untouched.
    # NOTE(review): the XML content is written back unchanged, so any
    # <filename> element inside it still carries the old image name.
    os.makedirs("Annotations_", exist_ok=True)
    os.makedirs("JPEGImages_", exist_ok=True)

    xml_paths, prefixs = get_path_prex("Annotations")
    image_dir = path.join(os.getcwd(), "JPEGImages")

    # os.walk yields files in a filesystem-dependent order; sort by the
    # original base name so the new numbering is reproducible.
    ordered = sorted(zip(prefixs, xml_paths))

    index = 0
    for prefix, xml_path in ordered:
        # Load the image first: cv2.imread returns None (it does not raise)
        # when the file is missing or unreadable, and writing the xml before
        # knowing that would leave an annotation without an image.
        img_path = path.join(image_dir, prefix + ".jpg")
        img = cv2.imread(img_path)
        if img is None:
            print("skip", xml_path, "- cannot read image", img_path)
            continue

        index += 1
        new_name = "%04d" % index
        # Save the annotation under its new name, e.g. 0001.xml
        write_xml(read_xml(xml_path), "Annotations_/{}.xml".format(new_name))
        # Save the image under the same new name, e.g. 0001.jpg
        cv2.imwrite('JPEGImages_/{}.jpg'.format(new_name), img)
    print("——————排序+重命名完成———————")