众所周知，训练一个好的目标检测模型需要大量数据。当数据量较大时，我们无法直观地看到每个类别的目标框数量，也就无法判断各类别是否均衡。
下面的脚本用于统计数据集中每个类别的目标框个数，只需将 xml_path 修改为 XML 标注文件所在的目录即可直接使用。
# -*- coding:utf-8 -*-
import os
import xml.etree.ElementTree as ET
import numpy as np
import matplotlib
from PIL import Image
def parse_obj(xml_path, filename):
    """Collect the class name of every annotated object in one XML file.

    Args:
        xml_path: Directory that contains the annotation file. A trailing
            path separator is optional.
        filename: Name of the XML file inside ``xml_path``.

    Returns:
        A list with one dict per ``<object>`` element found directly under
        the document root, in document order. Each dict has a single key,
        ``'name'``, holding the text of the object's ``<name>`` child.
        The list is empty when the file has no ``<object>`` elements.

    Raises:
        AttributeError: If an ``<object>`` element has no ``<name>`` child.
    """
    # os.path.join inserts the separator only when it is missing, so the
    # directory may be given with or without a trailing slash.
    tree = ET.parse(os.path.join(xml_path, filename))
    return [{'name': obj.find('name').text} for obj in tree.findall('object')]
def read_image(image_path, filename):
    """Return the width, height and pixel area of an image file.

    Args:
        image_path: Directory that contains the image. A trailing path
            separator is optional.
        filename: Name of the image file inside ``image_path``.

    Returns:
        A three-element list ``[W, H, area]`` where ``W`` and ``H`` are the
        two entries of the opened image's ``size`` and ``area`` is their
        product.
    """
    # The context manager closes the underlying file as soon as the size has
    # been read, so scanning a large dataset does not leak file handles.
    with Image.open(os.path.join(image_path, filename)) as im:
        W, H = im.size
    return [W, H, W * H]
if __name__ == '__main__':
    # Directory holding the XML annotation files; replace the placeholder
    # with a real path before running.
    xml_path = r'xml文件地址'
    # Normalise to a trailing separator so the directory can be combined
    # safely with a file name downstream.
    xml_dir = os.path.join(xml_path, '')

    # Keep only annotation files: other entries in the directory (images,
    # sub-folders, ...) cannot be parsed as XML. Sorting makes the order in
    # which classes are reported reproducible across runs.
    xml_files = sorted(
        entry for entry in os.listdir(xml_path) if entry.endswith('.xml')
    )

    classnames = []  # class labels in the order they are first encountered
    num_objs = {}    # class label -> number of annotated boxes
    for xml_file in xml_files:
        for obj in parse_obj(xml_dir, xml_file):
            label = obj['name']
            if label not in num_objs:
                classnames.append(label)
                num_objs[label] = 0
            num_objs[label] += 1

    # One line per class, followed by a completion message.
    for label in classnames:
        print('{}:{}个'.format(label, num_objs[label]))
    print('信息统计算完毕。')