处理数据的代码

记录一些用来处理数据的代码,便于使用

voc数据集

import os
import pprint  
#Count how many instances of each category appear across all VOC xml files.
def tongji_xml(Anno_dir):
  """Print a dict mapping each <object>/<name> category to its count.

  Anno_dir: directory containing VOC-style annotation .xml files.
  Relies on a helper `read_xml` (defined elsewhere in this file/module)
  that returns a parsed element tree for one xml file.
  """
  counts = {}
  for filename in os.listdir(Anno_dir):
    tree = read_xml(os.path.join(Anno_dir, filename))
    for child in tree.findall('object'):
      name = child.find('name').text
      # dict.get collapses the original 4-line if/else counting idiom
      counts[name] = counts.get(name, 0) + 1
  pprint.pprint(counts)

#Count instances per category for the image ids listed in a txt file.
def tongji_txt(txt_path, Anno_dir):
  """Print category counts for the xml files named (one id per line) in txt_path.

  txt_path: text file with one image id per line (no extension).
  Anno_dir: directory holding `<id>.xml` annotation files.
  Relies on helpers `read_xml`, `find_nodes` and `get_node_by_keyvalue`
  defined elsewhere in this file/module.
  """
  counts = {}
  # `with` guarantees the handle is closed; the original leaked it.
  with open(txt_path, 'r') as f:
    lines = f.readlines()
  for line in lines:
    line = line.strip()
    tree = read_xml(os.path.join(Anno_dir, line + '.xml'))
    text_nodesCate = get_node_by_keyvalue(find_nodes(tree, "object/name"), "")
    for cate in text_nodesCate:
      cate = str(cate.text)
      counts[cate] = counts.get(cate, 0) + 1
  print(len(counts))
  pprint.pprint(counts)
#Remove some categories from the xml annotation files

import os
import xml.etree.cElementTree as ET
#Create a directory if it does not exist yet.
def newfile(path):
    """Create directory `path` (including parents); return True if created.

    Returns False (and prints a notice) when the path already exists.
    Uses try/except around makedirs instead of the original
    exists()-then-makedirs sequence, which had a check-then-create race.
    """
    path = path.strip().rstrip("\\")
    try:
        os.makedirs(path)
    except FileExistsError:
        # already present (directory, or a file of the same name)
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True

# Keep only the classes listed in CLASSES inside every xml under pathCate;
# xml files whose objects are all removed are not written out.
# (The original had a top-level indentation error on the for-loop, and its
# `flag` computation compared a per-object counter against len(name) --
# the length of the class-name *string* -- so the drop test was broken.)
pathCate = 'xml_dir'           # source annotation directory
pathNewCate = 'saved_xml_dir'  # filtered annotations are written here

newfile(pathNewCate)
CLASSES = ['A', 'B']  # categories to keep
for axml in os.listdir(pathCate):
    path_xml = os.path.join(pathCate, axml)
    tree = ET.parse(path_xml)
    root = tree.getroot()

    kept = 0  # number of objects that survive the filter
    # findall() returns a list, so removing children while looping is safe
    for child in root.findall('object'):
        if child.find('name').text not in CLASSES:
            root.remove(child)
        else:
            kept += 1

    # only write files that still contain at least one kept object
    if kept:
        tree.write(os.path.join(pathNewCate, axml))
print("end")
#For each image id, locate and copy the matching xml annotation.
import shutil
def img_xml(img_path, from_ann_path, to_ann_path):
  """Copy `<stem>.xml` from from_ann_path to to_ann_path for every image.

  img_path:      directory with image files (e.g. `0001.jpg`).
  from_ann_path: directory holding the corresponding `<stem>.xml` files.
  to_ann_path:   destination directory (must already exist).

  Uses os.path.splitext instead of the original split('.')[0] so image
  names containing extra dots keep their full stem.
  """
  for img in os.listdir(img_path):
    stem = os.path.splitext(img)[0]
    xml_name = stem + '.xml'
    shutil.copy(os.path.join(from_ann_path, xml_name),
                os.path.join(to_ann_path, xml_name))
#Split the VOC annotation set into trainval and test lists.
import os  
import random  

trainval_percent = 0.75  # fraction of all data assigned to trainval
test_percent = 0.25      # fraction of all data assigned to test (the remainder)
xmlfilepath = r'VOC2007\Annotations'
txtsavepath = r'VOC2007\ImageSets\Main'
total_xml = os.listdir(xmlfilepath)

num = len(total_xml)
indices = range(num)  # renamed from `list` -- don't shadow the builtin

tv = int(num * trainval_percent)
# a set makes the per-item membership test below O(1) instead of O(tv)
trainval = set(random.sample(indices, tv))

# `with` closes both files even if a write fails (the original relied on
# explicit close calls after the loop)
with open(os.path.join(txtsavepath, 'trainval.txt'), 'w') as ftrainval, \
     open(os.path.join(txtsavepath, 'test.txt'), 'w') as ftest:
    for i in indices:
        name = total_xml[i][:-4] + '\n'  # strip the '.xml' extension
        if i in trainval:
            ftrainval.write(name)
        else:
            ftest.write(name)

coco数据集

# Count instances per category in a COCO annotation file.
FILE = './annotations/instances_test2017.json'
import json
from pprint import pprint
with open(FILE, 'r') as file:
    gt = json.load(file)
    categories  = gt['categories']
    annotations = gt['annotations']
# Build an explicit id -> name map: COCO category ids are NOT guaranteed
# to be contiguous or 1-based, so the original `categories[idx - 1]`
# positional lookup was wrong in general.
id_to_name = {c['id']: c['name'] for c in categories}
count = {}
for item in annotations:
    name = id_to_name[item['category_id']]
    count[name] = count.get(name, 0) + 1
pprint(count)

voc转coco

import os
import json
import xml.etree.ElementTree as ET
import numpy as np
import cv2
 
def _isArrayLike(obj):
    return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class voc2coco:
    def __init__(self, devkit_path=None):
        self.classes = ('__background__', 
                        'A', 'B')
        self.num_classes = len(self.classes)
        self.data_path = devkit_path
        self.annotaions_path = os.path.join(self.data_path, 'Annotations')
        self.image_set_path = os.path.join(self.data_path, 'ImageSets')
        #self.year = year
        self.categories_to_ids_map = self._get_categories_to_ids_map()
        self.categories_msg = self._categories_msg_generator()
 
    def _load_annotation(self, ids=[]):
        ids = ids if _isArrayLike(ids) else [ids]
        image_msg = []
        annotation_msg = []
        annotation_id = 1
        for index in ids:
            filename = '{:0>6}'.format(index)
            json_file = os.path.join(self.data_path, 'Segmentation_json', filename + '.json')
            if os.path.exists(json_file):
                img_file = os.path.join(self.data_path, 'JPEGImages', filename + '.jpg')
                im = cv2.imread(img_file)
                width = im.shape[1]
                height = im.shape[0]
                seg_data = json.load(open(json_file, 'r'))
                assert type(seg_data) == type(dict()), 'annotation file format {} not supported'.format(type(seg_data))
                for shape in seg_data['shapes']:
                    seg_msg = []
                    for point in shape['points']:
                        seg_msg += point
                    one_ann_msg = {"segmentation": [seg_msg],
                                   "area": self._area_computer(shape['points']),
                                   "iscrowd": 0,
                                   "image_id": int(index),
                                   "bbox": self._points_to_mbr(shape['points']),
                                   "category_id": self.categories_to_ids_map[shape['label']],
                                   "id": annotation_id,
                                   "ignore": 0
                                   }
                    annotation_msg.append(one_ann_msg)
                    annotation_id += 1
            else:
                xml_file = os.path.join(self.annotaions_path, filename + '.xml')
                tree = ET.parse(xml_file)
                size = tree.find('size')
                objs = tree.findall('object')
                width = size.find('width').text
                height = size.find('height').text
                for obj in objs:
                    bndbox = obj.find('bndbox')
                    [xmin, xmax, ymin, ymax] \
                        = [int(bndbox.find('xmin').text) - 1, int(bndbox.find('xmax').text),
                           int(bndbox.find('ymin').text) - 1, int(bndbox.find('ymax').text)]
                    if xmin < 0:
                        xmin = 0
                    if ymin < 0:
                        ymin = 0
                    bbox = [xmin, xmax, ymin, ymax]
                    one_ann_msg = {"segmentation": self._bbox_to_mask(bbox),
                                   "area": self._bbox_area_computer(bbox),
                                   "iscrowd": 0,
                                   "image_id": int(index),
                                   "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],
                                   "category_id": self.categories_to_ids_map[obj.find('name').text],
                                   "id": annotation_id,
                                   "ignore": 0
                                   }
                    annotation_msg.append(one_ann_msg)
                    annotation_id += 1
            one_image_msg = {"file_name": filename + ".jpg",
                             "height": int(height),
                             "width": int(width),
                             "id": int(index)
                             }
            image_msg.append(one_image_msg)
        return image_msg, annotation_msg
    def _bbox_to_mask(self, bbox):
        assert len(bbox) == 4, 'Wrong bndbox!'
        mask = [bbox[0], bbox[2], bbox[0], bbox[3], bbox[1], bbox[3], bbox[1], bbox[2]]
        return [mask]
    def _bbox_area_computer(self, bbox):
        width = bbox[1] - bbox[0]
        height = bbox[3] - bbox[2]
        return width * height
    def _save_json_file(self, filename=None, data=None):
        json_path = os.path.join(self.data_path, 'annotations')
        assert filename is not None, 'lack filename'
        if os.path.exists(json_path) == False:
            os.mkdir(json_path)
        if not filename.endswith('.json'):
            filename += '.json'
        assert type(data) == type(dict()), 'data format {} not supported'.format(type(data))
        with open(os.path.join(json_path, filename), 'w') as f:
            f.write(json.dumps(data))
    def _get_categories_to_ids_map(self):
        return dict(zip(self.classes, range(self.num_classes)))
    def _get_all_indexs(self):
        ids = []
        for root, dirs, files in os.walk(self.annotaions_path, topdown=False):
            for f in files:
                if str(f).endswith('.xml'):
                    id = int(str(f).strip('.xml'))
                    ids.append(id)
        assert ids is not None, 'There is none xml file in {}'.format(self.annotaions_path)
        return ids
    def _get_indexs_by_image_set(self, image_set=None):
        if image_set is None:
            return self._get_all_indexs()
        else:
            image_set_path = os.path.join(self.image_set_path, 'Main', image_set + '.txt')
            assert os.path.exists(image_set_path), 'Path does not exist: {}'.format(image_set_path)
            with open(image_set_path) as f:
                ids = [x.strip() for x in f.readlines()]
            return ids
    def _points_to_mbr(self, points):
        assert _isArrayLike(points), 'Points should be array like!'
        x = [point[0] for point in points]
        y = [point[1] for point in points]
        assert len(x) == len(y), 'Wrong point quantity'
        xmin, xmax, ymin, ymax = min(x), max(x), min(y), max(y)
        height = ymax - ymin
        width = xmax - xmin
        return [xmin, ymin, width, height]
    def _categories_msg_generator(self):
        categories_msg = []
        for category in self.classes:
            if category == '__background__':
                continue
            one_categories_msg = {"supercategory": "none",
                                  "id": self.categories_to_ids_map[category],
                                  "name": category
                                  }
            categories_msg.append(one_categories_msg)
        return categories_msg
    def _area_computer(self, points):
        assert _isArrayLike(points), 'Points should be array like!'
        tmp_contour = []
        for point in points:
            tmp_contour.append([point])
        contour = np.array(tmp_contour, dtype=np.int32)
        area = cv2.contourArea(contour)
        return area
    def voc_to_coco_converter(self):
        img_sets = ['test', 'train'] #, 'train','val', 'trainval']
        for img_set in img_sets:
            ids = self._get_indexs_by_image_set(img_set)
            img_msg, ann_msg = self._load_annotation(ids)
            result_json = {"images": img_msg,
                           "type": "instances",
                           "annotations": ann_msg,
                           "categories": self.categories_msg}
            self._save_json_file('./annotations/'+'instances_' + img_set+"2017", result_json)
def demo():
    """Run the converter on the VOC-style dataset rooted at ./coco/."""
    voc2coco('./coco/').voc_to_coco_converter()


if __name__ == "__main__":
    demo()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值