VOC Dataset (解析voc数据集、mosaic增强、voc转labelme)

Mr.Q

已于 2023-02-16 18:39:05 修改

阅读量1.7k

点赞数 1

分类专栏： PyTorch YOLO 文章标签： yolov1

于 2021-12-21 17:07:07 首次发布

本文链接：https://blog.csdn.net/jizhidexiaoming/article/details/122057283

版权

PyTorch 同时被 2 个专栏收录

38 篇文章 21 订阅

订阅专栏

YOLO

13 篇文章 5 订阅

订阅专栏

下载链接：

链接：https://pan.baidu.com/s/1L_tCyT3zr4vWcSW6Eyoxeg
提取码：8ful
--来自百度网盘超级会员V3的分享

1. 标注文件内容

2. python解析代码（Dataset）

3. mosaic增强（可选操作）

4. voc2labelme

1. 标注文件内容

Annotations/000005.xml，主要内容如下。

<annotation>
	<folder>VOC2007</folder>
	<filename>000005.jpg</filename>  # 对应图片文件名
	<source>
		<database>The VOC2007 Database</database>
		<annotation>PASCAL VOC2007</annotation>
		<image>flickr</image>
		<flickrid>325991873</flickrid>
	</source>
	<owner>
		<flickrid>archintent louisville</flickrid>
		<name>?</name>
	</owner>
	<size>  # 图像原始尺寸
		<width>500</width>
		<height>375</height>
		<depth>3</depth>
	</size>
	<segmented>0</segmented>  # 是否用于分割
	<object>
		<name>chair</name>  # 物体类别
		<pose>Rear</pose>  # 拍摄角度：front, rear, left, right, unspecified
		<truncated>0</truncated>  # 目标是否被截断：标注框未能包含完整目标（例如目标超出图像边界，约15%以上在框外）；注意这与“遮挡(occluded)”不是同一个概念
		<difficult>0</difficult>  # 检测难易程度，这个主要是根据目标的大小，光照变化，图片质量来判断
		<bndbox>  # 目标位置
			<xmin>263</xmin>
			<ymin>211</ymin>
			<xmax>324</xmax>
			<ymax>339</ymax>
		</bndbox>
	</object>
</annotation>

2. python解析代码（Dataset）

"""
return:
    img: tensor. rgb. (c,h,w). 缩放后的图像.
    gt: numpy. (num_bbox,5). 相对于原图归一化的坐标和类别信息
        [xmin/w, ymin/h, xmax/w, ymax/h, label_ind]
        eg. [[0.524,0.56,0.646,0.90133333,8], [], ...]

"""

"""VOC Dataset Classes

Original author: Francisco Massa
https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py

Updated by: Ellis Brown, Max deGroot
"""
import os.path as osp
import sys
import torch
import torch.utils.data as data
import cv2
import numpy as np
import random


if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET

# Names of the 20 PASCAL VOC object categories. The position of a name in this
# tuple is the label index that VOCAnnotationTransform assigns by default.
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')

# Default dataset location: a "VOCdevkit" folder next to this source file.
# note: if you used our download scripts, this should be right
path_to_dir = osp.dirname(osp.abspath(__file__))
VOC_ROOT = path_to_dir + "/VOCdevkit/"


# VOC_ROOT = "/home/k303/object-detection/dataset/VOCdevkit/"


class VOCAnnotationTransform(object):
    """Convert a parsed VOC XML annotation into normalized boxes with class indices.

    Arguments:
        class_to_ind (dict, optional): mapping of class name -> integer index,
            e.g. ``{"aeroplane": 0, "bicycle": 1, ...}``
            (default: the position of each name in VOC_CLASSES)
        keep_difficult (bool, optional): keep objects whose ``<difficult>`` tag
            is 1 (default: False)
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        # Fall back to the VOC_CLASSES ordering when no (or an empty) mapping is given.
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """Extract every kept object of one annotation.

        Arguments:
            target (ET.Element): root element of the annotation XML.
            width (number): value the x coordinates are divided by.
            height (number): value the y coordinates are divided by.
        Returns:
            list of ``[xmin/w, ymin/h, xmax/w, ymax/h, label_ind]``, one entry per
            kept object (empty list when nothing is kept).
        Raises:
            KeyError: if an object's class name is not in ``class_to_ind``.
        """
        res = []
        for obj in target.iter('object'):
            # Custom VOC-style datasets often omit <difficult>; a missing or
            # empty tag is treated as "not difficult" instead of crashing.
            difficult_node = obj.find('difficult')
            difficult_text = ''
            if difficult_node is not None and difficult_node.text is not None:
                difficult_text = difficult_node.text.strip()
            difficult = bool(difficult_text) and int(difficult_text) == 1
            if not self.keep_difficult and difficult:
                continue

            name = obj.find('name').text.lower().strip()  # class name
            bbox = obj.find('bndbox')  # box corner container

            pts = ['xmin', 'ymin', 'xmax', 'ymax']
            bndbox = []
            for i, pt in enumerate(pts):
                # int(float(...)) also accepts coordinates written as "263.0".
                # The -1 presumably converts VOC's 1-based pixel coordinates to
                # 0-based ones -- TODO confirm against the dataset in use.
                cur_pt = int(float(bbox.find(pt).text)) - 1
                # even positions are x values (divide by width), odd are y values
                cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
                bndbox.append(cur_pt)
            label_idx = self.class_to_ind[name]
            bndbox.append(label_idx)  # class index goes after the four coordinates
            res += [bndbox]

        return res  # [[xmin, ymin, xmax, ymax, label_ind], ... ]


class VOCDetection(data.Dataset):
    """VOC detection dataset: pairs each image with its box annotations.

    Arguments:
        root (string): filepath to the VOCdevkit folder.
        img_size (int): stored as ``self.img_size``; this class body does not
            resize with it (resizing is left to ``transform``).
        image_sets (list of (year, name) tuples): image sets to load, e.g.
            ``[('2007', 'trainval')]`` reads
            ``<root>/VOC2007/ImageSets/Main/trainval.txt``.
        transform (callable, optional): called as
            ``transform(img, boxes=..., labels=...)`` and expected to return
            ``(img, boxes, labels)``.
        target_transform (callable, optional): called as
            ``target_transform(xml_root, width, height)``.
        dataset_name (string, optional): stored as ``self.name``
            (default: 'VOC0712')
        mosaic (bool, optional): when True, ``pull_item`` hands about half of the
            samples to ``self.mosaic_augmentation``. That method is not defined
            in this class body and must be available on the instance.
    """

    def __init__(self, root, img_size,
                 image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
                 transform=None, target_transform=VOCAnnotationTransform(),
                 dataset_name='VOC0712', mosaic=False):
        self.root = root
        self.img_size = img_size
        self.image_set = image_sets
        self.transform = transform
        self.target_transform = target_transform
        self.name = dataset_name
        # Both templates are filled with a (rootpath, image_id) tuple.
        self._annopath = osp.join('%s', 'Annotations', '%s.xml')
        self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
        self.ids = list()  # (rootpath, image_id) for every sample
        self.mosaic = mosaic
        for (year, name) in image_sets:
            rootpath = osp.join(self.root, 'VOC' + year)
            list_file = osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')
            # Context manager so the list file is closed deterministically.
            with open(list_file) as id_file:
                for line in id_file:
                    image_id = line.strip()
                    if image_id:  # a blank line would yield an unusable empty id
                        self.ids.append((rootpath, image_id))

    def __getitem__(self, index):
        """Return ``(im, gt)`` for sample ``index``.

        These are the first two values of ``pull_item``; the original image
        height and width are dropped.
        """
        im, gt, h, w = self.pull_item(index)

        return im, gt

    def __len__(self):
        return len(self.ids)

    def pull_item(self, index):
        """Load one sample.

        Returns:
            img: torch tensor in (c, h, w) layout. When ``transform`` is set the
                channel order is reversed relative to what ``cv2.imread`` returned.
            target: when ``transform`` is set, a numpy array of shape
                (num_bbox, 5) with rows ``[xmin, ymin, xmax, ymax, label]`` as
                produced by ``target_transform`` (a single all-zero row if the
                image has no objects); otherwise the unmodified
                ``target_transform`` output.
            height: height of the image as read from disk.
            width: width of the image as read from disk.
            When the mosaic branch is taken, the return value of
            ``self.mosaic_augmentation`` is returned instead.
        Raises:
            IOError: if the image file cannot be read.
        """
        img_id = self.ids[index]

        # Parse the XML document and keep its root element.
        target = ET.parse(self._annopath % img_id).getroot()
        img_file = self._imgpath % img_id
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread signals failure with None; fail with a clear message
            # rather than an AttributeError on ``img.shape``.
            raise IOError('Failed to read image: %s' % img_file)
        height, width, channels = img.shape

        if self.target_transform is not None:
            target = self.target_transform(target, width, height)

        # Mosaic augmentation: taken with probability 1/2 when enabled.
        if self.mosaic and np.random.randint(2):
            return self.mosaic_augmentation(img=img, target=target, index=index)
        # Basic augmentation (e.g. BaseTransform).
        if self.transform is not None:
            if len(target) == 0:
                # No objects: use one all-zero row so the slicing below works.
                target = np.zeros([1, 5])
            else:
                target = np.array(target)  # one row per object
            img, boxes, labels = self.transform(img, boxes=target[:, :4], labels=target[:, 4])
            # Reverse the channel order (BGR -> RGB).
            img = img[:, :, (2, 1, 0)]
            target = np.hstack((boxes, np.expand_dims(labels, axis=1)))  # (n,4) + (n,1) -> (n,5)
        return torch.from_numpy(img).permute(2, 0, 1), target, height, width

    def pull_image(self, index):
        '''Return the untransformed image at ``index`` together with its id.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to show
        Return:
            (image, img_id): the result of ``cv2.imread`` (``None`` if the file
            cannot be read) and the ``(rootpath, image_id)`` tuple.
        '''
        img_id = self.ids[index]
        return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id

    def pull_anno(self, index):
        '''Return the annotation of the image at ``index``.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            (image_id, gt): the image id string and the output of
            ``target_transform(xml_root, 1, 1)``, i.e. the coordinates are
            divided by 1 and therefore not normalized.
        '''
        img_id = self.ids[index]
        anno = ET.parse(self._annopath % img_id).getroot()
        gt = self.target_transform(anno, 1, 1)
        return img_id[1], gt


class BaseTransform:
    """Minimal image transform: resize to a fixed size, then subtract a mean.

    Arguments:
        size: two-element sequence; ``(size[0], size[1])`` is passed to
            ``cv2.resize`` as the target size.
        mean: value(s) subtracted from the resized image; stored as a float32 array.
    """

    def __init__(self, size, mean):
        self.size = size
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(resized_float32_image - mean, boxes, labels)``.

        ``boxes`` and ``labels`` are passed through untouched.
        """
        target_size = (self.size[0], self.size[1])
        resized = cv2.resize(image, target_size)
        centered = resized.astype(np.float32)
        centered -= self.mean
        return centered, boxes, labels


if __name__ == "__main__":
    # Visual sanity check: draw the ground-truth boxes of up to 1000 samples.
    img_size = 640
    dataset = VOCDetection(VOC_ROOT, img_size, [('2007', 'trainval')],
                           transform=BaseTransform(size=[img_size, img_size], mean=(0, 0, 0)),  # resize, -mean
                           target_transform=VOCAnnotationTransform(),
                           mosaic=True)
    # Cap the loop at the dataset length so small datasets do not raise IndexError.
    for i in range(min(1000, len(dataset))):
        # im: channel-first image tensor; gt: rows of [xmin, ymin, xmax, ymax, class index]
        # with normalized coordinates; h, w: size values returned by pull_item.
        im, gt, h, w = dataset.pull_item(i)
        img = im.permute(1, 2, 0).numpy()[:, :, (2, 1, 0)].astype(np.uint8)  # rgb to bgr
        # Round trip through a JPEG file; presumably to get a plain array that
        # the cv2 drawing calls accept -- TODO confirm.
        cv2.imwrite('-1.jpg', img)
        img = cv2.imread('-1.jpg')

        for box in gt:  # every annotated box of this image
            # Normalized coordinates times the resized side length give pixel positions.
            xmin, ymin, xmax, ymax, cls_idx = box
            xmin *= img_size
            ymin *= img_size
            xmax *= img_size
            ymax *= img_size
            cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255), 2)
            cv2.putText(img, VOC_CLASSES[int(cls_idx)], (int(xmin), int(ymin)), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.8, color=(255, 0, 0), thickness=2)
        cv2.imshow('gt', img)
        cv2.waitKey(0)

3. mosaic增强（可选操作）

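这部分不是重点，且下面的写法比较费解，可以忽略；后续有时间再实现一个简易版。注意：下面的 mosaic_augmentation 是 VOCDetection 类的成员方法（pull_item 中通过 self.mosaic_augmentation 调用），使用时需要把它放进该类中。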

def mosaic_augmentation(self, img, index, target):
    """Build a 4-image mosaic from the given sample and three random others.

    Reads ``self.ids``, ``self._imgpath``, ``self._annopath``,
    ``self.target_transform``, ``self.transform`` and ``self.img_size``.

    Arguments:
        img: image of the current sample, indexable as (h, w, c).
        index (int): position of the current sample in ``self.ids``; it is
            excluded from the random partners.
        target: boxes of the current sample, rows of
            ``[xmin, ymin, xmax, ymax, label]`` with coordinates normalized
            to ``img``.
    Returns:
        (mosaic_img, mosaic_tg, self.img_size, self.img_size) where mosaic_img
        is a float tensor in (c, h, w) layout and mosaic_tg has rows of
        ``[xmin, ymin, xmax, ymax, label]`` with coordinates normalized to the
        mosaic canvas (a single all-zero row if no image had any box).
    Raises:
        IOError: if one of the partner images cannot be read.
        ValueError: from ``random.sample`` if fewer than three other samples exist.
    """
    # Every sample except the current one may be picked as a partner.
    ids_list_ = self.ids[:index] + self.ids[index + 1:]
    id2, id3, id4 = random.sample(ids_list_, 3)
    img_lists = [img]
    tg_lists = [target]
    for id_ in (id2, id3, id4):
        img_file_ = self._imgpath % id_
        img_ = cv2.imread(img_file_)
        if img_ is None:
            # cv2.imread signals failure with None.
            raise IOError('Failed to read image: %s' % img_file_)
        height_, width_, channels_ = img_.shape

        target_ = ET.parse(self._annopath % id_).getroot()
        target_ = self.target_transform(target_, width_, height_)

        img_lists.append(img_)
        tg_lists.append(target_)

    # The canvas is twice the target size in each dimension.
    canvas = self.img_size * 2
    mosaic_img = np.zeros([canvas, canvas, img.shape[2]], dtype=np.uint8)
    # Mosaic center: drawn uniformly from the middle half of the canvas.
    border = -self.img_size // 2
    yc, xc = [int(random.uniform(-border, canvas + border)) for _ in range(2)]

    mosaic_tg = []
    for i in range(4):
        img_i, target_i = img_lists[i], tg_lists[i]
        h0, w0, _ = img_i.shape

        # Scale the tile so that its longer side equals img_size.
        r = self.img_size / max(h0, w0)
        if r != 1:
            img_i = cv2.resize(img_i, (int(w0 * r), int(h0 * r)))
        h, w, _ = img_i.shape

        # "a" coordinates address the canvas, "b" coordinates the tile.
        if i == 0:  # top left
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, canvas), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(canvas, yc + h)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, canvas), min(canvas, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b]
        # Offset of the tile's origin inside the canvas.
        padw = x1a - x1b
        padh = y1a - y1b

        # Labels: normalized tile coordinates -> canvas pixel coordinates.
        target_i = np.array(target_i)
        target_i_ = target_i.copy()
        if len(target_i) > 0:
            target_i_[:, 0] = (w * (target_i[:, 0]) + padw)
            target_i_[:, 1] = (h * (target_i[:, 1]) + padh)
            target_i_[:, 2] = (w * (target_i[:, 2]) + padw)
            target_i_[:, 3] = (h * (target_i[:, 3]) + padh)

            mosaic_tg.append(target_i_)

    if len(mosaic_tg) == 0:
        # No boxes in any tile: one all-zero row keeps the slicing below valid.
        mosaic_tg = np.zeros([1, 5])
    else:
        mosaic_tg = np.concatenate(mosaic_tg, axis=0)
        # Clip boxes to the canvas, then normalize by the canvas size.
        np.clip(mosaic_tg[:, :4], 0, canvas, out=mosaic_tg[:, :4])
        mosaic_tg[:, :4] /= canvas

    # Apply the regular transform to the assembled canvas.
    mosaic_img, boxes, labels = self.transform(mosaic_img, mosaic_tg[:, :4], mosaic_tg[:, 4])
    # Reverse the channel order (BGR -> RGB).
    mosaic_img = mosaic_img[:, :, (2, 1, 0)]
    mosaic_tg = np.hstack((boxes, np.expand_dims(labels, axis=1)))

    return torch.from_numpy(mosaic_img).permute(2, 0, 1).float(), mosaic_tg, self.img_size, self.img_size

4. voc2labelme

import json
import os
import sys

import cv2


if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET


# Names of the 20 PASCAL VOC object categories. VOCAnnotationParser builds its
# default name -> index mapping from the positions in this tuple.
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')


class VOCAnnotationParser(object):
    """Extract pixel-coordinate boxes and class names from a VOC XML annotation.

    Arguments:
        class_to_ind (dict, optional): mapping of class name -> integer index
            (default: the position of each name in VOC_CLASSES). It is stored
            but not used by ``__call__``, which returns class names.
        keep_difficult (bool, optional): keep objects whose ``<difficult>`` tag
            is 1 (default: False)
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """Extract every kept object of one annotation.

        Arguments:
            target (ET.Element): root element of the annotation XML.
            width: unused; accepted so the call signature matches callers.
            height: unused; accepted so the call signature matches callers.
        Returns:
            list of ``[xmin, ymin, xmax, ymax, name]`` with integer pixel
            coordinates as written in the XML and the lower-cased class name.
        """
        res = []
        for obj in target.iter('object'):
            # Custom VOC-style datasets often omit <difficult>; a missing or
            # empty tag is treated as "not difficult" instead of crashing.
            difficult_node = obj.find('difficult')
            difficult_text = ''
            if difficult_node is not None and difficult_node.text is not None:
                difficult_text = difficult_node.text.strip()
            difficult = bool(difficult_text) and int(difficult_text) == 1
            if not self.keep_difficult and difficult:
                continue

            name = obj.find('name').text.lower().strip()  # class name
            bbox = obj.find('bndbox')  # box corner container

            # int(float(...)) also accepts coordinates written as "263.0".
            bndbox = [int(float(bbox.find(pt).text))
                      for pt in ('xmin', 'ymin', 'xmax', 'ymax')]
            bndbox.append(name)  # class name goes after the four coordinates
            res.append(bndbox)

        return res  # [[xmin, ymin, xmax, ymax, name], ... ]


def voc_dict_to_label_dict(img, voc_output, label_dict):
    """Fill a labelme-style dict with shapes and the image size, in place.

    Arguments:
        img: image array; only ``img.shape[:2]`` (height, width) is read.
        voc_output (dict): output dict; must already hold a ``"shapes"`` list.
            Shapes are appended to it and ``"imageHeight"`` / ``"imageWidth"``
            are set.
        label_dict: sequence of ``[xmin, ymin, xmax, ymax, name]`` entries
            (despite the parameter name, it is iterated like a list).
    Returns:
        None.
    """
    h, w = img.shape[:2]
    # Record the image size for every image, including those without labels
    # (previously it was only written when at least one label existed).
    voc_output["imageHeight"] = h
    voc_output["imageWidth"] = w

    if len(label_dict) == 0:
        # No objects: emit a single placeholder "bg" shape without points.
        voc_output["shapes"].append({'points': None,
                                     'group_id': None,
                                     "label": "bg",
                                     "shape_type": "polygon",
                                     "flags": {}
                                     })
        return

    for label in label_dict:
        # A rectangle is described by its top-left and bottom-right corners.
        top_left = [int(label[0]), int(label[1])]
        bottom_right = [int(label[2]), int(label[3])]
        voc_output["shapes"].append({'points': [top_left, bottom_right],
                                     'group_id': None,
                                     "label": label[4],
                                     "shape_type": "rectangle",
                                     "flags": {}
                                     })


def voc2json(img_path, label_dict):
    """Write a labelme-style JSON file next to an image.

    The JSON file has the same path as the image with its extension replaced
    by ``.json``.

    Arguments:
        img_path (str): path of the image file; it is read to obtain its size.
        label_dict: sequence of ``[xmin, ymin, xmax, ymax, name]`` entries.
    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure with None.
        raise IOError('Failed to read image: %s' % img_path)

    voc_output = {
        "version": "3.16.7",
        "flags": {},
        "imagePath": os.path.basename(img_path),
        "shapes": [],
        "imageData": None}  # image bytes are not embedded

    voc_dict_to_label_dict(img, voc_output, label_dict)

    # Swap only the trailing extension. str.replace would rewrite every
    # occurrence of the extension in the path, and would corrupt the whole
    # path when the file has no extension.
    json_full_path = os.path.splitext(img_path)[0] + ".json"

    with open(json_full_path, 'w') as output_json_file:
        json.dump(voc_output, output_json_file, indent=4)


if __name__ == '__main__':
    # Demo: parse one annotation, preview its boxes, then export a labelme JSON.
    img_path = r"F:\zxq\data\self\VOCdevkit\VOC2007-new\train\images\000009.jpg"
    anno_path = r"F:\zxq\data\self\VOCdevkit\VOC2007-new\train\labels\000009.xml"

    target = ET.parse(anno_path).getroot()  # root element of the annotation
    img = cv2.imread(img_path)
    h, w, c = img.shape

    target_trans = VOCAnnotationParser(keep_difficult=True)
    label_info = target_trans(target, w, h)
    print(label_info)

    # Draw each box in red and its class name in green just inside the corner.
    for xmin, ymin, xmax, ymax, cls_name in label_info:
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)
        cv2.putText(img, cls_name, (xmin + 2, ymin + 8), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 255, 0), 1)

    # Show the preview and block until a key is pressed.
    cv2.namedWindow("img", cv2.WINDOW_NORMAL)
    cv2.imshow("img", img)
    cv2.waitKey()

    voc2json(img_path, label_info)