下载链接:
链接:https://pan.baidu.com/s/1L_tCyT3zr4vWcSW6Eyoxeg
提取码:8ful
--来自百度网盘超级会员V3的分享
目录
1. 标注文件内容
Annotations/000005.xml,主要内容如下。
<annotation>
<folder>VOC2007</folder>
<filename>000005.jpg</filename> # 对应图片文件名
<source>
<database>The VOC2007 Database</database>
<annotation>PASCAL VOC2007</annotation>
<image>flickr</image>
<flickrid>325991873</flickrid>
</source>
<owner>
<flickrid>archintent louisville</flickrid>
<name>?</name>
</owner>
<size> # 图像原始尺寸
<width>500</width>
<height>375</height>
<depth>3</depth>
</size>
<segmented>0</segmented> # 是否用于分割
<object>
<name>chair</name> # 物体类别
<pose>Rear</pose> # 拍摄角度:front, rear, left, right, unspecified
<truncated>0</truncated> # 目标是否被截断,或者被遮挡(超过15%)
<difficult>0</difficult> # 检测难易程度,这个主要是根据目标的大小,光照变化,图片质量来判断
<bndbox> # 目标位置
<xmin>263</xmin>
<ymin>211</ymin>
<xmax>324</xmax>
<ymax>339</ymax>
</bndbox>
</object>
</annotation>
2. python解析代码(Dataset)
"""
return:
    img: tensor. RGB. (c, h, w). The resized image.
    gt: numpy. (num_bbox, 5). Box coordinates normalized by the original
        image size, followed by the class index:
        [xmin/w, ymin/h, xmax/w, ymax/h, label_ind]
        e.g. [[0.524, 0.56, 0.646, 0.90133333, 8], [...], ...]
"""
"""VOC Dataset Classes
Original author: Francisco Massa
https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
Updated by: Ellis Brown, Max deGroot
"""
import os.path as osp
import sys
import torch
import torch.utils.data as data
import cv2
import numpy as np
import random
if sys.version_info[0] == 2:
import xml.etree.cElementTree as ET
else:
import xml.etree.ElementTree as ET
# The 20 PASCAL VOC object categories. A name's position in this tuple is the
# integer label produced by VOCAnnotationTransform's default mapping
# (e.g. 'aeroplane' -> 0, 'chair' -> 8).
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')
# Dataset root: a "VOCdevkit" directory located next to this source file.
path_to_dir = osp.dirname(osp.abspath(__file__))
VOC_ROOT = path_to_dir + "/VOCdevkit/"
# Alternative: point VOC_ROOT at an absolute location instead, e.g.
# VOC_ROOT = "/home/k303/object-detection/dataset/VOCdevkit/"
class VOCAnnotationTransform(object):
    """Convert a VOC XML annotation into normalized boxes with class indices.

    Arguments:
        class_to_ind (dict, optional): mapping of class name -> integer index,
            e.g. {"cat": 0, "dog": 1}
            (default: position of the name in VOC_CLASSES)
        keep_difficult (bool, optional): keep objects flagged as difficult
            (default: False)
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        # A falsy mapping (None or an empty dict) falls back to VOC_CLASSES order.
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """Extract every kept object of one annotation.

        Arguments:
            target (ET.Element): root element of the parsed annotation XML.
            width (int): image width used to normalize x coordinates.
            height (int): image height used to normalize y coordinates.
        Returns:
            a list with one entry per kept object:
            [xmin/width, ymin/height, xmax/width, ymax/height, label_ind]
        Raises:
            KeyError: if an object's class name is not in ``class_to_ind``.
        """
        res = []
        for obj in target.iter('object'):
            # Annotations written by some tools omit <difficult>; treat a
            # missing or empty tag as "not difficult" instead of crashing.
            difficult_node = obj.find('difficult')
            difficult = (difficult_node is not None
                         and int(difficult_node.text or 0) == 1)
            if difficult and not self.keep_difficult:
                continue

            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')

            bndbox = []
            for i, pt in enumerate(('xmin', 'ymin', 'xmax', 'ymax')):
                # int(float(...)) accepts both "45" and "45.7".
                # The -1 shifts VOC's 1-based pixel coordinates to 0-based.
                cur_pt = int(float(bbox.find(pt).text)) - 1
                # Even positions are x values (scaled by width), odd are y.
                cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
                bndbox.append(cur_pt)

            bndbox.append(self.class_to_ind[name])
            res.append(bndbox)  # [xmin, ymin, xmax, ymax, label_ind]
        return res  # [[xmin, ymin, xmax, ymax, label_ind], ...]
class VOCDetection(data.Dataset):
    """PASCAL VOC detection dataset: yields an image and its box annotations.

    Arguments:
        root (string): filepath to the VOCdevkit folder.
        img_size (int): stored as ``self.img_size`` (e.g. 640); read by
            ``mosaic_augmentation``.
        image_sets (list): (year, split) pairs to load, e.g.
            [('2007', 'trainval'), ('2012', 'trainval')]
        transform (callable, optional): called as
            ``transform(img, boxes=..., labels=...)``; must return
            ``(img, boxes, labels)``.
        target_transform (callable, optional): called as
            ``target_transform(xml_root, width, height)``.
        dataset_name (string, optional): stored as ``self.name``
            (default: 'VOC0712')
        mosaic (bool, optional): when True, about half of the samples are
            produced by ``self.mosaic_augmentation`` (default: False)

    NOTE: ``mosaic_augmentation`` is not defined inside this class body; it
    must be available as a method of the class before ``mosaic=True`` is used.
    """

    def __init__(self, root, img_size,
                 image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
                 transform=None, target_transform=VOCAnnotationTransform(),
                 dataset_name='VOC0712', mosaic=False):
        self.root = root
        self.img_size = img_size
        self.image_set = image_sets
        self.transform = transform
        self.target_transform = target_transform
        self.name = dataset_name
        # Path templates filled with a (rootpath, image_id) pair.
        self._annopath = osp.join('%s', 'Annotations', '%s.xml')
        self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
        self.ids = list()  # (rootpath, image_id) for every sample
        self.mosaic = mosaic
        for (year, name) in image_sets:
            rootpath = osp.join(self.root, 'VOC' + year)
            list_file = osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')
            # The context manager closes the list file once it has been read.
            with open(list_file) as f:
                for line in f:
                    image_id = line.strip()
                    if image_id:  # a blank line is not a valid image id
                        self.ids.append((rootpath, image_id))

    def __getitem__(self, index):
        """Return ``(im, gt)`` for sample ``index``.

        im: tensor, (c, h, w), the image returned by ``pull_item``.
        gt: (num_bbox, 5) annotations,
            [xmin/w, ymin/h, xmax/w, ymax/h, label_ind], e.g.
            [[0.524, 0.56, 0.646, 0.90133333, 8], ...]
        """
        im, gt, h, w = self.pull_item(index)
        return im, gt

    def __len__(self):
        return len(self.ids)

    def pull_item(self, index):
        """Load one sample together with the original image size.

        Returns:
            img: tensor, (c, h, w). RGB after ``self.transform``; when
                ``self.transform`` is None the channel order is left as
                loaded by ``cv2.imread``.
            target: numpy array (num_bbox, 5) when ``self.transform`` is set,
                otherwise whatever ``self.target_transform`` returned.
                Rows are [xmin/w, ymin/h, xmax/w, ymax/h, label_ind].
            height: height of the original image.
            width: width of the original image.
            When the mosaic branch is taken, the return value of
            ``self.mosaic_augmentation`` is passed through unchanged.
        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        img_id = self.ids[index]
        # Root element of the parsed annotation XML.
        target = ET.parse(self._annopath % img_id).getroot()

        img_path = self._imgpath % img_id
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with a clear
            # message instead of an AttributeError on ``None.shape``.
            raise FileNotFoundError("Cannot read image: %s" % img_path)
        height, width = img.shape[:2]

        if self.target_transform is not None:
            # [[xmin, ymin, xmax, ymax, label_ind], ...]
            target = self.target_transform(target, width, height)

        # Mosaic augmentation: applied to roughly half of the samples.
        if self.mosaic and np.random.randint(2):
            return self.mosaic_augmentation(img=img, target=target, index=index)

        # Basic augmentation (e.g. BaseTransform).
        if self.transform is not None:
            if len(target) == 0:
                # No object kept: use a single all-zero row as a placeholder.
                # The original author's note says class 0 here is harmless
                # because class loss is only computed where an object exists;
                # that depends on the loss implementation.
                target = np.zeros([1, 5])
            else:
                target = np.array(target)  # list -> (num_bbox, 5)
            img, boxes, labels = self.transform(img, boxes=target[:, :4], labels=target[:, 4])
            # BGR -> RGB
            img = img[:, :, (2, 1, 0)]
            # (num_bbox, 4) + (num_bbox, 1) -> (num_bbox, 5)
            target = np.hstack((boxes, np.expand_dims(labels, axis=1)))
        return torch.from_numpy(img).permute(2, 0, 1), target, height, width

    def pull_image(self, index):
        '''Return the untransformed image at ``index`` and its id.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to show
        Return:
            (img, img_id): the result of ``cv2.imread`` (None if the file
            cannot be read) and the (rootpath, image_id) tuple.
        '''
        img_id = self.ids[index]
        return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id

    def pull_anno(self, index):
        '''Return the annotation of the image at ``index``.

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to get annotation of
        Return:
            (image_id, gt): ``gt`` is the output of
            ``self.target_transform(root, 1, 1)``. Width and height are both
            passed as 1, so a transform that divides by them leaves the
            coordinates unscaled.
        '''
        img_id = self.ids[index]
        anno = ET.parse(self._annopath % img_id).getroot()
        gt = self.target_transform(anno, 1, 1)
        return img_id[1], gt
class BaseTransform:
    """Resize an image to a fixed size and subtract a per-channel mean.

    Arguments:
        size: indexable pair; ``size[0]`` and ``size[1]`` are passed to
            ``cv2.resize`` as the target size.
        mean: per-channel values subtracted from the resized image.
    """

    def __init__(self, size, mean):
        self.mean = np.array(mean, dtype=np.float32)
        self.size = size

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(resized_image_minus_mean, boxes, labels)``.

        ``boxes`` and ``labels`` are returned untouched.
        """
        target_size = (self.size[0], self.size[1])
        resized = cv2.resize(image, target_size)
        normalized = resized.astype(np.float32)
        normalized -= self.mean
        return normalized, boxes, labels
if __name__ == "__main__":
    # Visual sanity check: draw the ground-truth boxes on each loaded sample.
    img_size = 640
    dataset = VOCDetection(VOC_ROOT, img_size, [('2007', 'trainval')],
                           transform=BaseTransform(size=[img_size, img_size], mean=(0, 0, 0)),
                           target_transform=VOCAnnotationTransform(),
                           mosaic=True)
    # Show at most 1000 samples without indexing past a smaller dataset.
    for i in range(min(1000, len(dataset))):
        # im: (c, h, w) RGB tensor; gt: (num_bbox, 5); h, w: original size
        im, gt, h, w = dataset.pull_item(i)
        # CHW RGB tensor -> HWC BGR uint8 image for OpenCV.
        img = im.permute(1, 2, 0).numpy()[:, :, (2, 1, 0)].astype(np.uint8)
        # OpenCV drawing needs a contiguous buffer; copying in memory avoids
        # the lossy round-trip through a temporary JPEG file on disk.
        img = np.ascontiguousarray(img)
        for box in gt:
            # Boxes are normalized; scale them to the displayed image size.
            xmin, ymin, xmax, ymax, cls_idx = box
            xmin *= img_size
            ymin *= img_size
            xmax *= img_size
            ymax *= img_size
            cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255), 2)
            cv2.putText(img, VOC_CLASSES[int(cls_idx)], (int(xmin), int(ymin)), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=0.8, color=(255, 0, 0), thickness=2)
        cv2.imshow('gt', img)
        cv2.waitKey(0)
3. mosaic增强 (可选操作)
这部分不是重点,且下面的写法比较费解,可以忽略;后续有时间再实现一个简易版。注意:下面的 mosaic_augmentation 用到了 self,它是 VOCDetection 类的方法,需要放进该类中(整体缩进一级)才能被 pull_item 调用。
def mosaic_augmentation(self, img, index, target):
    """Build a 4-image mosaic from ``img`` and three other random samples.

    Arguments:
        img: the already-loaded image for ``index`` (H x W x C array).
        index (int): position of ``img`` in ``self.ids``; excluded from the
            random draw of the three other images.
        target: annotations of ``img`` as produced by
            ``self.target_transform`` (rows of normalized box + label).
    Returns:
        (image tensor (c, h, w) as float, mosaic_tg (num_bbox, 5),
         self.img_size, self.img_size)

    NOTE(review): ``self.transform`` and ``self.target_transform`` are called
    unconditionally, so both must be set. ``random.sample`` needs at least
    three other entries in ``self.ids``.
    """
    # Every sample id except the current one.
    ids_list_ = self.ids[:index] + self.ids[index + 1:]
    # randomly sample 3 other ids
    id2, id3, id4 = random.sample(ids_list_, 3)
    ids = [id2, id3, id4]
    img_lists = [img]
    tg_lists = [target]
    # Load the three extra images and their annotations.
    for id_ in ids:
        img_ = cv2.imread(self._imgpath % id_)
        height_, width_, channels_ = img_.shape
        target_ = ET.parse(self._annopath % id_).getroot()
        target_ = self.target_transform(target_, width_, height_)
        img_lists.append(img_)
        tg_lists.append(target_)
    # Blank canvas of side 2 * img_size.
    mosaic_img = np.zeros([self.img_size * 2, self.img_size * 2, img.shape[2]], dtype=np.uint8)
    # mosaic center: with x = -img_size // 2 each coordinate is drawn from
    # [img_size / 2, 1.5 * img_size]
    yc, xc = [int(random.uniform(-x, 2 * self.img_size + x)) for x in
              [-self.img_size // 2, -self.img_size // 2]]
    mosaic_tg = []
    for i in range(4):
        img_i, target_i = img_lists[i], tg_lists[i]
        h0, w0, _ = img_i.shape
        # resize so that the longer side equals img_size
        r = self.img_size / max(h0, w0)
        if r != 1:  # skip the resize only when the image already has the right size
            img_i = cv2.resize(img_i, (int(w0 * r), int(h0 * r)))
        h, w, _ = img_i.shape
        # place the tile on the canvas; "a" coordinates are on the canvas,
        # "b" coordinates are inside the tile
        if i == 0:  # top left
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, self.img_size * 2), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(self.img_size * 2, yc + h)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, self.img_size * 2), min(self.img_size * 2, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
        mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b]
        # Offset that maps tile coordinates to canvas coordinates.
        padw = x1a - x1b
        padh = y1a - y1b
        # labels
        target_i = np.array(target_i)
        target_i_ = target_i.copy()
        if len(target_i) > 0:
            # a valid target: convert normalized boxes to canvas pixels
            target_i_[:, 0] = (w * (target_i[:, 0]) + padw)
            target_i_[:, 1] = (h * (target_i[:, 1]) + padh)
            target_i_[:, 2] = (w * (target_i[:, 2]) + padw)
            target_i_[:, 3] = (h * (target_i[:, 3]) + padh)
            mosaic_tg.append(target_i_)
    if len(mosaic_tg) == 0:
        # No boxes in any tile: single all-zero placeholder row.
        mosaic_tg = np.zeros([1, 5])
    else:
        mosaic_tg = np.concatenate(mosaic_tg, axis=0)
        # Clip boxes to the canvas.
        # NOTE(review): boxes are clipped to the canvas only, not to the
        # visible part of each tile - confirm this is intended.
        np.clip(mosaic_tg[:, :4], 0, 2 * self.img_size, out=mosaic_tg[:, :4])
        # normalize by the canvas side
        mosaic_tg[:, :4] /= (self.img_size * 2)
    # augment
    mosaic_img, boxes, labels = self.transform(mosaic_img, mosaic_tg[:, :4], mosaic_tg[:, 4])
    # BGR -> RGB
    mosaic_img = mosaic_img[:, :, (2, 1, 0)]
    # img = img.transpose(2, 0, 1)
    mosaic_tg = np.hstack((boxes, np.expand_dims(labels, axis=1)))
    # NOTE(review): scale and offset are assigned but never used below.
    scale = np.array([[1., 1., 1., 1.]])
    offset = np.zeros([1, 4])
    return torch.from_numpy(mosaic_img).permute(2, 0, 1).float(), mosaic_tg, self.img_size, self.img_size
4. voc2labelme
import json
import os
import sys
import cv2
if sys.version_info[0] == 2:
import xml.etree.cElementTree as ET
else:
import xml.etree.ElementTree as ET
# The 20 PASCAL VOC category names. The tuple order defines the default
# class_to_ind mapping built in VOCAnnotationParser.__init__.
VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')
class VOCAnnotationParser(object):
    """Extract pixel-space boxes and class names from a VOC XML annotation.

    Arguments:
        class_to_ind (dict, optional): mapping of class name -> integer index
            (default: position of the name in VOC_CLASSES). It is stored on
            the instance but not used by ``__call__``, which returns names.
        keep_difficult (bool, optional): keep objects flagged as difficult
            (default: False)
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        # A falsy mapping (None or an empty dict) falls back to VOC_CLASSES order.
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width=None, height=None):
        """Return the kept objects of one annotation.

        Arguments:
            target (ET.Element): root element of the parsed annotation XML.
            width, height: accepted for signature compatibility with a
                normalizing transform; they are not used because the
                coordinates are returned unscaled.
        Returns:
            a list with one entry per kept object:
            [xmin, ymin, xmax, ymax, class_name], coordinates as integers
            exactly as stored in the XML (no -1 shift, no normalization).
        """
        res = []
        for obj in target.iter('object'):
            # Annotations written by some tools omit <difficult>; treat a
            # missing or empty tag as "not difficult" instead of crashing.
            difficult_node = obj.find('difficult')
            difficult = (difficult_node is not None
                         and int(difficult_node.text or 0) == 1)
            if difficult and not self.keep_difficult:
                continue

            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')
            # int(float(...)) accepts both "45" and "45.7".
            bndbox = [int(float(bbox.find(pt).text))
                      for pt in ('xmin', 'ymin', 'xmax', 'ymax')]
            bndbox.append(name)
            res.append(bndbox)  # [xmin, ymin, xmax, ymax, class_name]
        return res  # [[xmin, ymin, xmax, ymax, class_name], ...]
def voc_dict_to_label_dict(img, voc_output, label_dict):
    """Fill a labelme-style dict with shapes and the image size, in place.

    Arguments:
        img: image array; only ``img.shape[:2]`` (height, width) is read, so
            both colour (H, W, C) and grayscale (H, W) arrays are accepted.
        voc_output (dict): output dict with an existing ``"shapes"`` list;
            shapes are appended and ``"imageHeight"`` / ``"imageWidth"`` set.
        label_dict: sequence of [xmin, ymin, xmax, ymax, label] entries
            (a list despite the parameter name).
    """
    h, w = img.shape[:2]
    if len(label_dict) == 0:
        # No annotation: a single placeholder "bg" shape without points.
        # NOTE(review): confirm that consumers of the JSON accept
        # ``points: None``.
        shape_info = {'points': None,
                      'group_id': None,
                      "label": "bg",
                      "shape_type": "polygon",
                      "flags": {}
                      }
        voc_output["shapes"].append(shape_info)
    else:
        for label in label_dict:
            # A rectangle is described by two corners:
            # [xmin, ymin] and [xmax, ymax].
            pt_list = [[int(label[0]), int(label[1])],
                       [int(label[2]), int(label[3])]]
            shape_info = {'points': pt_list,
                          'group_id': None,
                          "label": label[4],
                          "shape_type": "rectangle",
                          "flags": {}
                          }
            voc_output["shapes"].append(shape_info)
    voc_output["imageHeight"] = h
    voc_output["imageWidth"] = w
def voc2json(img_path, label_dict):
    """Write a labelme-style JSON file next to ``img_path``.

    Arguments:
        img_path (str): path of the image; the JSON is written to the same
            path with the file extension replaced by ``.json``.
        label_dict: sequence of [xmin, ymin, xmax, ymax, label] entries,
            passed on to ``voc_dict_to_label_dict``.
    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None.
        raise FileNotFoundError("Cannot read image: %s" % img_path)

    voc_output = {
        "version": "3.16.7",
        "flags": {},
        "imagePath": os.path.basename(img_path),
        "shapes": [],
        # The image bytes are not embedded in the JSON.
        "imageData": None}
    voc_dict_to_label_dict(img, voc_output, label_dict)

    # Swap only the trailing extension. str.replace would rewrite every
    # occurrence of the extension in the path (directory names included) and,
    # for a path without an extension, insert ".json" between all characters.
    json_full_path = os.path.splitext(img_path)[0] + ".json"
    with open(json_full_path, 'w') as output_json_file:
        json.dump(voc_output, output_json_file, indent=4)
if __name__ == '__main__':
    # Demo: parse one VOC annotation, preview its boxes, then export it as
    # a labelme JSON file next to the image.
    img_path = r"F:\zxq\data\self\VOCdevkit\VOC2007-new\train\images\000009.jpg"
    anno_path = r"F:\zxq\data\self\VOCdevkit\VOC2007-new\train\labels\000009.xml"

    annotation_root = ET.parse(anno_path).getroot()  # root of the annotation XML
    img = cv2.imread(img_path)
    img_h, img_w, _ = img.shape

    parser = VOCAnnotationParser(keep_difficult=True)
    label_info = parser(annotation_root, img_w, img_h)
    print(label_info)

    # Draw each box in red and its class name in green.
    for xmin, ymin, xmax, ymax, cls_name in label_info:
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 0, 255), 2)
        cv2.putText(img, cls_name, (xmin + 2, ymin + 8), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 255, 0), 1)

    cv2.namedWindow("img", cv2.WINDOW_NORMAL)
    cv2.imshow("img", img)
    cv2.waitKey()

    voc2json(img_path, label_info)