import torch
import xml.etree.ElementTree as ET
import os
import cv2
import numpy as np
from torchvision import transforms
import random
def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w):
    """Compute where one tile goes on the mosaic canvas and which part of it fits.

    The canvas is ``2 * input_h`` rows by ``2 * input_w`` columns and the four
    tiles meet at the centre point ``(xc, yc)``.

    Args:
        mosaic_image: Unused; kept so existing callers keep working.
        mosaic_index: Quadrant of the tile: 0 = top-left, 1 = top-right,
            2 = bottom-left, 3 = bottom-right.
        xc, yc: Mosaic centre on the canvas (x is the column, y is the row).
        w, h: Width and height of the (already resized) tile.
        input_h, input_w: Half of the canvas height and width.

    Returns:
        ``((x1, y1, x2, y2), (sx1, sy1, sx2, sy2))``: the destination rectangle
        on the canvas and the source rectangle inside the tile.

    Raises:
        ValueError: If ``mosaic_index`` is not 0, 1, 2 or 3.
    """
    if mosaic_index == 0:
        # Top-left: the tile's bottom-right corner sits on the centre.
        x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
        # Whatever would fall off the canvas is cut from the tile's top/left.
        small_coord = w - (x2 - x1), h - (y2 - y1), w, h
    elif mosaic_index == 1:
        # Top-right: the tile's bottom-left corner sits on the centre.
        x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
        small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h
    elif mosaic_index == 2:
        # Bottom-left: the tile's top-right corner sits on the centre.
        x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
        small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h)
    elif mosaic_index == 3:
        # Bottom-right: the tile's top-left corner sits on the centre.
        x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h)  # noqa
        small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("mosaic_index must be 0, 1, 2 or 3, got %r" % (mosaic_index,))
    return (x1, y1, x2, y2), small_coord
class VOCDataset(torch.utils.data.Dataset):
    """PASCAL VOC detection dataset that yields 4-image mosaic samples.

    ``dataset[i]`` places image ``i`` and three randomly chosen images on a
    grey canvas of ``2 * input_h`` by ``2 * input_w`` pixels and returns that
    canvas together with the boxes and class ids of all four images, expressed
    in canvas coordinates.

    Expected layout under ``root_dir``: ``Annotations/<id>.xml``,
    ``JPEGImages/<id>.jpg`` and ``ImageSets/Main/<split>.txt``.
    """

    CLASSES_NAME = (
        "__background__ ",  # index 0 is reserved for the background class
        "aeroplane",
        "bicycle",
        "bird",
        "boat",
        "bottle",
        "bus",
        "car",
        "cat",
        "chair",
        "cow",
        "diningtable",
        "dog",
        "horse",
        "motorbike",
        "person",
        "pottedplant",
        "sheep",
        "sofa",
        "train",
        "tvmonitor",
    )

    def __init__(self, root_dir, resize_size=(800, 1024), split='test',
                 use_difficult=False, input_size=(640, 640), debug_dir=None):
        """Index the image ids of one VOC split.

        Args:
            root_dir: VOC root directory (the one containing ``JPEGImages``).
            resize_size: Accepted for backward compatibility; not used.
            split: Name of the image-set file under ``ImageSets/Main``.
            use_difficult: Keep objects flagged ``difficult`` when True.
            input_size: ``(height, width)`` of one mosaic quadrant; the
                returned canvas is twice as large along each axis.
            debug_dir: When set, the original tiles, the resized tiles and the
                final mosaic are written there as JPEG files for inspection.
        """
        self.root = root_dir
        self.use_difficult = use_difficult
        self.imgset = split
        self.input_size = tuple(input_size)
        self.debug_dir = debug_dir
        if self.debug_dir is not None:
            os.makedirs(self.debug_dir, exist_ok=True)

        self._annopath = os.path.join(self.root, "Annotations", "%s.xml")
        self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg")
        self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt")

        # One image id per line, e.g. ['000009', '000052']; blank lines are
        # skipped so they cannot turn into empty ids.
        with open(self._imgsetpath % self.imgset) as f:
            self.img_ids = [line.strip() for line in f if line.strip()]

        self.name2id = {name: idx for idx, name in enumerate(VOCDataset.CLASSES_NAME)}

        # Commonly used ImageNet channel statistics, kept for later
        # normalisation; this class does not apply them itself.
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        print("INFO=====>voc dataset init finished ! !")

    def __len__(self):
        """Return the number of images in the split."""
        return len(self.img_ids)

    def _read_img_rgb(self, path):
        """Load the image at ``path`` as an RGB ``uint8`` array of shape (H, W, 3).

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        img = cv2.imread(path)  # OpenCV decodes to BGR
        if img is None:
            raise FileNotFoundError("cannot read image: %s" % path)
        # Convert exactly once; reversing the channels and then calling
        # cvtColor (as before) swapped them twice and returned BGR again.
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def _load_annotation(self, img_id):
        """Parse the XML annotation of ``img_id``.

        Returns:
            ``(boxes, classes)``: a float32 array of shape (N, 4) holding
            0-based ``xmin, ymin, xmax, ymax`` in original image pixels, and a
            list of N class ids.

        Raises:
            KeyError: If an object name is not in ``CLASSES_NAME``.
        """
        anno = ET.parse(self._annopath % img_id).getroot()
        boxes = []
        classes = []
        for obj in anno.iter("object"):
            # A missing <difficult> tag is treated as "not difficult".
            difficult_node = obj.find("difficult")
            difficult = difficult_node is not None and int(difficult_node.text) == 1
            if difficult and not self.use_difficult:
                continue
            bndbox = obj.find("bndbox")
            # VOC coordinates are 1-based; shift them to 0-based pixel indices.
            boxes.append([
                float(bndbox.find(tag).text) - 1
                for tag in ("xmin", "ymin", "xmax", "ymax")
            ])
            name = obj.find("name").text.lower().strip()
            classes.append(self.name2id[name])
        # reshape keeps the (N, 4) layout even when no object was kept.
        return np.array(boxes, dtype=np.float32).reshape(-1, 4), classes

    def _dump_debug(self, filename, rgb_img):
        """Write ``rgb_img`` into ``debug_dir`` (no-op when debugging is off)."""
        if self.debug_dir is None:
            return
        # cv2.imwrite expects BGR channel order.
        cv2.imwrite(os.path.join(self.debug_dir, filename),
                    cv2.cvtColor(rgb_img, cv2.COLOR_RGB2BGR))

    def __getitem__(self, item):
        """Build the mosaic sample whose first tile is image ``item``.

        Returns:
            ``(img, boxes, classes)``: a float tensor of shape
            ``(3, 2 * input_h, 2 * input_w)`` scaled to [0, 1] by ``ToTensor``,
            a float32 tensor of shape (N, 4) with boxes in canvas pixels, and
            an int64 tensor of N class ids.

        Raises:
            IndexError: If ``item`` is out of range.
        """
        input_h, input_w = self.input_size

        # Look up the requested image first so a bad index raises IndexError,
        # then add three randomly chosen companions.
        tile_ids = [self.img_ids[item]]
        tile_ids += [random.choice(self.img_ids) for _ in range(3)]

        # Mosaic centre, sampled around the middle of the canvas.
        yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
        xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))

        # Grey (114) canvas of twice the input size.
        mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)
        mosaic_boxes = []
        mosaic_classes = []

        for i_mosaic, img_id in enumerate(tile_ids):
            img = self._read_img_rgb(self._imgpath % img_id)
            self._dump_debug("orial_" + str(i_mosaic) + ".jpg", img)

            # Scale by the tighter side so the whole tile fits one quadrant.
            h0, w0 = img.shape[:2]
            scale = min(1. * input_h / h0, 1. * input_w / w0)
            img = cv2.resize(
                img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
            )
            self._dump_debug("resize_" + str(i_mosaic) + ".jpg", img)
            h, w = img.shape[:2]

            (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
                mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w
            )
            # Paste the part of the tile that fits onto the canvas.
            mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]

            # Offset of the resized tile's origin on the canvas.
            pad_w, pad_h = l_x1 - s_x1, l_y1 - s_y1
            boxes, classes = self._load_annotation(img_id)
            if len(boxes):
                boxes = boxes * scale
                boxes[:, 0::2] += pad_w
                boxes[:, 1::2] += pad_h
                mosaic_boxes.append(boxes)
                mosaic_classes.extend(classes)

        self._dump_debug("mosaic.jpg", mosaic_img)

        if mosaic_boxes:
            boxes = np.concatenate(mosaic_boxes, axis=0)
            # Clip to the canvas, then drop boxes that ended up with no area,
            # i.e. objects lying entirely in a cropped-away part of a tile.
            boxes[:, 0::2] = boxes[:, 0::2].clip(0, 2 * input_w)
            boxes[:, 1::2] = boxes[:, 1::2].clip(0, 2 * input_h)
            keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
            boxes = boxes[keep]
            classes = np.asarray(mosaic_classes, dtype=np.int64)[keep]
        else:
            boxes = np.zeros((0, 4), dtype=np.float32)
            classes = np.zeros((0,), dtype=np.int64)

        # ToTensor converts HWC uint8 to CHW float in [0, 1].
        img = transforms.ToTensor()(mosaic_img)
        boxes = torch.from_numpy(boxes)
        classes = torch.from_numpy(classes).long()
        return img, boxes, classes
if __name__ == '__main__':
    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences (\P, \M, \d, \V) and trigger warnings in a normal literal.
    dataset = VOCDataset(r'D:\Py_code\Mosaic\data\VOC07_test\VOCdevkit\VOC2007')
    # Fetch the sample at index 1: image tensor, boxes and class ids.
    img, box, cls = dataset[1]
    print(img.shape)
    print(box)
    print(cls)
    # Quick visualisation (disabled). OpenCV works on numpy arrays while img is
    # a tensor, so convert it back to an HWC uint8 array (pixel range 0-255):
    # img_ = (img.numpy()*255).astype(np.uint8).transpose(1,2,0)
    # cv2.imshow('test',img_)
    # cv2.waitKey(0)
原始4张图片
缩放后图片
马赛克融合后的图片
图片中超出马赛克底板范围的部分都会被裁剪掉
x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
small_coord = w - (x2 - x1), h - (y2 - y1), w, h
这两行代码分别计算两个区域:(x1, y1, x2, y2) 是当前图片在马赛克底板上的粘贴位置(超出底板的部分会被截断);small_coord 是从缩放后的图片中实际截取的区域,两者大小一致
mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
利用这行代码,将缩放后图片中截取的区域粘贴到马赛克底板的对应位置上