datasets.py
import glob
import math
import os
import random
import shutil
from pathlib import Path
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from utils.utils import xyxy2xywh
class LoadImages: # for inference
def __init__(self, path, img_size=416):#传入参数为输入图片文件夹
self.height = img_size
img_formats = ['.jpg', '.jpeg', '.png', '.tif']
vid_formats = ['.mov', '.avi', '.mp4']
files = []
if os.path.isdir(path):
files = sorted(glob.glob('%s/*.*' % path))
elif os.path.isfile(path):
files = [path]
images = [x for x in files if os.path.splitext(x)[-1].lower() in img_formats]
videos = [x for x in files if os.path.splitext(x)[-1].lower() in vid_formats]
nI, nV = len(images), len(videos)
self.files = images + videos
self.nF = nI + nV # number of files
self.video_flag = [False] * nI + [True] * nV
self.mode = 'images'
if any(videos):
self.new_video(videos[0]) # new video
else:
self.cap = None
assert self.nF > 0, 'No images or videos found in ' + path #判断文件是否存在
def __iter__(self):
self.count = 0 #__iter__返回的self实例带着属性count=0会进入__next__并且不再回头直到__next__抛出异常,下次进入重新创建迭代对象
return self
def __next__(self):
if self.count == self.nF:
raise StopIteration
path = self.files[self.count]
if self.video_flag[self.count]:
# Read video
self.mode = 'video'
ret_val, img0 = self.cap.read()
if not ret_val:
self.count += 1
self.cap.release()
if self.count == self.nF: # last video
raise StopIteration
else:
path = self.files[self.count]
self.new_video(path)
ret_val, img0 = self.cap.read()
self.frame += 1
print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nF, self.frame, self.nframes, path), end='')
else:
# Read image
self.count += 1
img0 = cv2.imread(path) # BGR
assert img0 is not None, 'File Not Found ' + path
print('image %g/%g %s: ' % (self.count, self.nF, path), end='')
# Padded resize
#返回的img为按长边保持比例缩放到416后,放到图片中间,边缘填充灰色(因为color = (127.5, 127.5, 127.5)))的像素数组
# 其他三个是缩放比例ratio,宽高的总填充量
img, _, _, _ = letterbox(img0, new_shape=self.height)
# Normalize RGB
img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB 像素矩阵的翻转
img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32
img /= 255.0 # 0 - 255 to 0.0 - 1.0 #归一化
# cv2.imwrite(path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image
return path, img, img0, self.cap
def new_video(self, path):
self.frame = 0
self.cap = cv2.VideoCapture(path)
self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
def __len__(self): #自定义len(),调用len()会返回file数目
return self.nF # number of files
class LoadWebcam: # for inference
def __init__(self, img_size=416):
self.cam = cv2.VideoCapture(0)
self.height = img_size
def __iter__(self):
self.count = -1
return self
def __next__(self):
self.count += 1
if cv2.waitKey(1) == 27: # esc to quit
cv2.destroyAllWindows()
raise StopIteration
# Read image
ret_val, img0 = self.cam.read()
assert ret_val, 'Webcam Error'
img_path = 'webcam_%g.jpg' % self.count
img0 = cv2.flip(img0, 1) # flip left-right
print('webcam %g: ' % self.count, end='')
# Padded resize
img, _, _, _ = letterbox(img0, new_shape=self.height)
# Normalize RGB
img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB
img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
return img_path, img, img0, None
def __len__(self):
return 0
class LoadImagesAndLabels(Dataset): # for training/testing
def __init__(self, path, img_size=416, batch_size=16, augment=False, rect=True, image_weights=False,
multi_scale=False):
with open(path, 'r') as f:
img_files = f.read().splitlines()
self.img_files = list(filter(lambda x: len(x) > 0, img_files))
n = len(self.img_files)
bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index
nb = bi[-1] + 1 # number of batches
assert n > 0, 'No images found in %s' % path
self.n = n
self.batch = bi # batch index of image
self.img_size = img_size
self.augment = augment
self.image_weights = image_weights
self.rect = False if image_weights else rect
self.label_files = [x.replace('images', 'labels').
replace('.jpeg', '.txt').
replace('.jpg', '.txt').
replace('.bmp', '.txt').
replace('.png', '.txt') for x in self.img_files]
if multi_scale:
s = img_size / 32
self.multi_scale = ((np.linspace(0.5, 1.5, nb) * s).round().astype(np.int) * 32)
# Rectangular Training https://github.com/ultralytics/yolov3/issues/232
if self.rect:
from PIL import Image
# Read image shapes
sp = 'data' + os.sep + path.replace('.txt', '.shapes').split(os.sep)[-1] # shapefile path
if os.path.exists(sp): # read existing shapefile
with open(sp, 'r') as f:
s = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
assert len(s) == n, 'Shapefile out of sync, please delete %s and rerun' % sp
else: # no shapefile, so read shape using PIL and write shapefile for next time (faster)
s = np.array([Image.open(f).size for f in tqdm(self.img_files, desc='Reading image shapes')])
np.savetxt(sp, s, fmt='%g')
# Sort by aspect ratio
ar = s[:, 1] / s[:, 0] # aspect ratio
i = ar.argsort()
ar = ar[i]
self.img_files = [self.img_files[i] for i in i]
self.label_files = [self.label_files[i] for i in i]
# Set training image shapes
shapes = [[1, 1]] * nb
for i in range(nb):
ari = ar[bi == i]
mini, maxi = ari.min(), ari.max()
if maxi < 1:
shapes[i] = [maxi, 1]
elif mini > 1:
shapes[i] = [1, 1 / mini]
self.batch_shapes = np.ceil(np.array(shapes) * img_size / 32.).astype(np.int) * 32
# Preload labels (required for weighted CE training)
self.imgs = [None] * n
self.labels = [np.zeros((0, 5))] * n
iter = tqdm(self.label_files, desc='Reading labels') if n > 1000 else self.label_files
for i, file in enumerate(iter):
try:
with open(file, 'r') as f:
l = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
if l.shape[0]:
assert l.shape[1] == 5, '> 5 label columns: %s' % file
assert (l >= 0).all(), 'negative labels: %s' % file
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file
self.labels[i] = l
except:
pass # print('Warning: missing labels for %s' % self.img_files[i]) # missing label file
assert len(np.concatenate(self.labels, 0)) > 0, 'No labels found. Incorrect label paths provided.'
def __len__(self):
return len(self.img_files)
# def __iter__(self):
# self.count = -1
# print('ran dataset iter')
# #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF)
# return self
def __getitem__(self, index):
if self.image_weights:
index = self.indices[index]
img_path = self.img_files[index]
label_path = self.label_files[index]
# Load image
img = self.imgs[index]
if img is None:
img = cv2.imread(img_path) # BGR
assert img is not None, 'File Not Found ' + img_path
if self.n < 1001:
self.imgs[index] = img # cache image into memory
# Augment colorspace
#hsv色彩空间变换
augment_hsv = True
if self.augment and augment_hsv:
# SV augmentation by 50% 饱和度和明度提高50%
fraction = 0.50 # must be < 1.0
img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) # hue, sat, val 色彩空间变换
S = img_hsv[:, :, 1].astype(np.float32) # saturation
V = img_hsv[:, :, 2].astype(np.float32) # value
a = (random.random() * 2 - 1) * fraction + 1
b = (random.random() * 2 - 1) * fraction + 1
S *= a
V *= b
#与BGR类似,HSV的shape=(w,h,c),其中三通道的c[0,1,2]含有h,s,v信息
img_hsv[:, :, 1] = S if a < 1 else S.clip(None, 255)
img_hsv[:, :, 2] = V if b < 1 else V.clip(None, 255)
cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
# Letterbox
h, w, _ = img.shape
if self.rect:
shape = self.batch_shapes[self.batch[index]]
img, ratio, padw, padh = letterbox(img, new_shape=shape, mode='rect')
else:
shape = int(self.multi_scale[self.batch[index]]) if hasattr(self, 'multi_scale') else self.img_size
img, ratio, padw, padh = letterbox(img, new_shape=shape, mode='square')
# Load labels
labels = []
if os.path.isfile(label_path):
# with open(label_path, 'r') as f:
# x = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
x = self.labels[index]
if x.size > 0:
# Normalized xywh to pixel xyxy format
labels = x.copy()
#图片进行了缩放,所以gt的box也要按照ritio缩放
#坐标信息xywh是按照原图像进行归一化的
labels[:, 1] = ratio * w * (x[:, 1] - x[:, 3] / 2) + padw
labels[:, 2] = ratio * h * (x[:, 2] - x[:, 4] / 2) + padh
labels[:, 3] = ratio * w * (x[:, 1] + x[:, 3] / 2) + padw
labels[:, 4] = ratio * h * (x[:, 2] + x[:, 4] / 2) + padh
# Augment image and labels
if self.augment:
img, labels = random_affine(img, labels, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
nL = len(labels) # number of labels gt的数目
if nL:
# convert xyxy to xywh
# 仿射变换改了坐标为xyxy,这里又改回xywh
#ground truth是对缩填充放后的图片为准进行归一化
labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])
# Normalize coordinates 0 - 1
labels[:, [2, 4]] /= img.shape[0] # height
labels[:, [1, 3]] /= img.shape[1] # width
if self.augment:
# random left-right flip
#随机左右翻转
lr_flip = True
if lr_flip and random.random() > 0.5:
img = np.fliplr(img)
if nL:
labels[:, 1] = 1 - labels[:, 1]
# random up-down flip
ud_flip = False
if ud_flip and random.random() > 0.5:
img = np.flipud(img)
if nL:
labels[:, 2] = 1 - labels[:, 2]
labels_out = torch.zeros((nL, 6))
if nL:
labels_out[:, 1:] = torch.from_numpy(labels)
# Normalize
img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416
img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
return torch.from_numpy(img), labels_out, img_path, (h, w)
@staticmethod
def collate_fn(batch):
img, label, path, hw = list(zip(*batch)) # transposed
for i, l in enumerate(label):
l[:, 0] = i # add target image index for build_targets()
return torch.stack(img, 0), torch.cat(label, 0), path, hw
def letterbox(img, new_shape=416, color=(127.5, 127.5, 127.5), mode='auto'):
# Resize a rectangular image to a 32 pixel multiple rectangle
# https://github.com/ultralytics/yolov3/issues/232
shape = img.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
ratio = float(new_shape) / max(shape)
else:
ratio = max(new_shape) / max(shape) # ratio = new / old
new_unpad = (int(round(shape[1] * ratio)), int(round(shape[0] * ratio)))
# Compute padding https://github.com/ultralytics/yolov3/issues/232
if mode is 'auto': # minimum rectangle
dw = np.mod(new_shape - new_unpad[0], 32) / 2 # width padding
dh = np.mod(new_shape - new_unpad[1], 32) / 2 # height padding
elif mode is 'square': # square
dw = (new_shape - new_unpad[0]) / 2 # width padding
dh = (new_shape - new_unpad[1]) / 2 # height padding
elif mode is 'rect': # square
dw = (new_shape[1] - new_unpad[0]) / 2 # width padding
dh = (new_shape[0] - new_unpad[1]) / 2 # height padding
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) # resized, no border
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square
return img, ratio, dw, dh
def random_affine(img, targets=(), degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2),
borderValue=(127.5, 127.5, 127.5)):
# torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
# https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4
if targets is None:
targets = []
border = 0 # width of added border (optional)
height = img.shape[0] + border * 2
width = img.shape[1] + border * 2
# Rotation and Scale
R = np.eye(3)
a = random.random() * (degrees[1] - degrees[0]) + degrees[0]
# a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations
s = random.random() * (scale[1] - scale[0]) + scale[0]
R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s)
# Translation
T = np.eye(3)
T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels)
T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels)
# Shear
S = np.eye(3)
S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg)
S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg)
M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR,
borderValue=borderValue) # BGR order borderValue
# Return warped points also
if len(targets) > 0:
n = targets.shape[0]
points = targets[:, 1:5].copy()
area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1])
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1
xy = (xy @ M.T)[:, :2].reshape(n, 8)
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# # apply angle-based reduction of bounding boxes
# radians = a * math.pi / 180
# reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
# x = (xy[:, 2] + xy[:, 0]) / 2
# y = (xy[:, 3] + xy[:, 1]) / 2
# w = (xy[:, 2] - xy[:, 0]) * reduction
# h = (xy[:, 3] - xy[:, 1]) * reduction
# xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
# reject warped points outside of image
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
w = xy[:, 2] - xy[:, 0]
h = xy[:, 3] - xy[:, 1]
area = w * h
ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
targets = targets[i]
targets[:, 1:5] = xy[i]
return imw, targets
def convert_images2bmp():
# cv2.imread() jpg at 230 img/s, *.bmp at 400 img/s
for path in ['../coco/images/val2014/', '../coco/images/train2014/']:
folder = os.sep + Path(path).name
output = path.replace(folder, folder + 'bmp')
if os.path.exists(output):
shutil.rmtree(output) # delete output folder
os.makedirs(output) # make new output folder
for f in tqdm(glob.glob('%s*.jpg' % path)):
save_name = f.replace('.jpg', '.bmp').replace(folder, folder + 'bmp')
cv2.imwrite(save_name, cv2.imread(f))
for label_path in ['../coco/trainvalno5k.txt', '../coco/5k.txt']:
with open(label_path, 'r') as file:
lines = file.read()
lines = lines.replace('2014/', '2014bmp/').replace('.jpg', '.bmp').replace(
'/Users/glennjocher/PycharmProjects/', '../')
with open(label_path.replace('5k', '5k_bmp'), 'w') as file:
file.write(lines)