原因
YOLO 原本的 DataLoader 可读性太差,可扩展性也很差。另一个原因是为了熟悉源代码。
class DataFolder(VisionDataset):
    """Detection dataset pairing every image found under ``root`` with a label file.

    Images are discovered recursively below ``root``; the label path of each
    image is derived by ``img2label_paths``. ``__getitem__`` letterboxes the
    image, converts the labels from normalized ``xywh`` to pixel ``xyxy`` and
    back to normalized ``xywh``, and applies the augmentations when
    ``augment`` is enabled.

    Args:
        root: Root directory that is searched recursively for image files.
        img_size: Size passed to ``letterbox`` and ``Albumentations``.
        batch_size: Stored on the instance; not used by this class itself.
        augment: Enable the augmentation pipeline (requires ``hyp``).
        hyp: Hyper-parameter mapping. The keys read are ``degrees``,
            ``translate``, ``scale``, ``shear``, ``perspective``, ``hsv_h``,
            ``hsv_s``, ``hsv_v``, ``flipud`` and ``fliplr``.
        transform: Forwarded to ``VisionDataset``; not applied in ``__getitem__``.
        target_transform: Forwarded to ``VisionDataset``; not applied in
            ``__getitem__``.
    """

    def __init__(
            self,
            root: str,
            img_size=640,
            batch_size=32,
            augment=True,
            hyp=None,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None
    ) -> None:
        super().__init__(root, transform=transform, target_transform=target_transform)
        # 'filePath' and 'labelPath' are parallel lists filled by findLabelAndImg.
        self.sample = {}
        self.findLabelAndImg(root, self.sample)
        # Build the index over all discovered images.
        self.indices = range(len(self.sample['filePath']))
        self.img_size = img_size
        self.batch_size = batch_size
        # Hyper-parameters
        self.hyp = hyp
        self.augment = augment
        # Augmentation switches (stored only; __getitem__ below does not read them).
        self.mosaic = True
        self.mixup = True
        self.albumentations = Albumentations(size=img_size) if augment else None

    def findLabelAndImg(self, root, sample):
        """Fill ``sample`` with aligned image paths and label paths.

        Args:
            root: Directory searched recursively for files whose extension is
                listed in ``IMG_FORMATS``.
            sample: Dict updated in place with the keys ``'filePath'`` and
                ``'labelPath'``.

        Raises:
            Exception: If scanning fails or no image is found under ``root``.
        """
        try:
            found = []  # every file below root, images or not
            p = Path(root)
            if p.is_dir():  # dir
                found += glob.glob(str(p / '**' / '*.*'), recursive=True)
            im_files = sorted(x.replace('/', os.sep) for x in found if x.split('.')[-1].lower() in IMG_FORMATS)
            # Explicit raise instead of `assert`, so the check survives `python -O`.
            if not im_files:
                raise FileNotFoundError('No images found')
        except Exception as e:
            raise Exception(f'Error loading data from {root}: {e}') from e
        # Store the filtered, sorted image list (not the raw glob result) so
        # that filePath[i] and labelPath[i] always refer to the same image.
        sample['filePath'] = im_files
        sample['labelPath'] = img2label_paths(im_files)

    def _load_image(self, id: int):
        """Read image ``id`` and return ``(image, (height, width))``.

        Raises:
            FileNotFoundError: If the file cannot be read as an image.
        """
        f = self.sample['filePath'][id]
        im = cv2.imread(f)  # BGR
        if im is None:  # cv2.imread signals failure by returning None
            raise FileNotFoundError(f'Image not found or unreadable: {f}')
        return im, im.shape[:2]

    def _load_target(self, id: int):
        """Read the label file of sample ``id``.

        Returns:
            A float32 array with one row of 5 values per label; shape
            ``(0, 5)`` when the label file is missing or empty.
        """
        lb_file = self.sample['labelPath'][id]
        if not os.path.isfile(lb_file):
            print(f'未发现label{lb_file}')
            return np.zeros((0, 5), dtype=np.float32)

        with open(lb_file) as f:
            rows = [x.split() for x in f.read().strip().splitlines() if len(x)]
        lb = np.array(rows, dtype=np.float32)
        if not len(lb):  # label file exists but is empty
            return np.zeros((0, 5), dtype=np.float32)

        # NOTE: these asserts are kept so callers still see AssertionError for
        # malformed labels; be aware that they are skipped under `python -O`.
        assert lb.shape[1] == 5, f'labels require 5 columns, {lb.shape[1]} columns detected'
        assert (lb >= 0).all(), f'negative label values {lb[lb < 0]}'
        assert (lb[:, 1:] <= 1).all(), f'non-normalized or out of bounds coordinates {lb[:, 1:][lb[:, 1:] > 1]}'
        return lb

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """Return ``(image, labels)`` for position ``index``.

        Raises:
            ValueError: If augmentation is enabled but no ``hyp`` was given.
        """
        id = self.indices[index]
        hyp = self.hyp
        if self.augment and hyp is None:
            # Fail with a clear message instead of "'NoneType' object is not subscriptable".
            raise ValueError('augment=True requires the `hyp` hyper-parameter mapping')

        img, (h, w) = self._load_image(id)
        img, ratio, pad = letterbox(img, self.img_size, auto=False, scaleup=self.augment)
        # Use the resolved sample id (not the raw position) so that image and
        # labels stay paired even if `indices` is ever reordered.
        labels = self._load_target(id)
        if labels.size:  # normalized xywh to pixel xyxy format
            labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])

        if self.augment:
            img, labels = random_perspective(img,
                                             labels,
                                             degrees=hyp['degrees'],
                                             translate=hyp['translate'],
                                             scale=hyp['scale'],
                                             shear=hyp['shear'],
                                             perspective=hyp['perspective'])

        nl = len(labels)  # number of labels
        # Convert pixel xyxy back to normalized xywh.
        if nl:
            labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3)

        if self.augment:
            # Albumentations
            img, labels = self.albumentations(img, labels)
            nl = len(labels)  # update after albumentations

            # HSV color-space
            augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])

            # Flip up-down
            if random.random() < hyp['flipud']:
                img = np.flipud(img)
                if nl:
                    labels[:, 2] = 1 - labels[:, 2]

            # Flip left-right
            if random.random() < hyp['fliplr']:
                img = np.fliplr(img)
                if nl:
                    labels[:, 1] = 1 - labels[:, 1]

        # np.flipud / np.fliplr return negative-stride views, which cannot be
        # converted to torch tensors; hand back a contiguous array instead.
        return np.ascontiguousarray(img), labels

    def __len__(self) -> int:
        """Return the number of indexed images."""
        return len(self.indices)
定义一个类:
class DataFolder(VisionDataset):
类的介绍:
"""
Base Class For making datasets which are compatible with torchvision.
It is necessary to override the ``__getitem__`` and ``__len__`` method.
Args:
root (string): Root directory of dataset.
transforms (callable, optional): A function/transforms that takes in
an image and a label and returns the transformed versions of both.
transform (callable, optional): A function/transform that takes in an PIL image
and returns a transformed version. E.g, ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
.. note::
:attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
"""
具体内容可以看我另外一篇关于这个类的源码阅读文章。我觉得这个类相当于定义了一种编码规范。
实现
class DataFolder(VisionDataset):
    """Minimal detection dataset: one image loader plus one label loader.

    Image and label paths come from ``find_img_labels(self.root)``. Decoded
    images can optionally be cached in RAM or on disk as ``.npy`` files.

    Args:
        root: Dataset root directory, forwarded to ``VisionDataset``.
        transform: Forwarded to ``VisionDataset``.
        target_transform: Forwarded to ``VisionDataset``.
        cache: ``True`` or ``'ram'`` keeps decoded images in memory;
            ``'disk'`` stores them as ``.npy`` files next to the images;
            any other value disables caching.
    """

    def __init__(
            self,
            root: str,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            cache=False,
    ) -> None:
        super().__init__(root, transform=transform, target_transform=target_transform)
        self.im_files, self.label_files = find_img_labels(self.root)
        # One mutable record per sample:
        # [image path, label path, .npy cache path, RAM-cached image or None]
        self.samples = [list(x) + [Path(x[0]).with_suffix('.npy'), None] for x in zip(self.im_files, self.label_files)]
        self.index = range(len(self.label_files))
        self.cache_ram = cache is True or cache == 'ram'
        self.cache_disk = cache == 'disk'

    @staticmethod
    def _read_image(path):
        """Decode ``path`` with OpenCV (BGR), raising if it cannot be read."""
        im = cv2.imread(path)
        if im is None:  # cv2.imread signals failure by returning None
            raise FileNotFoundError(f'Image not found or unreadable: {path}')
        return im

    def _load_image(self, id: int):
        """Return the image of sample ``id``, using the configured cache.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        f, _, fn, im = self.samples[id]
        if self.cache_ram:
            if im is None:
                im = self._read_image(f)
                # Write the decoded image back into the record; without this
                # the RAM cache would never be populated.
                self.samples[id][3] = im
            # NOTE: the cached array itself is returned, so transforms that
            # modify the image in place would also modify the cache.
            return im
        if self.cache_disk:
            if not fn.exists():  # first access: create the .npy file
                np.save(fn.as_posix(), self._read_image(f))
            return np.load(fn)
        return self._read_image(f)

    def _load_target(self, id: int):
        """Return the labels of sample ``id`` as a float32 array.

        An empty label file yields an array of shape ``(0, 5)`` so that
        column slicing on the result keeps working.

        Raises:
            FileNotFoundError: If the label file does not exist.
        """
        t = self.samples[id][1]
        if not os.path.isfile(t):
            raise FileNotFoundError(f'{t} does not exist')
        with open(t) as fh:
            rows = [x.split() for x in fh.read().strip().splitlines() if len(x)]
        lb = np.array(rows, dtype=np.float32)
        if lb.size == 0:
            lb = np.zeros((0, 5), dtype=np.float32)
        return lb

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """Return ``(image, target)`` for position ``index``, after ``self.transforms`` if set."""
        id = self.index[index]
        image = self._load_image(id)
        target = self._load_target(id)
        if self.transforms is not None:
            image, target = self.transforms(image, target)
        return image, target
总的思路:读取图片地址和 label 地址,建立索引;然后定义图片和 label 两个加载器就行了。
效果:
mosaic:
self.mosaic = True
在初始化方法 __init__ 中加上面这一句。
if self.mosaic:
image, target = self.load_mosaic(id, self.img_size)
在 __getitem__ 中加上面这两句。
load_mosaic:把 YOLO 源码拷贝过来就行了,只需要修改其中加载图片和标签的部分。
mixup:
同样拷贝下来源码就可以了
总结
其他一些增强方法也都差不多,主要就是索引处理以及标签和图片的加载。性能肯定比 YOLO 原生实现差很多:YOLO 原生实现在初始化时就把标签和图片统一处理并缓存了,所以代码量非常大,可读性很差。另外,torchvision.datasets 这个包提供了一套编码规范,逻辑很清晰。对初学者来说,照着这个来,我觉得是比较好的选择。