pytorch dataset自定义_目标检测：SSD模型——pytorch数据载入及增广

最新推荐文章于 2022-05-07 09:58:49 发布

weixin_39683241

最新推荐文章于 2022-05-07 09:58:49 发布

阅读量533

点赞数

文章标签： pytorch dataset自定义 pytorch图片转为rgb ssd目标检测训练自己的数据

v2-4d9f16c71945beafbdcce181bb8f4cd0_1440w.jpg?source=172ae18b

进行模型训练的第一步是载入数据，使用pytorch框架载入数据需要两个步骤：构建Dataset数据集和创建Dataloader数据迭代器。pytorch要载入数据训练SSD，可以直接调用 torchvision.datasets.VOCDetection 或者 torchvision.datasets.CocoDetection ，需要做的是按照要求放置数据就好。

此处为了便于自己编写后续的图像增广操作，自定义了Dataset类进行数据读取。自定义的类继承自 torch.utils.data.dataset.Dataset 类，必须重写 __init__(self, ...) 和 __getitem__(self, index) 函数，习惯上还会重写 __len__(self) 函数。

下面是自己重写的Dataset类，为了读入方便，采用一个txt文件去存储图片（文件格式jpg）和对应的标注文件（文件格式xml），生成txt文件代码如下：

import

下面是重写的Dataset类，其中由于要解析xml标注文件，所以定义函数 load_xml 进行读取解析；而考虑到在进行测试时也存在数据载入需要，所以有 is_train 标志位去判断载入数据类型，如果是训练数据，则需要图像和标注label同时载入；否则仅仅载入图像数据，用于测试。

import re
import torch
from PIL import Image
import cv2
import numpy as np
from torch.utils.data import dataset
from torchvision import transforms

## 该函数用于解析xml标注文件
def load_xml(filepath):
    """Parse an XML annotation file with line-based regex matching.

    The file is scanned with a five-line sliding window. Every window whose
    first line contains ``<name>...</name>`` contributes one class-name entry.
    Every window whose first line contains ``<bndbox>`` is expected to be
    followed by four lines holding xmin, ymin, xmax and ymax (in that order);
    the first run of digits on each of those lines is used.

    Because of the window, the last four lines of the file are never examined
    as the first line of a window.

    Args:
        filepath: Path of the XML annotation file.

    Returns:
        A tuple ``(bndbox_list, tag_list)``. ``bndbox_list`` holds one
        ``[xmin, ymin, xmax, ymax]`` list of ints per ``<bndbox>`` line;
        ``tag_list`` holds, per ``<name>`` line, the list returned by
        ``findall`` (normally a single-element list with the class name).

    Raises:
        IndexError: If one of the four lines after ``<bndbox>`` has no digits.
    """
    bndbox_pattern = re.compile('<bndbox>')
    name_pattern = re.compile('<name>(.*)</name>')
    # The digit class needs its backslash: r'd+' would only match the letter "d".
    number_pattern = re.compile(r'\d+')

    # Context manager so the file handle is always released.
    with open(filepath) as fi:
        content = fi.readlines()

    tag_list = []
    bndbox_list = []
    for line, line1, line2, line3, line4 in zip(content, content[1:], content[2:], content[3:], content[4:]):
        if name_pattern.search(line) is not None:
            # Keep the raw findall() list so callers can keep using tag[0].
            tag_list.append(name_pattern.findall(line))
        if bndbox_pattern.search(line) is not None:
            # The four lines after <bndbox> carry xmin, ymin, xmax, ymax.
            bndbox_list.append(
                [int(number_pattern.findall(text)[0]) for text in (line1, line2, line3, line4)]
            )

    return bndbox_list, tag_list

## 重写的载入SSD数据的Dataset类
class DetectionDataset(dataset.Dataset):
    """Dataset that loads images (and box targets when training) for SSD.

    The record file lists one sample per line with fields separated by a
    single space: the image path and, for training data, the XML annotation
    path. Reading stops at the first blank line.

    Args:
        record_path: Path of the text record file described above.
        img_shape: Side length (int) of the square image produced for the
            network.
        Classes: List of class names. Annotation tags are converted to their
            index in this list; the original author notes that the background
            class must be the first entry.
        is_mixup: Blend two samples per item (mixup).
        is_mosaic: Stitch four samples per item (mosaic). Takes precedence
            over ``is_mixup`` when both are set.
        is_train: When True, items are ``(image, target)``; otherwise only
            the image is returned.
        backbone_name: Forwarded to ``datadeal.AssignGTtoDefaultBox``.

    NOTE(review): ``imgdeal`` and ``datadeal`` are not imported in the visible
    part of this file; presumably they are the project's augmentation and
    data-handling modules — confirm they are in scope where this class lives.
    """

    def __init__(self, record_path, img_shape, Classes=None, is_mixup=False, is_mosaic=False, is_train=True, backbone_name='resnet50'):
        self.data = []
        self.img_shape = img_shape
        # A fresh list per instance instead of a shared mutable default.
        self.Classes = [] if Classes is None else Classes
        self.is_train = is_train
        self.backbone_name = backbone_name
        # Only one of mixup / mosaic is used; mosaic wins when both are set.
        if is_mosaic:
            self.mix_num = 4
            self.mosaic = imgdeal.MosaicDeal()
        elif is_mixup:
            self.mix_num = 2
        else:
            self.mix_num = 1

        with open(record_path) as fp:
            for line in fp:
                # A blank line terminates the record list.
                if line == '\n':
                    break
                fields = line.strip("\n").split(" ")
                if self.is_train:
                    # fields[0]: image path, fields[1]: annotation path
                    self.data.append([fields[0], fields[1]])
                else:
                    # Test records only need the image path.
                    self.data.append([fields[0]])

        if not self.is_train:
            self.transformations = transforms.Compose([transforms.ToTensor()])

    def __len__(self):
        """Return the number of records read from the record file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the image for testing, or ``(image, target)`` for training."""
        if not self.is_train:
            return self.pull_image(index)
        im, gt = self.pull_item(index)
        return im, gt

    def pull_item(self, index):
        """Load, augment and (optionally) mix the training sample at ``index``.

        The first image is always the requested sample; for mixup and mosaic
        the remaining images are drawn at random from the dataset. The result
        is passed through ``datadeal.AssignGTtoDefaultBox`` before returning.

        Returns:
            The ``(image, target)`` pair produced by ``AssignGTtoDefaultBox``.
        """
        out_img = np.zeros([self.img_shape, self.img_shape, 3])
        out_target = []

        # Split point of the mosaic canvas, kept away from the borders.
        min_offset = 0.2
        cut_x = np.random.randint(int(self.img_shape * min_offset), int(self.img_shape * (1 - min_offset)))
        cut_y = np.random.randint(int(self.img_shape * min_offset), int(self.img_shape * (1 - min_offset)))

        for i in range(self.mix_num):
            sample_index = index if i == 0 else np.random.randint(0, len(self.data))
            img = Image.open(self.data[sample_index][0]).convert('RGB')
            # Always use the module-level load_xml (the original mixed
            # `load_xml` and `datadeal.load_xml`).
            bndbox, label = load_xml(self.data[sample_index][1])

            # Convert each tag to its index in the class list.
            label_index = [[self.Classes.index(tag[0])] for tag in label]

            img, bndbox, label_index = imgdeal.data_augmentation(img, bndbox, label_index, self.img_shape)
            # One row per box: box coordinates followed by the class index.
            target = np.hstack((bndbox, label_index))

            if self.mix_num == 2:
                if i == 0:
                    old_img = img.copy()
                    old_truth = target.copy()
                else:
                    # mixup: equal-weight blend, targets of both images kept
                    out_img = cv2.addWeighted(img, 0.5, old_img, 0.5, 0)
                    out_target = np.concatenate([old_truth, target], axis=0)

            elif self.mix_num == 4:
                oh, ow, _ = img.shape
                # Plain int() replaces the removed `np.int` alias.
                dh, dw = int(oh * 0.3), int(ow * 0.3)

                pleft = np.random.randint(-dw, dw)
                pright = np.random.randint(-dw, dw)
                ptop = np.random.randint(-dh, dh)
                pbot = np.random.randint(-dh, dh)
                swidth = ow - pleft - pright
                sheight = oh - ptop - pbot

                left_shift = int(min(cut_x, max(0, (-int(pleft) * self.img_shape / swidth))))
                top_shift = int(min(cut_y, max(0, (-int(ptop) * self.img_shape / sheight))))
                right_shift = int(min((self.img_shape - cut_x), max(0, (-int(pright) * self.img_shape / swidth))))
                bot_shift = int(min(self.img_shape - cut_y, max(0, (-int(pbot) * self.img_shape / sheight))))

                out_img, target = self.mosaic(out_img, img.copy(), target.copy(), self.img_shape, self.img_shape,
                                              cut_x, cut_y, i, left_shift, right_shift, top_shift, bot_shift)
                # Accumulate targets, skipping empty sets so concatenate
                # never sees an empty operand.
                if i == 0 or len(out_target) == 0:
                    out_target = target
                elif len(target) != 0:
                    out_target = np.concatenate([out_target, target], axis=0)

            else:
                out_img = img.copy()
                out_target = target.copy()

        pre_deal = datadeal.AssignGTtoDefaultBox(image_size=self.img_shape, backbone_name=self.backbone_name)
        out_img, out_target = pre_deal(out_img, out_target)

        return out_img, out_target

    def pull_image(self, index):
        """Load the test image at ``index``, resized to the network input size.

        Returns:
            The RGB image as an array divided by 255.
        """
        img = Image.open(self.data[index][0]).resize((self.img_shape, self.img_shape)).convert('RGB')
        out_img = np.array(img) / 255
        return out_img

将训练数据载入变成Dataset之后，为了进行训练，需要定义一个DataLoader迭代器，进行数据的批量载入。关于DataLoader的参数可自行查阅，但此处要指出，由于重写了Dataset，载入训练数据同时返回图像和label，所以collate_fn这个参数需要输入取样本的方式函数。

from torch.utils.data import DataLoader

## 载入训练数据时候DataLoader读入数据的方式
def detection_collate(batch):
    """Collate training samples for ``DataLoader`` without stacking them.

    Args:
        batch: Iterable of ``(image, target)`` samples.

    Returns:
        ``(imgs, targets)``: a list with each sample's image as-is and a list
        with each sample's target converted to ``torch.FloatTensor``.
    """
    converted = [(sample[0], torch.FloatTensor(sample[1])) for sample in batch]
    imgs = [image for image, _ in converted]
    targets = [gt for _, gt in converted]
    return imgs, targets

## 载入测试数据时候DataLoader读入数据的方式
def detection_collate_test(batch):
    """Collate test samples for ``DataLoader``: return them as a new list.

    Args:
        batch: Iterable of samples (images only).

    Returns:
        A list containing the samples in their original order.
    """
    return [sample for sample in batch]

# Example setup: build the training dataset and wrap it in a DataLoader.
# data_path is the record file read by DetectionDataset.
data_path = '/home/cat/train.txt'
# Class names passed to DetectionDataset as `Classes`.
Classes_type = ['__backgroud__', 'cat1', 'cat2']
img_size = 300
batch_size = 36
num_workers = 10
backbone_name = 'resnet50'  ## used after augmentation, when labels are converted to default boxes

# NOTE(review): this variable reuses the name `dataset`, which earlier in the
# file is imported from torch.utils.data; after this line the name refers to
# the dataset instance instead.
dataset = DetectionDataset(data_path, img_size, Classes=Classes_type, backbone_name=backbone_name)
# detection_collate is passed as collate_fn because each sample is an
# (image, target) pair.
data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers,
                             shuffle=True, drop_last=True, collate_fn=detection_collate, pin_memory=True)

至此，完成数据的读入。

数据尽管已经载入成DataLoader的形式，但其实在载入Dataset那一步，还有很重要的一步没有说明，那就是“数据增广”。数据增广是关系模型训练很重要的一步，好的数据增广方式可以很大程度上提升模型的训练效果（个人认为是因为做了数据增广，相当于扩充了数据样本的数量）。

数据增广最基本的方式是水平/垂直翻转，旋转，缩放，裁剪，剪切，平移，对比度，此处参照网上的增广方式（抱歉对于出处忘记了，后面如果找到了再补上），使用opencv实现了如下。

其中cutout操作是在图像中裁掉一块填充黑的，由于后面引入了mixup和Mosaic操作，个人感觉三种方法同时使用对于我的训练数据可能产生坏的影响（我的训练数据有两类如果裁剪不当样子是一样的），所以三选一，进行增广。另外就是在做移动或者裁剪这类操作时，应考虑重新修正bounding box。

import numpy as np
import random
from numpy import random as rd
import warnings
import math
import cv2
import torch

## 转换颜色空间
class ConvertColor(object):
    """Convert an image between the BGR and HSV colour spaces with OpenCV.

    Args:
        current: Colour space of the incoming image (``'BGR'`` or ``'HSV'``).
        transform: Colour space to convert to (``'HSV'`` or ``'BGR'``).
    """

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(converted_image, boxes, labels)``.

        Raises:
            NotImplementedError: For any pair other than BGR->HSV or HSV->BGR.
        """
        route = (self.current, self.transform)
        if route == ('BGR', 'HSV'):
            code = cv2.COLOR_BGR2HSV
        elif route == ('HSV', 'BGR'):
            code = cv2.COLOR_HSV2BGR
        else:
            raise NotImplementedError
        converted = cv2.cvtColor(image, code)
        return converted, boxes, labels

## 随机改变饱和度
class RandomSaturation(object):
    """Randomly scale channel 1 of an image by a factor in ``[lower, upper]``.

    Args:
        lower: Smallest scale factor; must be non-negative.
        upper: Largest scale factor; must be >= ``lower``.
        p: Threshold compared with ``random.random()``. The scaling is applied
            when the draw is greater than ``p``.

    NOTE(review): channel 1 is presumably the saturation channel of an HSV
    image — confirm against the caller.
    """

    def __init__(self, lower=0.5, upper=1.5, p=0.5):
        self.lower = lower
        self.upper = upper
        # Messages now name this transform (they were copied from RandomContrast).
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."
        self.p = p

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(image, boxes, labels)`` with channel 1 possibly rescaled.

        Float images are modified in place. Integer images cannot take the
        in-place float multiplication, so they are first converted to a
        float32 copy and that copy is returned.
        """
        if random.random() > self.p:
            if not np.issubdtype(image.dtype, np.floating):
                image = image.astype(np.float32)
            image[:, :, 1] *= random.uniform(self.lower, self.upper)

        return image, boxes, labels

## 随机改变对比度
class RandomContrast(object):
    """Randomly scale all pixel values by a factor in ``[lower, upper]``.

    Args:
        lower: Smallest scale factor; must be non-negative.
        upper: Largest scale factor; must be >= ``lower``.
        p: Threshold compared with ``random.random()``. The scaling is applied
            when the draw is greater than ``p``.
    """

    def __init__(self, lower=0.5, upper=1.5, p=0.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."
        self.p = p

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(image, boxes, labels)``.

        When the transform is applied the result is a new ``uint8`` array;
        otherwise the input image is returned unchanged.
        """
        if random.random() > self.p:
            alpha = random.uniform(self.lower, self.upper)
            scaled = image.astype(np.float32) * alpha
            # Clip before narrowing: casting values above 255 straight to
            # uint8 would wrap around and turn bright pixels dark.
            image = np.clip(scaled, 0, 255).astype(np.uint8)
        return image, boxes, labels

## 随机改变亮度
class RandomBrightness(object):
    """Randomly add an integer offset in ``[-delta, delta]`` to every pixel.

    Args:
        delta: Largest absolute offset, between 0 and 255. It is passed to
            ``random.randint``, so it should be an int.
        p: Threshold compared with ``random.random()``. The offset is applied
            when the draw is greater than ``p``.
    """

    def __init__(self, delta=32, p=0.5):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta
        self.p = p

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(image, boxes, labels)`` with values clipped to [0, 255].

        Integer images are widened before the offset is added, so the sum
        cannot wrap around (or fail on a negative offset) before clipping;
        the result for integer input is ``uint8``. Other dtypes are offset
        and clipped directly.
        """
        if random.random() > self.p:
            delta = random.randint(-self.delta, self.delta)
            if np.issubdtype(image.dtype, np.integer):
                shifted = image.astype(np.int32) + delta
                image = np.clip(shifted, 0, 255).astype(np.uint8)
            else:
                image = np.clip(image + delta, 0, 255)
        return image, boxes, labels

## 随机改变色调
class RandomHue(object):
    """Randomly shift channel 0 of an image and wrap it into ``[0, 360]``.

    Args:
        delta: Largest absolute shift; must lie in ``[0, 360]``.
        p: Threshold compared with ``random.random()``. The shift is applied
            when the draw is greater than ``p``.
    """

    def __init__(self, delta=18.0, p=0.5):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta
        self.p = p

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(image, boxes, labels)``; the image is modified in place."""
        if not (random.random() > self.p):
            return image, boxes, labels

        # `hue` is a view, so every update below writes into `image`.
        hue = image[:, :, 0]
        hue += random.uniform(-self.delta, self.delta)
        # Wrap values that left the range back by one full turn.
        hue[hue > 360.0] -= 360.0
        hue[hue < 0.0] += 360.0
        return image, boxes, labels

## 归一化输入图像
class NormalizeImg(object):
    """Normalize pixel values and rescale box coordinates by the image size.

    Args:
        mean: Value subtracted from every pixel.
        std: Value every pixel is divided by after the subtraction.
    """

    def __init__(self, mean=0, std=255):
        self.mean = mean
        self.std = std

    def __call__(self, img, bndbox):
        """Return ``(normalized_image, boxes)``.

        The boxes are divided by the image's first dimension, rounded to
        three decimals and returned as nested lists.

        NOTE(review): every coordinate, x and y alike, is divided by
        ``img.shape[0]``; that only gives relative coordinates on both axes
        when the image is square.
        """
        normalized = (img - self.mean) / self.std
        side = normalized.shape[0]
        relative = np.round(np.array(bndbox) / side, 3)
        return normalized, relative.tolist()

## 将数据转换成tensor格式
class ToTensor(object):
    """Turn an H x W x C array into a float32 tensor laid out as C x H x W."""

    def __call__(self, cvimage, boxes=None, labels=None):
        """Return ``(tensor, boxes, labels)``; boxes and labels pass through."""
        as_float = cvimage.astype(np.float32)
        # Move the channel axis (last) to the front.
        channels_first = torch.from_numpy(as_float).permute(2, 0, 1)
        return channels_first, boxes, labels

## 更改图像尺寸及bounding box尺寸
class AllResize(object):
    """Resize an image to a square and rescale its bounding boxes to match.

    Args:
        size: Side length of the square output image.
    """

    def __init__(self, size=300):
        self.size = size

    def __call__(self, image, boxes, labels=None):
        """Return ``(resized_image, boxes, labels)``.

        Each box is updated in place: entries 0 and 2 are scaled by the
        width ratio, entries 1 and 3 by the height ratio, and all four are
        truncated to int.
        """
        src_h, src_w, _ = image.shape
        scale_y = self.size / src_h
        scale_x = self.size / src_w
        resized = cv2.resize(image, (self.size, self.size))
        # Per-column scale in box order: x, y, x, y.
        for box in boxes:
            for col, factor in enumerate((scale_x, scale_y, scale_x, scale_y)):
                box[col] = int(box[col] * factor)

        return resized, boxes, labels

## 图像随机裁剪（遮挡）一块区域
class Cutout(object):
    """Overwrite a random rectangular patch of an image with a fill value.

    Args:
        scale: ``(min, max)`` fraction of the image area the patch may cover;
            must lie within ``[0, 1]``.
        ratio: ``(min, max)`` range of the patch's height/width ratio.
        value: ``(low, high)`` range the fill value is drawn from.
        pixel_level: When True every pixel of the patch gets its own random
            value; otherwise one value fills the whole patch.
        inplace: When True the input image is modified; otherwise a copy is
            returned.

    Raises:
        ValueError: If ``scale`` leaves the ``[0, 1]`` range.
    """

    def __init__(self, scale=(0.02, 0.4), ratio=(0.4, 1 / 0.4),
                 value=(0, 255), pixel_level=False, inplace=False):
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            warnings.warn("range should be of kind (min, max)")
        if scale[0] < 0 or scale[1] > 1:
            raise ValueError("range of scale should be between 0 and 1")
        self.scale = scale
        self.ratio = ratio
        self.value = value
        self.pixel_level = pixel_level
        self.inplace = inplace

    def get_params(self, img, scale, ratio):
        """Draw the patch geometry for ``img``.

        Returns:
            ``(left, top, h, w, channels)`` of a patch that always fits
            inside the image.
        """
        img = np.array(img)
        img_h, img_w, img_c = img.shape

        s = random.uniform(*scale)
        r = random.uniform(*ratio)
        s = s * img_h * img_w
        # Clamp to the image size: a large area combined with an extreme
        # aspect ratio can yield a side longer than the image, which used to
        # make random.randint() fail on an empty range.
        w = min(int(math.sqrt(s / r)), img_w)
        h = min(int(math.sqrt(s * r)), img_h)
        left = random.randint(0, img_w - w)
        top = random.randint(0, img_h - h)

        return left, top, h, w, img_c

    def cutout(self, img, i, j, h, w, v, inplace=False):
        """Fill the ``h`` x ``w`` patch whose top-left corner is ``(i, j)`` with ``v``."""
        if not inplace:
            img = img.copy()

        img[i:i + h, j:j + w, :] = v
        return img

    def __call__(self, img):
        """Return ``img`` with one random patch replaced by the fill value."""
        left, top, h, w, ch = self.get_params(img, self.scale, self.ratio)
        if self.pixel_level:
            # np.random.randint excludes the upper bound of `value`.
            c = np.random.randint(*self.value, size=(h, w, ch))
        else:
            # random.randint includes the upper bound of `value`.
            c = random.randint(*self.value)
        return self.cutout(img, top, left, h, w, c, self.inplace)

## 调用数据增广接口
def data_augmentation(img, bndbox, label, img_shape, h_flip_p=0.5, v_flip_p=0.5, crop_p=0.6):
    """Resize, colour-jitter, randomly flip and normalize one training sample.

    Steps, in order: convert to a numpy array, resize to
    ``img_shape`` x ``img_shape`` (rescaling the boxes), random brightness,
    random contrast, random horizontal flip, random vertical flip, then
    normalization of the image and the boxes.

    Augmentation is best effort: if any step raises, a warning naming the
    cause is issued and whatever state had been reached is returned.

    Args:
        img: Input image; anything ``np.array`` accepts.
        bndbox: List of ``[xmin, ymin, xmax, ymax]`` boxes; updated in place
            by the resize and flip steps.
        label: Labels for the boxes; returned unchanged.
        img_shape: Side length of the square output image.
        h_flip_p: Probability of a horizontal flip.
        v_flip_p: Probability of a vertical flip.
        crop_p: Unused; the random-crop step is disabled. Kept so existing
            callers keep working.

    Returns:
        ``(img, bndbox, label)``.
    """
    try:
        img = np.array(img)

        # Only the transforms that are actually applied are created. The
        # original also instantiated several unused ones, including
        # RandomSampleCrop, which is not defined in the visible file and so
        # aborted every call before any augmentation ran.
        resize_image = AllResize(size=img_shape)
        change_contrast = RandomContrast()
        change_brightness = RandomBrightness()
        normalize_image = NormalizeImg()

        img, bndbox, _ = resize_image(img, bndbox)
        img, _, _ = change_brightness(img)
        img, _, _ = change_contrast(img)

        oh, ow, _ = img.shape
        if random.random() > (1 - h_flip_p):
            img = cv2.flip(img, 1)
            for box in bndbox:
                # Mirror the x range: the old right edge becomes the new left edge.
                box[0], box[2] = ow - box[2], ow - box[0]

        if random.random() > (1 - v_flip_p):
            img = cv2.flip(img, 0)
            for box in bndbox:
                # Mirror the y range: the old bottom edge becomes the new top edge.
                box[1], box[3] = oh - box[3], oh - box[1]

        img, bndbox = normalize_image(img, bndbox)

    except Exception as exc:
        # Deliberately best effort, but report the cause instead of hiding it
        # (a bare `except:` would also swallow KeyboardInterrupt/SystemExit).
        warnings.warn("OpenCV can't augment image! (%s: %s)" % (type(exc).__name__, exc))

    return img, bndbox, label

前面有提到mixup和Mosaic这两个操作，但并没有放入图像增广接口函数里面，究其原因，是因为这两个操作的对象并不是一张图像，而是除了当前图像还会随机读入多张图像进行合并处理，所以这两个处理我放到了前面Dataset数据载入的__getitem__函数所调用的pull_item函数里面。

mixup操作就是将一张图像通过加权的方式叠加到另一张图像上面，权重一般设置0.5，代码和实现效果如下：

# Excerpt of DetectionDataset.pull_item showing only the mixup path. It is not
# runnable on its own: `self` and `index` come from the enclosing method, and
# `target` is built in the full method by steps that are omitted here.
for i in range(self.mix_num):
    if i == 0:
        # The first image is the requested sample.
        img = Image.open(self.data[index][0]).convert('RGB')
        bndbox, label = load_xml(self.data[index][1])
    else:
        # The second image is drawn at random from the dataset.
        random_index = np.random.randint(0, len(self.data))
        img = Image.open(self.data[random_index][0]).convert('RGB')
        bndbox, label = datadeal.load_xml(self.data[random_index][1])

    if self.mix_num == 2:
        if i == 0:
            # Remember the first image and its targets for blending.
            old_img = img.copy()
            old_truth = target.copy()
        else:
            # Blend both images with equal weights and keep both target sets.
            out_img = cv2.addWeighted(img, 0.5, old_img, 0.5, 0)
            out_target = np.concatenate([old_truth, target], axis=0)

Mosaic操作是将四张图片随机裁剪拼接合并成一张图片，并且保证合成图片大小跟原来图像一致。其作用效果个人觉得应该是相当于一次学习了四个目标，在相同的训练次数的情况下，学习内容增加了，并且随机裁剪增加了输入的随机性，类似于cutout的遮挡，所以能提升模型的训练效果。代码和实现效果如下：

## mosaic deal
class MosaicDeal(object):
    """Paste one image into a quadrant of a mosaic canvas and fix its boxes.

    The canvas is split at ``(cut_x, cut_y)``; quadrant 0 is top-left,
    1 top-right, 2 bottom-left and 3 bottom-right.
    """

    def __init__(self):
        self.is_use = True

    def filter_truth(self, bboxes, dx, dy, sx, sy, xd, yd):
        """Re-express boxes relative to a crop and drop the collapsed ones.

        Columns 0-3 of ``bboxes`` (x, y, x, y) are shifted by ``(-dx, -dy)``
        and clipped to ``[0, sx]`` / ``[0, sy]`` in place. Boxes whose two x
        values both sit on the same x border, or whose two y values both sit
        on the same y border, are removed. The survivors are returned as a
        new array shifted by ``(xd, yd)``.
        """
        xs = bboxes[:, 0:4:2]  # view of columns 0 and 2
        ys = bboxes[:, 1:4:2]  # view of columns 1 and 3
        xs -= dx
        ys -= dy
        xs[...] = np.clip(xs, 0, sx)
        ys[...] = np.clip(ys, 0, sy)

        x_lo, x_hi = bboxes[:, 0], bboxes[:, 2]
        y_lo, y_hi = bboxes[:, 1], bboxes[:, 3]
        collapsed = (((y_lo == sy) & (y_hi == sy)) |
                     ((x_lo == sx) & (x_hi == sx)) |
                     ((y_lo == 0) & (y_hi == 0)) |
                     ((x_lo == 0) & (x_hi == 0)))
        # Boolean indexing yields a copy, so the offsets below do not touch
        # the caller's array.
        kept = bboxes[~collapsed]

        kept[:, 0:4:2] += xd
        kept[:, 1:4:2] += yd

        return kept

    def __call__(self, out_img, img, bboxes, w, h, cut_x, cut_y, i_mixup,
                 left_shift, right_shift, top_shift, bot_shift):
        """Copy a crop of ``img`` into quadrant ``i_mixup`` of ``out_img``.

        Returns:
            ``(out_img, bboxes)``: the canvas (modified in place) and the
            boxes adjusted by ``filter_truth``. Empty ``bboxes`` are passed
            through untouched; a quadrant outside 0-3 changes nothing.
        """
        left_shift = min(left_shift, w - cut_x)
        top_shift = min(top_shift, h - cut_y)
        right_shift = min(right_shift, cut_x)
        bot_shift = min(bot_shift, cut_y)

        if i_mixup not in (0, 1, 2, 3):
            return out_img, bboxes

        # Horizontal half: quadrants 1 and 3 fill the right part of the canvas.
        if i_mixup in (1, 3):
            dx, sx, xd = cut_x - right_shift, w - cut_x, cut_x
            dst_cols = slice(cut_x, None)
            src_cols = slice(cut_x - right_shift, w - right_shift)
        else:
            dx, sx, xd = left_shift, cut_x, 0
            dst_cols = slice(None, cut_x)
            src_cols = slice(left_shift, left_shift + cut_x)

        # Vertical half: quadrants 2 and 3 fill the bottom part of the canvas.
        if i_mixup in (2, 3):
            dy, sy, yd = cut_y - bot_shift, h - cut_y, cut_y
            dst_rows = slice(cut_y, None)
            src_rows = slice(cut_y - bot_shift, h - bot_shift)
        else:
            dy, sy, yd = top_shift, cut_y, 0
            dst_rows = slice(None, cut_y)
            src_rows = slice(top_shift, top_shift + cut_y)

        if len(bboxes) != 0:
            bboxes = self.filter_truth(bboxes, dx, dy, sx, sy, xd, yd)
        out_img[dst_rows, dst_cols] = img[src_rows, src_cols]

        return out_img, bboxes


# Excerpt showing how DetectionDataset uses MosaicDeal. It is not runnable on
# its own: `self`, `index`, `target`, `out_img`, `cut_x` and `cut_y` come from
# the enclosing class and method, and the `....` line is a placeholder.
# NOTE(review): in this excerpt the `mix_num` branches are indented under the
# `else:` of the image-loading step — presumably a formatting slip of the
# excerpt; confirm against the full method.
self.mosaic = imgdeal.MosaicDeal()
for i in range(self.mix_num):
    if i == 0:
        img = Image.open(self.data[index][0]).convert('RGB')
        bndbox, label = datadeal.load_xml(self.data[index][1])
    else:
        random_index = np.random.randint(0, len(self.data))
        img = Image.open(self.data[random_index][0]).convert('RGB')
        bndbox, label = datadeal.load_xml(self.data[random_index][1])

        if self.mix_num == 2:
                ## mixup branch, omitted here
                ....

        elif self.mix_num == 4:
            oh, ow, oc = img.shape
            dh, dw, dc = np.array(np.array([oh, ow, oc]) * 0.3, dtype=np.int)

            # Random margins of up to 30% of the image size on each side.
            pleft = np.random.randint(-dw, dw)
            pright = np.random.randint(-dw, dw)
            ptop = np.random.randint(-dh, dh)
            pbot = np.random.randint(-dh, dh)
            swidth = ow - pleft - pright
            sheight = oh - ptop - pbot

            # Shifts are limited to [0, cut_x] / [0, cut_y] and to the
            # remaining canvas size on the opposite side.
            left_shift = int(min(cut_x, max(0, (-int(pleft) * self.img_shape / swidth))))
            top_shift = int(min(cut_y, max(0, (-int(ptop) * self.img_shape / sheight))))
            right_shift = int(min((self.img_shape - cut_x), max(0, (-int(pright) * self.img_shape / swidth))))
            bot_shift = int(min(self.img_shape - cut_y, max(0, (-int(pbot) * self.img_shape / sheight))))

            # Paste this image into quadrant `i` of the canvas and adjust its boxes.
            out_img, target = self.mosaic(out_img, img.copy(), target.copy(), self.img_shape, self.img_shape,
                                                  cut_x, cut_y, i, left_shift, right_shift, top_shift, bot_shift)

weixin_39683241

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
pytorch dataset自定义_目标检测：SSD模型——pytorch数据载入及增广

进行模型训练的第一步是载入数据，使用pytorch框架载入数据需要两个步骤：构建Dataset数据集和创建Dataloader数据迭代器。pytorch要载入数据训练SSD，可以直接调用 torchvision.datasets.VOCDetection 或者 torchvision.datasets.CocoDetection ，需要做的是按照要求放置数据就好。此处为了自己写后续的图像增广的操作...
复制链接

扫一扫