COCO数据集的下载、介绍及如何使用（数据载入及数据增广，含代码）

本文链接：https://blog.csdn.net/qq_41847324/article/details/86224628

如何使用COCO数据集
COCO数据集可以说是语义分割等计算机视觉任务中应用较为广泛的一个数据集，具体可以应用到物体识别、语义分割及目标检测等方面。我是在做语义分割方面任务时用到了COCO数据集，但本文主要讲解的是数据载入方面，因此可以通用。

一、下载COCO数据集

首先，我们要下载COCO数据集，本文主要使用的是COCO2014和COCO2017。由于是国外数据集，直接从官网下载可能需要翻墙。
MSCOCO数据集的官网为：http://mscoco.org/
具体来说，如果想只下载COCO2017/COCO2014的话，可以不需要翻墙下载，复制以下链接打开迅雷等下载软件下载即可，网速还可以。
COCO2017 训练数据：http://images.cocodataset.org/zips/train2017.zip
http://images.cocodataset.org/annotations/annotations_trainval2017.zip
COCO2017验证数据：http://images.cocodataset.org/zips/val2017.zip
http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
COCO2017测试数据集：http://images.cocodataset.org/zips/test2017.zip
http://images.cocodataset.org/annotations/image_info_test2017.zip

COCO2014的相关数据只需要将以上链接中的“2017”改成“2014”即可。

二、COCO数据集介绍

网上关于COCO数据集的介绍多如牛毛，本文就不过多地加以介绍了，只简要地介绍一下。
以COCO2014为例：
下载完COCO2014后进行解压后，目录如下：

images
- train2014
- val2014
- test2014
annotations
其中，images中的文件夹各自放置了训练、验证和测试的数据集图片。annotations文件夹中放置了标签文件，可以理解为Label，简要的来说，就是包含了某一类在图片中的具体位置的信息，详细可见以下链接：https://blog.csdn.net/happyhorizion/article/details/77894205#semantic-scene-labeling图像分割

三、COCO数据集使用（数据载入）

所需环境为：

numpy
torch
tqdm(可视化数据载入)
os
pycocotools(coco数据集的应用API)
torchvision
PIL

如何安装pycocotools

相信能用到COCO数据集做语义分割等任务的大佬们应该都能安装以上绝大多数库，这里主要讲一下如何安装pycocotools库。作者在安装这个库的时候遇到了一些问题，不过及时的解决了。
步骤如下：

首先下载cocoapi，在终端输入

git clone git@github.com:lucky-ing/cocoapi.git

此时可以看到一个叫cocoapi的文件夹（git clone默认以仓库名作为目录名），进入cocoapi/PythonAPI中，懒人操作如下：

cd cocoapi/PythonAPI

开始安装，在终端输入以下命令
如果使用的是python2:

python setup.py build_ext install

如果使用的是python3

python3 setup.py build_ext install

如果一切顺利，安装完成，即可进入下一章节具体使用，作者在安装时遇到了以下问题。

error: command 'C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64\cl.exe' failed with exit status 2

解决方法很简单，在终端安装cython即可，在终端输入：

conda install cython

若是没有使用conda，在终端输入

pip install cython

COCO数据集的载入

dataloader

import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import trange
import os
from pycocotools.coco import COCO
from pycocotools import mask
from torchvision import transforms
import custom_transforms as tr
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


class COCOSegmentation(Dataset):
    """COCO instance annotations exposed as a semantic-segmentation dataset.

    Every item is a ``{'image': ..., 'label': ...}`` sample that has been run
    through the training or validation transform pipeline. The label is a
    single-channel mask whose pixel values are indices into ``CAT_LIST``.

    Files read relative to ``base_dir``::

        annotations/instances_<split><year>.json
        annotations/<split>_ids_<year>.pth   (id cache, written on first use)
        images/<split><year>/
    """

    NUM_CLASSES = 21
    # COCO category ids kept by this dataset. The position of an id in this
    # list is the value written into the mask; annotations whose category is
    # not listed are ignored.
    CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
                1, 64, 20, 63, 7, 72]

    def __init__(self,
                 args,
                 base_dir='./Path/COCO/',
                 split='train',
                 year='2014'):
        """Open the annotation file and load (or build) the usable image ids.

        Args:
            args: object exposing ``base_size`` and ``crop_size``; they are
                read later by the transform pipelines.
            base_dir: dataset root containing ``annotations/`` and ``images/``.
            split: ``'train'`` or ``'val'``; selects the files and the
                transform pipeline used by ``__getitem__``.
            year: dataset year used in the file names, e.g. ``'2014'``.
        """
        super().__init__()
        ann_file = os.path.join(base_dir, 'annotations/instances_{}{}.json'.format(split, year))
        ids_file = os.path.join(base_dir, 'annotations/{}_ids_{}.pth'.format(split, year))
        self.img_dir = os.path.join(base_dir, 'images/{}{}'.format(split, year))
        self.split = split
        self.coco = COCO(ann_file)
        self.coco_mask = mask
        if os.path.exists(ids_file):
            # Reuse the filtered id list produced by an earlier run.
            self.ids = torch.load(ids_file)
        else:
            ids = list(self.coco.imgs.keys())
            self.ids = self._preprocess(ids, ids_file)
        self.args = args

    def __getitem__(self, index):
        """Return the transformed ``{'image', 'label'}`` sample at ``index``.

        Raises:
            ValueError: if ``self.split`` is neither ``'train'`` nor ``'val'``.
                Raised before any image is loaded, instead of silently
                returning ``None``.
        """
        if self.split == "train":
            transform = self.transform_tr
        elif self.split == 'val':
            transform = self.transform_val
        else:
            raise ValueError(
                "unsupported split {!r}: expected 'train' or 'val'".format(self.split))

        _img, _target = self._make_img_gt_point_pair(index)
        return transform({'image': _img, 'label': _target})

    def _make_img_gt_point_pair(self, index):
        """Load the RGB image and build its label mask as a PIL image."""
        coco = self.coco
        img_id = self.ids[index]
        img_metadata = coco.loadImgs(img_id)[0]
        path = img_metadata['file_name']
        _img = Image.open(os.path.join(self.img_dir, path)).convert('RGB')
        cocotarget = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        _target = Image.fromarray(self._gen_seg_mask(
            cocotarget, img_metadata['height'], img_metadata['width']))

        return _img, _target

    def _preprocess(self, ids, ids_file):
        """Keep the image ids with enough labelled pixels and cache them.

        An image qualifies when its generated mask has more than 1000
        non-zero pixels. The resulting list is saved to ``ids_file`` with
        ``torch.save`` and returned.
        """
        print("Preprocessing mask, this will take a while. " +
              "But don't worry, it only run once for each split.")
        tbar = trange(len(ids))
        new_ids = []
        for i in tbar:
            img_id = ids[i]
            cocotarget = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
            img_metadata = self.coco.loadImgs(img_id)[0]
            # Named seg_mask so it does not shadow the imported ``mask`` module.
            seg_mask = self._gen_seg_mask(cocotarget, img_metadata['height'],
                                          img_metadata['width'])
            # more than 1k pixels
            if (seg_mask > 0).sum() > 1000:
                new_ids.append(img_id)
            tbar.set_description('Doing: {}/{}, got {} qualified images'.
                                 format(i, len(ids), len(new_ids)))
        print('Found number of qualified images: ', len(new_ids))
        torch.save(new_ids, ids_file)
        return new_ids

    def _gen_seg_mask(self, target, h, w):
        """Rasterize annotations into an ``(h, w)`` uint8 class-index mask.

        Args:
            target: iterable of annotation dicts providing ``'category_id'``
                and ``'segmentation'``.
            h: mask height in pixels.
            w: mask width in pixels.

        Returns:
            ``numpy.ndarray`` of dtype ``uint8``. Pixels keep the index of the
            first annotation that covers them; later annotations only fill
            pixels that are still 0.
        """
        mask = np.zeros((h, w), dtype=np.uint8)
        coco_mask = self.coco_mask
        for instance in target:
            cat = instance['category_id']
            # Filter on category first so that segmentations of ignored
            # categories are never decoded.
            if cat not in self.CAT_LIST:
                continue
            c = self.CAT_LIST.index(cat)
            rle = coco_mask.frPyObjects(instance['segmentation'], h, w)
            m = coco_mask.decode(rle)
            if m.ndim < 3:
                labelled = m * c
            else:
                # Several decoded layers: a pixel belongs to the instance when
                # any layer covers it.
                labelled = ((np.sum(m, axis=2) > 0) * c).astype(np.uint8)
            mask += (mask == 0) * labelled
        return mask

    def transform_tr(self, sample):
        """Apply the training pipeline: flip, scale-crop, blur, normalize."""
        composed_transforms = transforms.Compose([
            tr.RandomHorizontalFlip(),
            tr.RandomScaleCrop(base_size=self.args.base_size, crop_size=self.args.crop_size),
            tr.RandomGaussianBlur(),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])

        return composed_transforms(sample)

    def transform_val(self, sample):
        """Apply the validation pipeline: fixed scale-crop, normalize."""
        composed_transforms = transforms.Compose([
            tr.FixScaleCrop(crop_size=self.args.crop_size),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])

        return composed_transforms(sample)

    def __len__(self):
        """Return the number of usable image ids."""
        return len(self.ids)



if __name__ == "__main__":
    # Visual smoke test: show the first two validation batches next to their
    # colour-coded segmentation maps.
    from dataloaders import custom_transforms as tr
    from dataloaders.utils import decode_segmap
    from torch.utils.data import DataLoader
    from torchvision import transforms
    import matplotlib.pyplot as plt
    import argparse

    args = argparse.ArgumentParser().parse_args()
    args.base_size = 513
    args.crop_size = 513

    coco_val = COCOSegmentation(args, split='val', year='2017')
    loader = DataLoader(coco_val, batch_size=4, shuffle=True, num_workers=0)

    for batch_idx, batch in enumerate(loader):
        batch_size = batch["image"].size()[0]
        images = batch['image'].numpy()
        labels = batch['label'].numpy()
        for k in range(batch_size):
            segmap = decode_segmap(np.array(labels[k]).astype(np.uint8), dataset='coco')
            # Undo Normalize: CHW -> HWC, then multiply by std, add mean and
            # scale back to 0..255.
            picture = np.transpose(images[k], axes=[1, 2, 0])
            picture *= (0.229, 0.224, 0.225)
            picture += (0.485, 0.456, 0.406)
            picture *= 255.0
            picture = picture.astype(np.uint8)
            plt.figure()
            plt.title('display')
            plt.subplot(211)
            plt.imshow(picture)
            plt.subplot(212)
            plt.imshow(segmap)

        # Two batches are enough for a visual check.
        if batch_idx == 1:
            break

    plt.show(block=True)

上面代码末尾的main部分（if __name__ == "__main__": 之后的代码）仅为测试使用。

custom_transforms.py 是数据增广的代码

import torch
import random
import numpy as np

from PIL import Image, ImageOps, ImageFilter

class Normalize(object):
    """Divide the image by 255, then standardize it with ``mean`` and ``std``.

    Args:
        mean (tuple): means for each channel.
        std (tuple): standard deviations for each channel.

    The call returns float32 numpy arrays for both ``'image'`` and
    ``'label'``; the label values themselves are left unchanged.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        pixels = np.array(sample['image']).astype(np.float32)
        target = np.array(sample['label']).astype(np.float32)

        # Writing into the same buffer keeps the result float32 even though
        # mean/std are Python tuples.
        np.divide(pixels, 255.0, out=pixels)
        np.subtract(pixels, self.mean, out=pixels)
        np.divide(pixels, self.std, out=pixels)

        return {'image': pixels, 'label': target}

class Normalize_test(object):
    """Label-free variant of ``Normalize`` for samples that carry no mask.

    Args:
        mean (tuple): means for each channel.
        std (tuple): standard deviations for each channel.

    The call returns a dict holding only the float32 ``'image'`` array.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        pixels = np.array(sample['image']).astype(np.float32)

        # Writing into the same buffer keeps the result float32 even though
        # mean/std are Python tuples.
        np.divide(pixels, 255.0, out=pixels)
        np.subtract(pixels, self.mean, out=pixels)
        np.divide(pixels, self.std, out=pixels)

        return {'image': pixels}


class ToTensor(object):
    """Turn the image and label of a sample into float32 torch tensors."""

    def __call__(self, sample):
        # The image arrives as H x W x C (numpy layout) and leaves as
        # C x H x W (torch layout); the label keeps its shape.
        hwc = np.array(sample['image']).astype(np.float32)
        label = np.array(sample['label']).astype(np.float32)
        chw = np.transpose(hwc, (2, 0, 1))

        return {'image': torch.from_numpy(chw).float(),
                'label': torch.from_numpy(label).float()}

class ToTensor_test(object):
    """Label-free variant of ``ToTensor``: converts only the image."""

    def __call__(self, sample):
        # The image arrives as H x W x C (numpy layout) and leaves as
        # C x H x W (torch layout).
        hwc = np.array(sample['image']).astype(np.float32)
        chw = np.transpose(hwc, (2, 0, 1))

        return {'image': torch.from_numpy(chw).float()}


class RandomHorizontalFlip(object):
    """Mirror image and label left-to-right together, with probability 0.5."""

    def __call__(self, sample):
        picture, label = sample['image'], sample['label']

        # A single draw decides for both, so image and label stay aligned.
        if random.random() >= 0.5:
            return {'image': picture, 'label': label}

        return {'image': picture.transpose(Image.FLIP_LEFT_RIGHT),
                'label': label.transpose(Image.FLIP_LEFT_RIGHT)}


class RandomRotate(object):
    """Rotate image and label by one angle drawn from [-degree, degree].

    Args:
        degree: largest rotation magnitude, passed on to ``rotate`` in the
            unit that method expects.
    """

    def __init__(self, degree):
        self.degree = degree

    def __call__(self, sample):
        angle = random.uniform(-self.degree, self.degree)

        # Bilinear for the picture; nearest for the label so that no new
        # label values are interpolated in.
        rotated_image = sample['image'].rotate(angle, Image.BILINEAR)
        rotated_label = sample['label'].rotate(angle, Image.NEAREST)

        return {'image': rotated_image, 'label': rotated_label}


class RandomGaussianBlur(object):
    """Blur the image with probability 0.5; the label is never touched.

    The blur radius is a fresh ``random.random()`` draw.
    """

    def __call__(self, sample):
        picture, label = sample['image'], sample['label']

        if random.random() >= 0.5:
            return {'image': picture, 'label': label}

        blur = ImageFilter.GaussianBlur(radius=random.random())
        return {'image': picture.filter(blur), 'label': label}


class RandomScaleCrop(object):
    """Randomly rescale a sample, pad it if needed, then take a random crop.

    Args:
        base_size: reference length; the shorter edge is resized to a random
            integer between ``0.5 * base_size`` and ``2.0 * base_size``.
        crop_size: side length of the square crop that is returned.
        fill: value used to pad the label (the image is padded with 0).
    """

    def __init__(self, base_size, crop_size, fill=0):
        self.base_size, self.crop_size, self.fill = base_size, crop_size, fill

    def __call__(self, sample):
        picture, label = sample['image'], sample['label']
        crop = self.crop_size

        # random scale (short edge); the aspect ratio is preserved
        short_edge = random.randint(int(self.base_size * 0.5),
                                    int(self.base_size * 2.0))
        src_w, src_h = picture.size
        if src_h > src_w:
            new_w = short_edge
            new_h = int(1.0 * src_h * new_w / src_w)
        else:
            new_h = short_edge
            new_w = int(1.0 * src_w * new_h / src_h)
        picture = picture.resize((new_w, new_h), Image.BILINEAR)
        label = label.resize((new_w, new_h), Image.NEAREST)

        # Pad on the right/bottom when the resized sample is smaller than the
        # crop window.
        if short_edge < crop:
            pad_h = max(crop - new_h, 0)
            pad_w = max(crop - new_w, 0)
            border = (0, 0, pad_w, pad_h)
            picture = ImageOps.expand(picture, border=border, fill=0)
            label = ImageOps.expand(label, border=border, fill=self.fill)

        # Random crop of crop x crop, identical window for image and label.
        full_w, full_h = picture.size
        left = random.randint(0, full_w - crop)
        top = random.randint(0, full_h - crop)
        window = (left, top, left + crop, top + crop)

        return {'image': picture.crop(window), 'label': label.crop(window)}


class FixScaleCrop(object):
    """Resize so the shorter edge equals ``crop_size``, then center-crop.

    Args:
        crop_size: side length of the square crop that is returned.
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, sample):
        picture, label = sample['image'], sample['label']
        crop = self.crop_size

        # The shorter edge becomes crop; the other one follows the aspect ratio.
        src_w, src_h = picture.size
        if src_w > src_h:
            new_h = crop
            new_w = int(1.0 * src_w * new_h / src_h)
        else:
            new_w = crop
            new_h = int(1.0 * src_h * new_w / src_w)
        picture = picture.resize((new_w, new_h), Image.BILINEAR)
        label = label.resize((new_w, new_h), Image.NEAREST)

        # center crop, identical window for image and label
        full_w, full_h = picture.size
        left = int(round((full_w - crop) / 2.))
        top = int(round((full_h - crop) / 2.))
        window = (left, top, left + crop, top + crop)

        return {'image': picture.crop(window), 'label': label.crop(window)}

class FixedResize(object):
    """Resize image and label to a fixed square of ``size`` x ``size``.

    Args:
        size: edge length of the output square. The constructor previously
            took no argument yet read an undefined ``size`` name, so it
            always raised ``NameError``.
    """

    def __init__(self, size):
        self.size = (size, size)  # square, so the (w, h) order is irrelevant

    def __call__(self, sample):
        img = sample['image']
        mask = sample['label']

        # Image and label must describe the same pixels before resizing.
        assert img.size == mask.size

        # Bilinear for the picture; nearest for the label so that no new
        # label values are interpolated in.
        img = img.resize(self.size, Image.BILINEAR)
        mask = mask.resize(self.size, Image.NEAREST)

        return {'image': img,
                'label': mask}

class FixedResize_test(object):
    """Label-free resize step that resamples the image at its current size.

    The target size is the image's own size, so the dimensions do not
    change; the call returns a dict holding only ``'image'``.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, sample):
        picture = sample['image']
        width, height = picture.size

        return {'image': picture.resize((width, height), Image.BILINEAR)}