如何使用COCO数据集
COCO数据集可以说是语义分割等计算机视觉任务中应用较为广泛的一个数据集,具体可以应用到物体识别、语义分割及目标检测等方面。我是在做语义分割方面任务时用到了COCO数据集,但本文主要讲解的是数据载入方面,因此可以通用。
一、下载COCO数据集
首先,我们要下载COCO数据集,本文主要使用的是COCO2014和COCO2017,因为是国外数据集,因此下载需要翻墙下载。
MSCOCO数据集的官网为:http://mscoco.org/
具体来说,如果想只下载COCO2017/COCO2014的话,可以不需要翻墙下载,复制以下链接打开迅雷等下载软件下载即可,网速还可以。
COCO2017 训练数据:http://images.cocodataset.org/zips/train2017.zip
http://images.cocodataset.org/annotations/annotations_trainval2017.zip
COCO2017验证数据:http://images.cocodataset.org/zips/val2017.zip
http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
COCO2017测试数据集:http://images.cocodataset.org/zips/test2017.zip
http://images.cocodataset.org/annotations/image_info_test2017.zip
COCO2014的相关数据只需要将以上链接中的7改成4即可。
二、COCO数据集介绍
网上关于COCO数据集的介绍多如牛毛,本文就不过多的加以介绍了,简要的介绍以下。
以COCO2014为例:
下载完COCO2014后进行解压后,目录如下:
- images
- train2014
- val2014
- test2014
- annotations
其中,images中的文件夹各自放置了训练、验证和测试的数据集图片。annotations文件夹中放置了标签文件,可以理解为Label,简要的来说,就是包含了某一类在图片中的具体位置的信息,详细可见以下链接:https://blog.csdn.net/happyhorizion/article/details/77894205#semantic-scene-labeling图像分割
三、COCO数据集使用(数据载入)
所需环境为:
- numpy
- torch
- tqdm(可视化数据载入)
- os
- pycocotools(coco数据集的应用API)
- torchvision
- PIL
如何安装pycocotools
相信能用到COCO数据集做语义分割等任务的大佬们应该都能安装以上绝大多数库,这里主要讲一下如何安装pycocotools库。作者在安装这个库的时候遇到了一些问题,不过及时的解决了。
步骤如下:
- 首先下载cocoapi,在终端输入
git clone git@github.com:lucky-ing/cocoapi.git
- 此时可以看到一个叫coco的文件夹,进入coco/PythonAPI中,懒人操作如下:
cd coco/PythonAPI
- 开始安装,在终端输入以下命令
如果使用的是python2:
python setup.py build_ext install
如果使用的是python3
python3 setup.py build_ext install
- 如果一切顺利,安装完成,即可进入下一章节具体使用,作者在安装时遇到了以下问题。
error: command 'C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64\cl.exe' failed with exit status 2
解决方法很简单,在终端安装cython即可,在终端输入:
conda install cython
若是没有使用conda,在终端输入
pip install cython
COCO数据集的载入
- dataloader
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import trange
import os
from pycocotools.coco import COCO
from pycocotools import mask
from torchvision import transforms
import custom_transforms as tr
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
class COCOSegmentation(Dataset):
NUM_CLASSES = 21
CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
1, 64, 20, 63, 7, 72]
def __init__(self,
args,
base_dir=’./Path/COCO/‘,
split='train',
year='2014'):
super().__init__()
ann_file = os.path.join(base_dir, 'annotations/instances_{}{}.json'.format(split, year))
ids_file = os.path.join(base_dir, 'annotations/{}_ids_{}.pth'.format(split, year))
self.img_dir = os.path.join(base_dir, 'images/{}{}'.format(split, year))
self.split = split
self.coco = COCO(ann_file)
self.coco_mask = mask
if os.path.exists(ids_file):
self.ids = torch.load(ids_file)
else:
ids = list(self.coco.imgs.keys())
self.ids = self._preprocess(ids, ids_file)
self.args = args
def __getitem__(self, index):
_img, _target = self._make_img_gt_point_pair(index)
sample = {'image': _img, 'label': _target}
if self.split == "train":
return self.transform_tr(sample)
elif self.split == 'val':
return self.transform_val(sample)
def _make_img_gt_point_pair(self, index):
coco = self.coco
img_id = self.ids[index]
img_metadata = coco.loadImgs(img_id)[0]
path = img_metadata['file_name']
_img = Image.open(os.path.join(self.img_dir, path)).convert('RGB')
cocotarget = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
_target = Image.fromarray(self._gen_seg_mask(
cocotarget, img_metadata['height'], img_metadata['width']))
return _img, _target
def _preprocess(self, ids, ids_file):
print("Preprocessing mask, this will take a while. " + \
"But don't worry, it only run once for each split.")
tbar = trange(len(ids))
new_ids = []
for i in tbar:
img_id = ids[i]
cocotarget = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
img_metadata = self.coco.loadImgs(img_id)[0]
mask = self._gen_seg_mask(cocotarget, img_metadata['height'],
img_metadata['width'])
# more than 1k pixels
if (mask > 0).sum() > 1000:
new_ids.append(img_id)
tbar.set_description('Doing: {}/{}, got {} qualified images'. \
format(i, len(ids), len(new_ids)))
print('Found number of qualified images: ', len(new_ids))
torch.save(new_ids, ids_file)
return new_ids
def _gen_seg_mask(self, target, h, w):
mask = np.zeros((h, w), dtype=np.uint8)
coco_mask = self.coco_mask
for instance in target:
rle = coco_mask.frPyObjects(instance['segmentation'], h, w)
m = coco_mask.decode(rle)
cat = instance['category_id']
if cat in self.CAT_LIST:
c = self.CAT_LIST.index(cat)
else:
continue
if len(m.shape) < 3:
mask[:, :] += (mask == 0) * (m * c)
else:
mask[:, :] += (mask == 0) * (((np.sum(m, axis=2)) > 0) * c).astype(np.uint8)
return mask
def transform_tr(self, sample):
composed_transforms = transforms.Compose([
tr.RandomHorizontalFlip(),
tr.RandomScaleCrop(base_size=self.args.base_size, crop_size=self.args.crop_size),
tr.RandomGaussianBlur(),
tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
tr.ToTensor()])
return composed_transforms(sample)
def transform_val(self, sample):
composed_transforms = transforms.Compose([
tr.FixScaleCrop(crop_size=self.args.crop_size),
tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
tr.ToTensor()])
return composed_transforms(sample)
def __len__(self):
return len(self.ids)
if __name__ == "__main__":
from dataloaders import custom_transforms as tr
from dataloaders.utils import decode_segmap
from torch.utils.data import DataLoader
from torchvision import transforms
import matplotlib.pyplot as plt
import argparse
parser = argparse.ArgumentParser()
args = parser.parse_args()
args.base_size = 513
args.crop_size = 513
coco_val = COCOSegmentation(args, split='val', year='2017')
dataloader = DataLoader(coco_val, batch_size=4, shuffle=True, num_workers=0)
for ii, sample in enumerate(dataloader):
for jj in range(sample["image"].size()[0]):
img = sample['image'].numpy()
gt = sample['label'].numpy()
tmp = np.array(gt[jj]).astype(np.uint8)
segmap = decode_segmap(tmp, dataset='coco')
img_tmp = np.transpose(img[jj], axes=[1, 2, 0])
img_tmp *= (0.229, 0.224, 0.225)
img_tmp += (0.485, 0.456, 0.406)
img_tmp *= 255.0
img_tmp = img_tmp.astype(np.uint8)
plt.figure()
plt.title('display')
plt.subplot(211)
plt.imshow(img_tmp)
plt.subplot(212)
plt.imshow(segmap)
if ii == 1:
break
plt.show(block=True)
下面的main函数为测试使用。
- custom_transforms.py 是数据增广的代码
import torch
import random
import numpy as np
from PIL import Image, ImageOps, ImageFilter
class Normalize(object):
"""Normalize a tensor image with mean and standard deviation.
Args:
mean (tuple): means for each channel.
std (tuple): standard deviations for each channel.
"""
def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
self.mean = mean
self.std = std
def __call__(self, sample):
img = sample['image']
mask = sample['label']
img = np.array(img).astype(np.float32)
mask = np.array(mask).astype(np.float32)
img /= 255.0
img -= self.mean
img /= self.std
return {'image': img,
'label': mask}
class Normalize_test(object):
"""Normalize a tensor image with mean and standard deviation.
Args:
mean (tuple): means for each channel.
std (tuple): standard deviations for each channel.
"""
def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
self.mean = mean
self.std = std
def __call__(self, sample):
img = sample['image']
#mask = sample['label']
img = np.array(img).astype(np.float32)
#mask = np.array(mask).astype(np.float32)
img /= 255.0
img -= self.mean
img /= self.std
return {'image': img}
class ToTensor(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample):
# swap color axis because
# numpy image: H x W x C
# torch image: C X H X W
img = sample['image']
mask = sample['label']
img = np.array(img).astype(np.float32).transpose((2, 0, 1))
mask = np.array(mask).astype(np.float32)
img = torch.from_numpy(img).float()
mask = torch.from_numpy(mask).float()
return {'image': img,
'label': mask}
class ToTensor_test(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample):
# swap color axis because
# numpy image: H x W x C
# torch image: C X H X W
img = sample['image']
#mask = sample['label']
img = np.array(img).astype(np.float32).transpose((2, 0, 1))
#mask = np.array(mask).astype(np.float32)
img = torch.from_numpy(img).float()
#mask = torch.from_numpy(mask).float()
return {'image': img}
class RandomHorizontalFlip(object):
def __call__(self, sample):
img = sample['image']
mask = sample['label']
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
return {'image': img,
'label': mask}
class RandomRotate(object):
def __init__(self, degree):
self.degree = degree
def __call__(self, sample):
img = sample['image']
mask = sample['label']
rotate_degree = random.uniform(-1*self.degree, self.degree)
img = img.rotate(rotate_degree, Image.BILINEAR)
mask = mask.rotate(rotate_degree, Image.NEAREST)
return {'image': img,
'label': mask}
class RandomGaussianBlur(object):
def __call__(self, sample):
img = sample['image']
mask = sample['label']
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return {'image': img,
'label': mask}
class RandomScaleCrop(object):
def __init__(self, base_size, crop_size, fill=0):
self.base_size = base_size
self.crop_size = crop_size
self.fill = fill
def __call__(self, sample):
img = sample['image']
mask = sample['label']
# random scale (short edge)
short_size = random.randint(int(self.base_size * 0.5), int(self.base_size * 2.0))
w, h = img.size
if h > w:
ow = short_size
oh = int(1.0 * h * ow / w)
else:
oh = short_size
ow = int(1.0 * w * oh / h)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# pad crop
if short_size < self.crop_size:
padh = self.crop_size - oh if oh < self.crop_size else 0
padw = self.crop_size - ow if ow < self.crop_size else 0
img = ImageOps.expand(img, border=(0, 0, padw, padh), fill=0)
mask = ImageOps.expand(mask, border=(0, 0, padw, padh), fill=self.fill)
# random crop crop_size
w, h = img.size
x1 = random.randint(0, w - self.crop_size)
y1 = random.randint(0, h - self.crop_size)
img = img.crop((x1, y1, x1 + self.crop_size, y1 + self.crop_size))
mask = mask.crop((x1, y1, x1 + self.crop_size, y1 + self.crop_size))
return {'image': img,
'label': mask}
class FixScaleCrop(object):
def __init__(self, crop_size):
self.crop_size = crop_size
def __call__(self, sample):
img = sample['image']
mask = sample['label']
w, h = img.size
if w > h:
oh = self.crop_size
ow = int(1.0 * w * oh / h)
else:
ow = self.crop_size
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - self.crop_size) / 2.))
y1 = int(round((h - self.crop_size) / 2.))
img = img.crop((x1, y1, x1 + self.crop_size, y1 + self.crop_size))
mask = mask.crop((x1, y1, x1 + self.crop_size, y1 + self.crop_size))
return {'image': img,
'label': mask}
class FixedResize(object):
def __init__(self):
self.size = (size, size) # size: (h, w)
def __call__(self, sample):
img = sample['image']
mask = sample['label']
assert img.size == mask.size
img = img.resize(self.size, Image.BILINEAR)
mask = mask.resize(self.size, Image.NEAREST)
return {'image': img,
'label': mask}
class FixedResize_test(object):
def __init__(self):
super().__init__()
#self.size = (size, size) # size: (h, w)
def __call__(self, sample):
img = sample['image']
w, h = img.size
#mask = sample['label']
#assert img.size == mask.size
img = img.resize(img.size, Image.BILINEAR)
#mask = mask.resize(self.size, Image.NEAREST)
return {'image': img}
将以上两个文件加入到你的代码中,就完成了COCO数据集的载入啦~