Instance Segmentation of Biological Images with mmdetection, Part 1: Building a Custom Dataset

A complete walkthrough of instance segmentation of biological images with mmdetection

Chapter 1: Converting and Building a Custom Dataset



Preface

Recently I have been working on reconstruction tasks for organelles and other cellular structures, which requires comparing instance segmentation models such as Mask R-CNN (MRCNN) against semantic segmentation models common in biomedical imaging, such as nnUNet. Since mmdetection 3.3.0 changed several commonly used interfaces substantially, I am recording the full procedure here for later sharing and study.

This post uses the AC3/AC4 electron microscopy dataset, with a voxel resolution of 29 nm × 6 nm × 6 nm. It was imaged by SBEM from mouse cortex and is a subset of the Harvard Kasthuri15 data. The AC3 volume is 256 × 1024 × 1024 and the AC4 volume is 100 × 1024 × 1024; the data served as the SNEMI challenge data at ISBI 2013. The dataset is available at: https://lichtman.rc.fas.harvard.edu/vast/

1. Block Splitting and Writing of a Single Volume

mmdetection is primarily a framework for object detection on 2D images. Electron microscopy differs from common natural images in that a single 2D slice can reach tens or even hundreds of thousands of pixels per side, so here we split each slice into blocks.

We keep the downloaded volumes under 'Path/to/your/DataOri/AC3AC4' and write the block images to 'Path/to/your/DataSingle/AC3AC4'. Each 2D block is named stack_z_y_x.

# stack_z_y_x
import os
from os.path import join
import numpy as np
from skimage import io
from skimage import measure
from tqdm import tqdm


# Crop one 2D slice into tiles given [start, end] ranges along y and x;
# returns the tile arrays together with their 'y_x' name suffixes.
def write_single_img(img_array, y_range, x_range):
    img_part_list = []
    img_name_list = []
    for y_item_index in range(len(y_range)):
        for x_item_index in range(len(x_range)):
            y_item = y_range[y_item_index]
            x_item = x_range[x_item_index]
            img_part_list.append(img_array[y_item[0]:y_item[1], x_item[0]:x_item[1]].copy())
            img_name_list.append('%d_%d'%(y_item_index, x_item_index))
    return img_part_list, img_name_list

def ac3ac4_write(source_path = '/home/guojy2/share/guojy/SynDet2024/DataOri/AC3AC4',
                save_path = '/home/guojy2/share/guojy/SynDet2024/DataSingle/AC3AC4',
                y_range = [[0, 1024]],
                x_range = [[0, 1024]]):
    # default: keep the full 1024*1024 slice as a single tile
    # smaller example: y_range = [[0, 640], [383, 1024]], x_range = [[0, 640], [383, 1024]]
    save_images_path = join(save_path, 'images')
    save_labels_path = join(save_path, 'labels')
    os.makedirs(save_images_path, exist_ok=True)
    os.makedirs(save_labels_path, exist_ok=True)
    # ac3
    images_ac3 = io.imread(join(source_path, 'AC3Images.tif'))
    for layer in tqdm(range(images_ac3.shape[0])):
        img_part_list, img_name_list = write_single_img(images_ac3[layer], y_range, x_range)
        for img_index in range(len(img_part_list)):
            io.imsave(join(save_images_path, 'ac3_%d_'%layer + img_name_list[img_index] + '.tif'), img_part_list[img_index])
    
    labels_ac3 = io.imread(join(source_path, 'AC3Labels.tif'))
    for layer in tqdm(range(labels_ac3.shape[0])):
        # binarize: any non-zero instance id becomes foreground
        single_label_ac3 = (labels_ac3[layer].copy() > 0).astype(np.uint8)
        # single_label_ac3 = measure.label(single_label_ac3, connectivity=2).astype(np.uint16)
        img_part_list, img_name_list = write_single_img(single_label_ac3, y_range, x_range)
        for img_index in range(len(img_part_list)):
            io.imsave(join(save_labels_path, 'ac3_%d_'%layer + img_name_list[img_index] + '.tif'), img_part_list[img_index])
    # ac4
    images_ac4 = io.imread(join(source_path, 'AC4Images.tif'))
    for layer in tqdm(range(images_ac4.shape[0])):
        img_part_list, img_name_list = write_single_img(images_ac4[layer], y_range, x_range)
        for img_index in range(len(img_part_list)):
            io.imsave(join(save_images_path, 'ac4_%d_'%layer + img_name_list[img_index] + '.tif'), img_part_list[img_index])
    
    labels_ac4 = io.imread(join(source_path, 'AC4Labels.tif'))
    for layer in tqdm(range(labels_ac4.shape[0])):
        # binarize: any non-zero instance id becomes foreground
        single_label_ac4 = (labels_ac4[layer].copy() > 0).astype(np.uint8)
        # single_label_ac4 = measure.label(single_label_ac4, connectivity=2).astype(np.uint16)
        img_part_list, img_name_list = write_single_img(single_label_ac4, y_range, x_range)
        for img_index in range(len(img_part_list)):
            io.imsave(join(save_labels_path, 'ac4_%d_'%layer + img_name_list[img_index] + '.tif'), img_part_list[img_index])
    print('ac3ac4 ok')

if __name__ == '__main__':
    print('Begin AC3AC4...')
    ac3ac4_write(source_path = 'Path/to/your/DataOri/AC3AC4',
                save_path = 'Path/to/your/DataSingle/AC3AC4',
                y_range = [[0, 1024]],
                x_range = [[0, 1024]])
    print('all ok')
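The y_range/x_range lists above are hard-coded. If you want overlapping tiles instead of full 1024 × 1024 slices, a small helper along these lines can generate the ranges (make_tile_ranges is my own sketch, not part of the original pipeline):

def make_tile_ranges(length, tile=640, overlap=256):
    # Generate overlapping [start, end] ranges so that tiles of size `tile`
    # cover `length` pixels with at least `overlap` pixels shared between
    # neighbouring tiles; the last tile is aligned to the end of the axis.
    if tile >= length:
        return [[0, length]]
    step = tile - overlap
    starts = list(range(0, length - tile, step)) + [length - tile]
    return [[s, s + tile] for s in starts]

# make_tile_ranges(1024, tile=640, overlap=256) -> [[0, 640], [384, 1024]],
# which matches (to within a pixel) the smaller example in the comment above.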

After running the script, the output looks like this: images/ and labels/ hold files such as ac3_0_0_0.tif, ac3_1_0_0.tif, ..., with each image and its label sharing the same name.

2. Converting the 2D Blocks to COCO Format

Since mmdetection builds mainly on the COCO dataset format, the images and labels written above need to be converted; the script for this follows below.
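For orientation, here is a hand-written sketch of the COCO instances structure that the converter produces (one image, one polygon annotation, one category; all concrete values are illustrative):

coco_sketch = {
    "type": "instances",
    "images": [
        {"id": 0, "file_name": "ac3_0_0_0.tif", "width": 1024, "height": 1024},
    ],
    "annotations": [
        {
            "id": 1,                     # unique annotation id
            "image_id": 0,               # refers to "images"
            "category_id": 1,            # refers to "categories"
            "bbox": [412, 300, 35, 28],  # [x, y, w, h]
            "area": 512.0,
            "segmentation": [[412, 300, 447, 300, 447, 328]],  # flattened x, y polygon
            "iscrowd": 0,
        },
    ],
    "categories": [
        {"id": 1, "name": "synapse", "supercategory": "none"},
    ],
}

The conversion script itself: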

import json
import cv2
import os
from os.path import join
import shutil
import numpy as np

from tqdm import tqdm

def multyins_mask2coco(image_dir, mask_dir, output_path, my_label={"background": 0, "date": 1, "fig": 2, "hazelnut": 3}):
    # Initialize the COCO dictionary that will be saved
    coco = dict()
    coco['images'] = []
    coco['type'] = 'instances'
    coco['annotations'] = []
    coco['categories'] = []

    # Initialize bookkeeping variables
    image_set = set()
    category_item_id = 0
    annotation_id = 0
    # 1. Add COCO categories
    my_label = sorted(my_label.items(), key=lambda item: item[1])
    for val in my_label:
        category_item = dict()
        category_item['supercategory'] = 'none'
        category_item_id = val[1]
        if 0 == category_item_id:
            continue
        category_item['id'] = category_item_id
        category_item['name'] = val[0]
        coco['categories'].append(category_item)

    # 2. Add COCO images (load the image info); sort both listings so that
    # images and labels are paired by index
    imageListFile = sorted(os.listdir(image_dir))

    annotationListFile = sorted(os.listdir(mask_dir))
    assert len(imageListFile) == len(annotationListFile)

    for imageId in range(len(imageListFile)):
        assert imageListFile[imageId] == annotationListFile[imageId]

        annotationPath = join(mask_dir, annotationListFile[imageId])
        annotationGray = cv2.imread(annotationPath, -1)
        if len(annotationGray.shape) == 3:
            annotationGray = cv2.cvtColor(annotationGray, cv2.COLOR_BGR2GRAY)

        image_item = dict()
        image_item['id'] = imageId
        image_item['file_name'] = imageListFile[imageId]
        image_item['width'] = annotationGray.shape[1]
        image_item['height'] = annotationGray.shape[0]
        coco['images'].append(image_item)
        image_set.add(imageListFile[imageId])

        # 3. Add COCO annotations: one binary mask per foreground category
        # (assumes consecutive category ids, with 0 reserved for background)
        for current_category_id in range(1, len(my_label)):
            img_bi = np.zeros(annotationGray.shape, dtype='uint8')
            img_bi[annotationGray == current_category_id] = 255
            my_contours, _ = cv2.findContours(img_bi, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            for c in my_contours:
                area_t = cv2.contourArea(c)
                # filter out degenerate contours and tiny fragments
                # (a valid COCO polygon needs at least 3 points)
                if len(c) < 3 or area_t < 20:
                    continue
                L_pt = c
                # x,y,w,h
                bbox = cv2.boundingRect(L_pt)
                x1, y1, w1, h1 = bbox
                # skip boxes that extend beyond the image bounds
                if x1 < 0 or y1 < 0 or x1 + w1 > annotationGray.shape[1] or y1 + h1 > annotationGray.shape[0]:
                    continue
                seg = []
                for val in L_pt:
                    x = val[0][0]
                    y = val[0][1]
                    seg.append(int(x))
                    seg.append(int(y))

                bbox = list(bbox)

                annotation_item = dict()
                annotation_item['segmentation'] = []
                annotation_item['segmentation'].append(seg)

                annotation_item['area'] = area_t
                annotation_item['iscrowd'] = 0
                annotation_item['ignore'] = 0
                annotation_item['image_id'] = imageId
                annotation_item['bbox'] = bbox
                annotation_item['category_id'] = current_category_id
                annotation_id += 1
                annotation_item['id'] = annotation_id
                coco['annotations'].append(annotation_item)

    with open(output_path, 'w') as f:
        json.dump(coco, f)


def copy_file(source, destination):
    try:
        shutil.copy(source, destination)
        # print(f"File {source} copied to {destination}")
    except FileNotFoundError:
        print("FileNotFoundError")
        # print(f"File {source} does not exist")
    except PermissionError:
        print("PermissionError")
        # print(f"No permission to copy file {source}")

def ac3ac4coco(source_ac3ac4_path, save_ac3ac4_path):
    # split: AC3 slices 0-199 -> train, AC3 slices 200-255 -> val, AC4 (100 slices) -> test
    source_ac3ac4_images = join(source_ac3ac4_path, 'images')
    images_list_all = os.listdir(source_ac3ac4_images)
    source_ac3ac4_labels = join(source_ac3ac4_path, 'labels')
    labels_list_all = os.listdir(source_ac3ac4_labels)

    train_path = join(save_ac3ac4_path, 'train2017')
    os.makedirs(train_path, exist_ok=True)
    val_path = join(save_ac3ac4_path, 'val2017')
    os.makedirs(val_path, exist_ok=True)
    test_path = join(save_ac3ac4_path, 'test2017')
    os.makedirs(test_path, exist_ok=True)
    anno_path = join(save_ac3ac4_path, 'annotations')
    os.makedirs(anno_path, exist_ok=True)

    train_path_labels = join(save_ac3ac4_path, 'train_labels')
    os.makedirs(train_path_labels, exist_ok=True)
    val_path_labels = join(save_ac3ac4_path, 'val_labels')
    os.makedirs(val_path_labels, exist_ok=True)
    test_path_labels = join(save_ac3ac4_path, 'test_labels')
    os.makedirs(test_path_labels, exist_ok=True)

    images_list_train = list(filter(lambda x:x.split('_')[0]=='ac3' and int(x.split('_')[1])<200, images_list_all))
    images_list_val = list(filter(lambda x:x.split('_')[0]=='ac3' and int(x.split('_')[1])>=200, images_list_all))
    images_list_test = list(filter(lambda x:x.split('_')[0]=='ac4', images_list_all))

    labels_list_train = list(filter(lambda x:x.split('_')[0]=='ac3' and int(x.split('_')[1])<200, labels_list_all))
    labels_list_val = list(filter(lambda x:x.split('_')[0]=='ac3' and int(x.split('_')[1])>=200, labels_list_all))
    labels_list_test = list(filter(lambda x:x.split('_')[0]=='ac4', labels_list_all))

    for image_name in tqdm(images_list_train):
        copy_file(join(source_ac3ac4_images, image_name), train_path)
    for image_name in tqdm(images_list_val):
        copy_file(join(source_ac3ac4_images, image_name), val_path)
    for image_name in tqdm(images_list_test):
        copy_file(join(source_ac3ac4_images, image_name), test_path)

    for image_name in tqdm(labels_list_train):
        copy_file(join(source_ac3ac4_labels, image_name), train_path_labels)
    for image_name in tqdm(labels_list_val):
        copy_file(join(source_ac3ac4_labels, image_name), val_path_labels)
    for image_name in tqdm(labels_list_test):
        copy_file(join(source_ac3ac4_labels, image_name), test_path_labels)

    category_id_to_name = {"background":0, "synapse":1}
    multyins_mask2coco(image_dir=train_path, mask_dir=train_path_labels, 
                       output_path=join(anno_path, "instances_train2017.json"), 
                       my_label=category_id_to_name)
    multyins_mask2coco(image_dir=val_path, mask_dir=val_path_labels, 
                       output_path=join(anno_path, "instances_val2017.json"), 
                       my_label=category_id_to_name)
    multyins_mask2coco(image_dir=test_path, mask_dir=test_path_labels, 
                       output_path=join(anno_path, "instances_test2017.json"), 
                       my_label=category_id_to_name)
    print('ac3ac4 ok')

if __name__ == '__main__':
    source_path = 'Path/to/your/DataSingle/'
    save_path = 'Path/to/your/DataCOCO/'
    ac3ac4coco(source_ac3ac4_path=join(source_path, 'AC3AC4'), save_ac3ac4_path=join(save_path, 'AC3AC4'))

    print('all ok')
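Once the JSON files are written, it is worth sanity-checking them with pycocotools before training. A minimal check, assuming the output paths from the script above:

from pycocotools.coco import COCO

# Load the generated annotation file and print a few statistics.
coco = COCO('Path/to/your/DataCOCO/AC3AC4/annotations/instances_train2017.json')
print('images:', len(coco.getImgIds()))
print('annotations:', len(coco.getAnnIds()))
print('categories:', coco.loadCats(coco.getCatIds()))

# Inspect the first image together with its annotations.
img_id = coco.getImgIds()[0]
print(coco.loadImgs(img_id))
print(coco.loadAnns(coco.getAnnIds(imgIds=img_id)))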

The converted directory contains train2017/, val2017/ and test2017/ with the images, annotations/ with the three JSON files, and train_labels/, val_labels/ and test_labels/ with the raw label blocks.

Summary

The dataset conversion itself is fairly simple, but the process is full of pitfalls. For example, key-value mappings like the one below

category_id_to_name = {"background": 0, "synapse": 1}

must match the label names and label ids configured for the dataset later on; otherwise the annotations will not line up, and training can fail with errors such as

need at least one array to concatenate

and similar failures.
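In mmdetection 3.x this matching happens through the dataset's metainfo. A sketch of the relevant config fragment for this single-class dataset (paths and palette are illustrative, and the fragment omits the dataloader's other keys):

# Class names must line up with the category ids in the COCO json
# (id 1 -> 'synapse'); background is implicit and gets no entry.
metainfo = dict(classes=('synapse',), palette=[(220, 20, 60)])

data_root = 'Path/to/your/DataCOCO/AC3AC4/'
train_dataloader = dict(
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        metainfo=metainfo,
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
    ))

If classes does not match the categories in the JSON, the loader filters out every annotation, and training typically aborts with the concatenate error above.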
