mmdetection的生物图像实例分割全流程记录
第一章 自定义数据集的转换与制作
前言
最近尝试做一些细胞器,细胞结构的重建任务,需要验证一些实例分割模型,例如MRCNN,和生物医学图像中常用的nnUNet等语义分割模型的优劣。而在mmdetection 3.3.0中,一些常用的接口进行了较大的改变,这里索性记录步骤,方便后续的分享与学习。
本次的数据集选择了AC3AC4电镜数据集,分辨率为 $29\,nm \times 6\,nm \times 6\,nm$,成像方式是SBEM,样本取自小鼠大脑皮层,是哈佛大学Kasthuri15数据的子集。 其中AC3体块大小为 $256 \times 1024 \times 1024$,AC4体块大小为 $100 \times 1024 \times 1024$,曾在ISBI2013数据集中作为SNEMI挑战赛的数据。数据集的链接为:https://lichtman.rc.fas.harvard.edu/vast/
一、单个体积数据的分块划分与写入
mmdetection主要使用的还是2D图像的目标检测框架。对于电镜数据来说,其单张2D图像尺寸可能与通用的自然图像不同,长和宽经常会达到上万甚至上十万的像素规模。为此这里对图像进行分块处理。
我们将下载的图像存放在’Path/to/your/DataOri/AC3AC4’, 准备写入的图像存放在’Path/to/your/DataSingle/AC3AC4’, 每个2D图像的命名方式为 stack_z_y_x
# stack_z_y_x
import os
from os.path import join
import cv2
import numpy as np
from skimage import io
from skimage import measure
from tqdm import tqdm
def write_single_img(img_array, y_range, x_range):
    """Cut a 2-D array into rectangular tiles.

    Args:
        img_array: 2-D array (H, W) to be tiled.
        y_range: list of [start, stop] row intervals.
        x_range: list of [start, stop] column intervals.

    Returns:
        (tiles, names): independent copies of each tile and the matching
        'yIndex_xIndex' name strings, in row-major tile order.
    """
    tiles = []
    names = []
    for yi, ys in enumerate(y_range):
        for xi, xs in enumerate(x_range):
            tiles.append(img_array[ys[0]:ys[1], xs[0]:xs[1]].copy())
            names.append('%d_%d' % (yi, xi))
    return tiles, names
def _split_and_save(volume, out_dir, prefix, y_range, x_range, binarize=False):
    """Tile every layer of a 3-D stack and save each tile as a TIFF.

    File names follow the '<prefix>_<layer>_<y>_<x>.tif' convention.
    When binarize is True the layer is thresholded (>0) to uint8 first,
    which is how the instance labels are turned into foreground masks.
    """
    for layer in tqdm(range(volume.shape[0])):
        plane = volume[layer]
        if binarize:
            plane = (plane.copy() > 0).astype(np.uint8)
        tiles, names = write_single_img(plane, y_range, x_range)
        for tile, name in zip(tiles, names):
            io.imsave(join(out_dir, '%s_%d_%s.tif' % (prefix, layer, name)), tile)


def ac3ac4_write(source_path='/home/guojy2/share/guojy/SynDet2024/DataOri/AC3AC4',
                 save_path='/home/guojy2/share/guojy/SynDet2024/DataSingle/AC3AC4',
                 y_range=None,
                 x_range=None):
    """Split the AC3/AC4 EM volumes into per-layer 2-D image/label TIFF tiles.

    Reads ACxImages.tif / ACxLabels.tif from source_path and writes tiles
    under save_path/images and save_path/labels.

    Args:
        source_path: folder holding AC3Images.tif, AC3Labels.tif,
            AC4Images.tif, AC4Labels.tif.
        save_path: output root; 'images' and 'labels' subfolders are created.
        y_range / x_range: [start, stop] tile intervals; default one full
            1024x1024 tile. (None defaults avoid mutable default arguments.)
            smaller example: y_range = [[0, 640], [383, 1024]],
                             x_range = [[0, 640], [383, 1024]]
    """
    if y_range is None:
        y_range = [[0, 1024]]
    if x_range is None:
        x_range = [[0, 1024]]
    save_images_path = join(save_path, 'images')
    save_labels_path = join(save_path, 'labels')
    os.makedirs(save_images_path, exist_ok=True)
    os.makedirs(save_labels_path, exist_ok=True)
    # Same processing for both stacks: raw images as-is, labels binarized.
    for stack in ('ac3', 'ac4'):
        images = io.imread(join(source_path, stack.upper() + 'Images.tif'))
        _split_and_save(images, save_images_path, stack, y_range, x_range)
        labels = io.imread(join(source_path, stack.upper() + 'Labels.tif'))
        _split_and_save(labels, save_labels_path, stack, y_range, x_range,
                        binarize=True)
    print('ac3ac4 ok')
if __name__ == '__main__':
    # Tile the raw AC3/AC4 stacks into single 1024x1024 2-D TIFFs.
    print('Begin AC3AC4...')
    full_tile = [[0, 1024]]
    ac3ac4_write(source_path='Path/to/your/DataOri/AC3AC4',
                 save_path='Path/to/your/DataSingle/AC3AC4',
                 y_range=full_tile,
                 x_range=full_tile)
    print('all ok')
运行后,图像的格式是这样的,图像与标签同名:
二、分块2D图像的coco数据集格式转化
由于 mmdetection 主要使用coco数据集作为基础框架,因此需要将原有的Image和Label图像进行转换,这里给出转换脚本:
import json
import cv2
import os
from os.path import join
import shutil
import numpy as np
from PIL import Image
from pycocotools import mask
from tqdm import tqdm
def multyins_mask2coco(image_dir, mask_dir, output_path, my_label=None):
    """Convert paired image/gray-mask folders into a COCO 'instances' JSON.

    Each pixel value in a mask is a category id; every external contour of
    a category becomes one instance annotation (polygon segmentation,
    bbox, area).

    Args:
        image_dir: folder of images; file names must match mask_dir entries.
        mask_dir: folder of gray masks (pixel value == category id).
        output_path: path of the COCO JSON file to write.
        my_label: mapping of category name -> id; id 0 is the background
            and is skipped. None defaults to the demo mapping (avoids a
            mutable default argument).
    """
    if my_label is None:
        my_label = {"background": 0, "date": 1, "fig": 2, "hazelnut": 3}
    # Initialize the COCO dictionary to be saved.
    coco = {'images': [], 'type': 'instances', 'annotations': [], 'categories': []}
    annotation_id = 0
    # 1. COCO categories (background / id 0 is not a COCO category).
    label_items = sorted(my_label.items(), key=lambda item: item[1])
    for name, category_id in label_items:
        if category_id == 0:
            continue
        coco['categories'].append({'supercategory': 'none',
                                   'id': category_id,
                                   'name': name})
    # 2. COCO images. Sort both listings so images and masks pair up by
    # name regardless of the arbitrary os.listdir order.
    image_files = sorted(os.listdir(image_dir))
    mask_files = sorted(os.listdir(mask_dir))
    assert len(image_files) == len(mask_files)
    for image_id in range(len(image_files)):
        # The original code had a no-op '==' comparison here; it was
        # clearly meant to verify the image/mask pairing.
        assert image_files[image_id] == mask_files[image_id]
        mask_gray = cv2.imread(join(mask_dir, mask_files[image_id]), -1)
        if len(mask_gray.shape) == 3:
            mask_gray = cv2.cvtColor(mask_gray, cv2.COLOR_BGR2GRAY)
        coco['images'].append({'id': image_id,
                               'file_name': image_files[image_id],
                               'width': mask_gray.shape[1],
                               'height': mask_gray.shape[0]})
        # 3. COCO annotations: one per external contour of each category.
        for current_category_id in range(1, len(label_items)):
            img_bi = np.zeros(mask_gray.shape, dtype='uint8')
            img_bi[mask_gray == current_category_id] = 255
            contours, _ = cv2.findContours(img_bi, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_NONE)
            for contour in contours:
                area_t = cv2.contourArea(contour)
                # Drop degenerate / tiny regions (area threshold = 20 px).
                if 0 == len(contour) or area_t < 20:
                    continue
                # x, y, w, h
                x1, y1, w1, h1 = cv2.boundingRect(contour)
                # Ignore boxes that exceed the image bounds.
                if x1 < 0 or y1 < 0 or x1 + w1 > mask_gray.shape[1] or y1 + h1 > mask_gray.shape[0]:
                    continue
                # Flatten contour points into COCO polygon format
                # [x0, y0, x1, y1, ...].
                seg = []
                for point in contour:
                    seg.append(int(point[0][0]))
                    seg.append(int(point[0][1]))
                annotation_id += 1
                coco['annotations'].append({'segmentation': [seg],
                                            'area': area_t,
                                            'iscrowd': 0,
                                            'ignore': 0,
                                            'image_id': image_id,
                                            'bbox': [x1, y1, w1, h1],
                                            'category_id': current_category_id,
                                            'id': annotation_id})
    # Close the output file deterministically (the original leaked the handle).
    with open(output_path, 'w') as f:
        json.dump(coco, f)
def copy_file(source, destination):
    """Best-effort file copy.

    A missing source or a permission problem is reported on stdout instead
    of raised, so that batch copies keep going.
    """
    try:
        shutil.copy(source, destination)
    except (FileNotFoundError, PermissionError) as err:
        # Print only the exception class name, matching the original output.
        print(type(err).__name__)
def ac3ac4coco(source_ac3ac4_path, save_ac3ac4_path):
    """Arrange the tiled AC3/AC4 images into a COCO-style dataset.

    Split: AC3 layers 0-199 -> train, AC3 layers 200-255 -> val,
    all AC4 layers -> test. Images go to <split>2017/, masks to
    <split>_labels/, and one instances_<split>2017.json is written per
    split under annotations/.

    Args:
        source_ac3ac4_path: folder with 'images' and 'labels' subfolders of
            tiles named 'stack_z_y_x.tif'.
        save_ac3ac4_path: output root for the COCO layout.
    """
    source_images = join(source_ac3ac4_path, 'images')
    source_labels = join(source_ac3ac4_path, 'labels')
    images_all = os.listdir(source_images)
    labels_all = os.listdir(source_labels)
    anno_path = join(save_ac3ac4_path, 'annotations')
    os.makedirs(anno_path, exist_ok=True)
    # Must agree with the label ids used when the masks were produced.
    category_id_to_name = {"background": 0, "synapse": 1}

    def _in_split(name, split):
        """Membership test for one file name, based on 'stack_layer_...' parts."""
        stack = name.split('_')[0]
        layer = int(name.split('_')[1])
        if split == 'train':
            return stack == 'ac3' and layer < 200
        if split == 'val':
            return stack == 'ac3' and layer >= 200
        return stack == 'ac4'  # test

    # ac3 200 train, 56 val; ac4 100 test.
    for split in ('train', 'val', 'test'):
        img_dir = join(save_ac3ac4_path, split + '2017')
        lbl_dir = join(save_ac3ac4_path, split + '_labels')
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(lbl_dir, exist_ok=True)
        for name in tqdm([n for n in images_all if _in_split(n, split)]):
            copy_file(join(source_images, name), img_dir)
        for name in tqdm([n for n in labels_all if _in_split(n, split)]):
            copy_file(join(source_labels, name), lbl_dir)
        multyins_mask2coco(image_dir=img_dir, mask_dir=lbl_dir,
                           output_path=join(anno_path, 'instances_%s2017.json' % split),
                           my_label=category_id_to_name)
    print('ac3ac4 ok')
if __name__ == '__main__':
    # Convert the tiled AC3/AC4 images + labels into a COCO-format dataset.
    ac3ac4coco(source_ac3ac4_path=join('Path/to/your/DataSingle/', 'AC3AC4'),
               save_ac3ac4_path=join('Path/to/your/DataCOCO/', 'AC3AC4'))
    print('all ok')
转换后的文件夹如下:
总结
数据集的转换比较简单,但也是坑比较多的过程,例如有些键值对,例如下面的
category_id_to_name = {“background”:0, “synapse”:1}
注意要与后面数据集的label name,label id相匹配,否则可能会出现标签不匹配,或者
need at least one array to concatenate mmdetection
等错误。