3D Pneumonia Segmentation with nnUNetv1
This is a model I ran for a national undergraduate innovation project last September. It worked quite well, so here is a brief review and write-up.
GitHub: MIC-DKFZ/nnUNet, nnunetv1 branch (github.com)
1. Preparing the dataset
First create a folder named DATASET inside the nnUNetFrame folder to hold the data.
Inside DATASET, create three folders: nnUNet_raw, nnUNet_preprocessed, and nnUNet_trained_models, as shown in the figure.
Then go into nnUNet_raw and create two folders, nnUNet_cropped_data and nnUNet_raw_data: nnUNet_raw_data holds the original data, while nnUNet_cropped_data holds the data after cropping.
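The resulting directory layout is:
DATASET/
├── nnUNet_raw/
│   ├── nnUNet_cropped_data/
│   └── nnUNet_raw_data/
├── nnUNet_preprocessed/
└── nnUNet_trained_models/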
Inside nnUNet_raw_data, create a folder named Task66_Pneumonia (the initial name does not matter much, because the nnUNet_convert_decathlon_task command will later convert it to the standard form Task066_Pneumonia).
My raw data are nii.gz CT volumes organized in directories named after the pathogen type, with file names built from patient ID, date, series, and so on. I therefore wrote a script, dataset_process.py, that splits the data into a training set (imagesTr, labelsTr) and a test set (imagesTs, labelsTs). The code is as follows:
# -*- coding: utf-8 -*-
"""
Created on 2023/9/25 15:47
@author: zhengjie
"""
import glob
import os
import os.path as osp
import shutil

import nibabel
import numpy as np
from sklearn.model_selection import train_test_split

root_dir = '/x32001067/pneumonia'
print("Processing data from {0}".format(root_dir))

# output directory (the task folder created above)
output_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia'
os.makedirs(output_dir, exist_ok=True)

# new subdirectories
output_imagesTr = osp.join(output_dir, 'imagesTr')
output_imagesTs = osp.join(output_dir, 'imagesTs')
output_labelsTr = osp.join(output_dir, 'labelsTr')
output_labelsTs = osp.join(output_dir, 'labelsTs')

# create them if they do not exist
os.makedirs(output_imagesTr, exist_ok=True)
os.makedirs(output_imagesTs, exist_ok=True)
os.makedirs(output_labelsTr, exist_ok=True)
os.makedirs(output_labelsTs, exist_ok=True)

im_list = []
# collect every .nii.gz file, recursing into all subfolders
nii_files = glob.glob(osp.join(root_dir, '**', '*.nii.gz'), recursive=True)
for nii_file in nii_files:
    im_file = nii_file.replace('_mask', '')  # image volume
    mask_file = nii_file                     # segmentation mask
    pid = osp.basename(im_file).split('_')[0]  # patient ID
    clinic_file = osp.join(osp.dirname(im_file), pid + '_clinics.xlsx')
    ct_observ_file = osp.join(osp.dirname(im_file), pid + '_ct_obervations.xlsx')

    # prepend the pneumonia type (the parent folder name) to the file name
    file_name = osp.basename(im_file)
    dir_name = osp.dirname(im_file)
    parts = dir_name.split('/')  # split on the path separator
    folder_name = parts[-1]
    im_file_name = f'{folder_name}_{file_name}'

    file_name = osp.basename(mask_file)
    dir_name = osp.dirname(mask_file)
    parts = dir_name.split('/')  # split on the path separator
    folder_name = parts[-1]
    mask_file_name = f'{folder_name}_{file_name}'

    info = (im_file, mask_file, clinic_file, ct_observ_file, im_file_name, mask_file_name)

    # skip empty files
    if os.path.getsize(im_file) == 0 or os.path.getsize(mask_file) == 0:
        print("{0} File is empty.".format(im_file))
        continue
    # skip files that cannot be opened
    try:
        volume = nibabel.load(im_file).get_fdata()
    except Exception as e:
        print("{0} File is broken.".format(im_file))
        continue
    mask = nibabel.load(mask_file).get_fdata()
    mask_array = np.array(mask)
    # keep only binary masks with at least 100 foreground voxels
    # (in practice this also skips the plain image files, whose maximum intensity exceeds 1)
    if np.max(mask_array) == 0 or np.max(mask_array) > 1 or np.count_nonzero(mask_array) < 100:
        print("{0} mask file is incompatible.".format(im_file))
        continue
    im_list.append(info)

print('Total samples:', len(im_list))

# split into training and test sets
train_size = 0.8  # 80% for training
train_list, test_list = train_test_split(im_list, train_size=train_size, random_state=42)
for info in train_list:
    shutil.copy2(info[0], osp.join(output_imagesTr, info[-2]))
    shutil.copy2(info[1], osp.join(output_labelsTr, info[-1]))
for info in test_list:
    shutil.copy2(info[0], osp.join(output_imagesTs, info[-2]))
    shutil.copy2(info[1], osp.join(output_labelsTs, info[-1]))
print("Train samples:", len(train_list))
print("Test samples:", len(test_list))
print('finished')
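After this script runs, imagesTr and imagesTs contain the CT volumes renamed to <pathogen-folder>_<original-file-name>.nii.gz, and labelsTr and labelsTs contain the matching ..._mask.nii.gz masks. (The clinical and CT-observation spreadsheets are only located here, not copied.)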
2. Generating the dataset.json file
Part of the code has to be adapted to your own paths and labels.
Run utils.py:
# -*- coding: utf-8 -*-
"""
Created on 2023/9/26 10:02
@author: zhengjie
"""
import os
from typing import Tuple

import numpy as np
from batchgenerators.utilities.file_and_folder_operations import *


def get_identifiers_from_splitted_files(folder: str):
    uniques = np.unique([i[:-7] for i in subfiles(folder, suffix='.nii.gz', join=False)])
    return uniques


def generate_dataset_json(output_file: str, imagesTr_dir: str, imagesTs_dir: str, modalities: Tuple,
                          labels: dict, dataset_name: str, sort_keys=True, license: str = "hands off!",
                          dataset_description: str = "",
                          dataset_reference="", dataset_release='0.0'):
    """
    :param output_file: path of the task folder (the one containing the imagesTr and labelsTr
        subfolders); dataset.json will be written into it
    :param imagesTr_dir: path to the imagesTr folder of that dataset
    :param imagesTs_dir: path to the imagesTs folder of that dataset. Can be None
    :param modalities: tuple of strings with modality names. Must be in the same order as the images (first entry
        corresponds to _0000.nii.gz, etc). Example: ('T1', 'T2', 'FLAIR').
    :param labels: dict with int->str (key->value) mapping the label IDs to label names. Note that 0 is always
        supposed to be background! Example: {0: 'background', 1: 'edema', 2: 'enhancing tumor'}
    :param dataset_name: the name of the dataset. Can be anything you want
    :param sort_keys: whether to sort the keys in dataset.json
    :param license:
    :param dataset_description:
    :param dataset_reference: website of the dataset, if available
    :param dataset_release:
    :return:
    """
    train_identifiers = get_identifiers_from_splitted_files(imagesTr_dir)

    if imagesTs_dir is not None:
        test_identifiers = get_identifiers_from_splitted_files(imagesTs_dir)
    else:
        test_identifiers = []

    json_dict = {}
    json_dict['name'] = dataset_name
    json_dict['description'] = dataset_description
    json_dict['tensorImageSize'] = "3D"
    json_dict['reference'] = dataset_reference
    json_dict['licence'] = license
    json_dict['release'] = dataset_release
    json_dict['modality'] = {str(i): modalities[i] for i in range(len(modalities))}
    json_dict['labels'] = {str(i): labels[i] for i in labels.keys()}

    json_dict['numTraining'] = len(train_identifiers)
    json_dict['numTest'] = len(test_identifiers)
    json_dict['training'] = [
        {'image': "./imagesTr/%s.nii.gz" % i, "label": "./labelsTr/%s_mask.nii.gz" % i}
        for i in train_identifiers]
    # json_dict['test'] = ["./imagesTs/%s.nii.gz" % i for i in test_identifiers]
    json_dict['test'] = [
        {'image': "./imagesTs/%s.nii.gz" % i, "label": "./labelsTs/%s_mask.nii.gz" % i}
        for i in test_identifiers]

    save_json(json_dict, os.path.join(output_file, "dataset.json"), sort_keys=sort_keys)


if __name__ == "__main__":
    dataset_name = "Task66_Pneumonia"
    output_file = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia'
    imagesTr_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/imagesTr'
    imagesTs_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/imagesTs'
    labelsTr = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/labelsTr'

    modalities = ("CT",)
    labels = {
        "0": "background",
        "1": "Pneumonia"
    }
    generate_dataset_json(output_file,
                          imagesTr_dir,
                          imagesTs_dir,
                          modalities,
                          labels,
                          dataset_name
                          )
    print("finished")
The following environment variables have to be added to ~/.bashrc (in your home directory):
export nnUNet_raw_data_base="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw"
export nnUNet_preprocessed="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_preprocessed"
export RESULTS_FOLDER="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_trained_models"
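After editing .bashrc, run source ~/.bashrc (or open a new terminal) so that the variables take effect.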
3. Converting the dataset
nnUNet is very strict about dataset naming, so the dataset has to be converted to its standard format:
nnUNet_convert_decathlon_task -i /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia
The name consists of a task ID, a three-digit integer, plus an associated task name.
For example, Task066_Pneumonia has the task name "Pneumonia" and the task ID 66.
Within each task folder, the expected structure is:
Task066_Pneumonia/
├── dataset.json
├── imagesTr
├── (imagesTs)
└── labelsTr
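After the conversion, the image files in imagesTr and imagesTs are renamed to end in _0000.nii.gz (the modality index, here CT), which is the naming scheme nnUNet expects for its inputs.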
4. Preprocessing the dataset
This is one of the core parts of nnUNet; 066 is the dataset (task) ID.
nnUNet_plan_and_preprocess -t 066
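On the first run it can be worth appending --verify_dataset_integrity, which, as far as I know, makes nnUNet check the dataset for common formatting problems before planning and preprocessing.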
5. Training
066 is the task ID and 4 selects fold 4 of the 5-fold cross-validation (folds are numbered 0-4):
nnUNet_train 3d_fullres nnUNetTrainerV2 066 4
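To train the full cross-validation, repeat the command for each fold from 0 to 4. If a run is interrupted, it can usually be resumed by adding -c (continue training) to the same command.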
6. Inference
nnUNet_predict -i /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task066_Pneumonia/imagesTs/ -o /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task066_Pneumonia/inferTs -t 066 -m 3d_fullres -f 4
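Here -f 4 restricts prediction to the fold-4 model; if all five folds have been trained, -f can be omitted, in which case nnUNet should automatically ensemble all available folds.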
7. Computing the Dice score
(with a Python script I wrote myself)
python Dice_cal.py
Dice_cal.py
# -*- coding: utf-8 -*-
"""
Created on 2023/9/30 9:40
@author: zhengjie
"""
from glob import glob

import SimpleITK as sitk
import torch


def dice(predict, soft_y):
    """
    Compute the Dice score for each class in predict and soft_y.
    """
    tensor_dim = len(predict.size())
    num_class = list(predict.size())[1]
    if tensor_dim == 5:
        soft_y = soft_y.permute(0, 2, 3, 4, 1)
        predict = predict.permute(0, 2, 3, 4, 1)
    elif tensor_dim == 4:
        soft_y = soft_y.permute(0, 2, 3, 1)
        predict = predict.permute(0, 2, 3, 1)
    else:
        raise ValueError("{0:}D tensor not supported".format(tensor_dim))

    soft_y = torch.reshape(soft_y, (-1, num_class))
    predict = torch.reshape(predict, (-1, num_class))

    y_vol = torch.sum(soft_y, dim=0)
    p_vol = torch.sum(predict, dim=0)
    intersect = torch.sum(soft_y * predict, dim=0)
    dice_score = (2.0 * intersect + 1e-5) / (y_vol + p_vol + 1e-5)
    return dice_score


if __name__ == "__main__":
    infer_path = r"F:\ComputerVision\nnUNet-nnunetv1\nnUNetFrame\DATASET\nnUNet_raw\nnUNet_raw_data\Task066_Pneumonia" \
                 r"\inferTs\*.nii.gz"   # predicted segmentations (only .nii.gz, ignoring plans/postprocessing files)
    label_path = r"F:\ComputerVision\nnUNet-nnunetv1\nnUNetFrame\DATASET\nnUNet_raw\nnUNet_raw_data\Task066_Pneumonia" \
                 r"\labelsTs\*.nii.gz"  # test-set labels

    infer = sorted(glob(infer_path))
    label = sorted(glob(label_path))
    score_avg = 0
    for i in range(len(label)):
        inf, lab = infer[i], label[i]
        inf, lab = sitk.ReadImage(inf, sitk.sitkFloat32), sitk.ReadImage(lab, sitk.sitkFloat32)
        inf, lab = sitk.GetArrayFromImage(inf), sitk.GetArrayFromImage(lab)
        inf, lab = torch.from_numpy(inf), torch.from_numpy(lab)
        inf, lab = inf.unsqueeze(0).unsqueeze(0), lab.unsqueeze(0).unsqueeze(0)
        score = dice(inf, lab)
        print(i, infer[i])
        print(score)
        score_avg += score
    score_avg /= len(label)
    print("avg dice is ", score_avg)
8. Changing the maximum number of epochs
After modifying max_num_epochs you have to rerun pip install . so that the package is reinstalled locally; otherwise the change will not take effect.
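For reference, on the nnunetv1 branch the default lives in nnunet/training/network_training/nnUNetTrainerV2.py, where nnUNetTrainerV2.__init__ sets self.max_num_epochs = 1000; lower that value to shorten training (please verify the exact location against your own checkout).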
The individual parameters can be checked afterwards by inspecting debug.json.
Tips:
Two ways to monitor GPU usage in real time on Ubuntu
1. nvidia-smi
However, this only shows the GPU status at a single point in time. To refresh the display continuously, wrap it in watch:
watch -n 5 nvidia-smi
This refreshes the usage information every 5 seconds; the interval of 5 can be changed as needed. As shown in the figure, the dashed box at the bottom lists the processes occupying the GPU. Each process has a unique PID; to kill one of them, use:
sudo kill -9 PID
2. gpustat
pip install gpustat
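Once installed, running gpustat prints a compact one-line summary per GPU; if I remember correctly it also supports a watch mode via gpustat -i, and in any case it can be combined with watch just like nvidia-smi above.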