3D Pneumonia Segmentation with nnUNetv1
This is a model I ran for a national undergraduate innovation project last September. It worked quite well, so here is a brief review and write-up.
GitHub: MIC-DKFZ/nnUNet, nnunetv1 branch (github.com)
1. Preparing the dataset
First create a folder named DATASET inside the nnUNetFrame folder to hold the data.
Inside DATASET, create three folders: nnUNet_raw, nnUNet_preprocessed, and nnUNet_trained_models, as shown in the figure.
Then go into nnUNet_raw and create two folders, nnUNet_cropped_data and nnUNet_raw_data: nnUNet_raw_data holds the original data, while nnUNet_cropped_data holds the data after cropping.
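The resulting directory layout is:
DATASET/
├── nnUNet_raw/
│   ├── nnUNet_cropped_data/
│   └── nnUNet_raw_data/
├── nnUNet_preprocessed/
└── nnUNet_trained_models/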
Inside nnUNet_raw_data, create a folder named Task66_Pneumonia (the initial name does not matter much, because the nnUNet_convert_decathlon_task command will later convert it to the standard form Task066_Pneumonia).
My raw data are nii.gz CT volumes organized in directories named after the pathogen type, with file names built from patient ID, date, series, and so on. I therefore wrote a script, dataset_process.py, that splits the data into a training set (imagesTr, labelsTr) and a test set (imagesTs, labelsTs). The code is as follows:
# -*- coding: utf-8 -*-
"""
Created on 2023/9/25 15:47
@author: zhengjie
"""
import glob
import os
import os.path as osp
import shutil

import nibabel
import numpy as np
from sklearn.model_selection import train_test_split

root_dir = '/x32001067/pneumonia'
print("Processing data from {0}".format(root_dir))

# output directory (the task folder created above)
output_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia'
os.makedirs(output_dir, exist_ok=True)

# new subdirectories
output_imagesTr = osp.join(output_dir, 'imagesTr')
output_imagesTs = osp.join(output_dir, 'imagesTs')
output_labelsTr = osp.join(output_dir, 'labelsTr')
output_labelsTs = osp.join(output_dir, 'labelsTs')

# create them if they do not exist
os.makedirs(output_imagesTr, exist_ok=True)
os.makedirs(output_imagesTs, exist_ok=True)
os.makedirs(output_labelsTr, exist_ok=True)
os.makedirs(output_labelsTs, exist_ok=True)

im_list = []
# collect every .nii.gz file, recursing into all subfolders
nii_files = glob.glob(osp.join(root_dir, '**', '*.nii.gz'), recursive=True)
for nii_file in nii_files:
    im_file = nii_file.replace('_mask', '')  # image volume
    mask_file = nii_file                     # segmentation mask
    pid = osp.basename(im_file).split('_')[0]  # patient ID
    clinic_file = osp.join(osp.dirname(im_file), pid + '_clinics.xlsx')
    ct_observ_file = osp.join(osp.dirname(im_file), pid + '_ct_obervations.xlsx')

    # prepend the pneumonia type (the parent folder name) to the file name
    file_name = osp.basename(im_file)
    dir_name = osp.dirname(im_file)
    parts = dir_name.split('/')  # split on the path separator
    folder_name = parts[-1]
    im_file_name = f'{folder_name}_{file_name}'

    file_name = osp.basename(mask_file)
    dir_name = osp.dirname(mask_file)
    parts = dir_name.split('/')  # split on the path separator
    folder_name = parts[-1]
    mask_file_name = f'{folder_name}_{file_name}'

    info = (im_file, mask_file, clinic_file, ct_observ_file, im_file_name, mask_file_name)

    # skip empty files
    if os.path.getsize(im_file) == 0 or os.path.getsize(mask_file) == 0:
        print("{0} File is empty.".format(im_file))
        continue
    # skip files that cannot be opened
    try:
        volume = nibabel.load(im_file).get_fdata()
    except Exception as e:
        print("{0} File is broken.".format(im_file))
        continue
    mask = nibabel.load(mask_file).get_fdata()
    mask_array = np.array(mask)
    # keep only binary masks with at least 100 foreground voxels
    # (in practice this also skips the plain image files, whose maximum intensity exceeds 1)
    if np.max(mask_array) == 0 or np.max(mask_array) > 1 or np.count_nonzero(mask_array) < 100:
        print("{0} mask file is incompatible.".format(im_file))
        continue
    im_list.append(info)

print('Total samples:', len(im_list))

# split into training and test sets
train_size = 0.8  # 80% for training
train_list, test_list = train_test_split(im_list, train_size=train_size, random_state=42)
for info in train_list:
    shutil.copy2(info[0], osp.join(output_imagesTr, info[-2]))
    shutil.copy2(info[1], osp.join(output_labelsTr, info[-1]))
for info in test_list:
    shutil.copy2(info[0], osp.join(output_imagesTs, info[-2]))
    shutil.copy2(info[1], osp.join(output_labelsTs, info[-1]))
print("Train samples:", len(train_list))
print("Test samples:", len(test_list))
print('finished')
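After this script runs, imagesTr and imagesTs contain the CT volumes renamed to <pathogen-folder>_<original-file-name>.nii.gz, and labelsTr and labelsTs contain the matching ..._mask.nii.gz masks. (The clinical and CT-observation spreadsheets are only located here, not copied.)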
2. Generating the dataset.json file
Part of the code has to be adapted to your own paths and labels.
Run utils.py:
# -*- coding: utf-8 -*-
"""
Created on 2023/9/26 10:02
@author: zhengjie
"""
import os
from typing import Tuple

import numpy as np
from batchgenerators.utilities.file_and_folder_operations import *


def get_identifiers_from_splitted_files(folder: str):
    uniques = np.unique([i[:-7] for i in subfiles(folder, suffix='.nii.gz', join=False)])
    return uniques


def generate_dataset_json(output_file: str, imagesTr_dir: str, imagesTs_dir: str, modalities: Tuple,
                          labels: dict, dataset_name: str, sort_keys=True, license: str = "hands off!",
                          dataset_description: str = "",
                          dataset_reference="", dataset_release='0.0'):
    """
    :param output_file: path of the task folder (the one containing the imagesTr and labelsTr
        subfolders); dataset.json will be written into it
    :param imagesTr_dir: path to the imagesTr folder of that dataset
    :param imagesTs_dir: path to the imagesTs folder of that dataset. Can be None
    :param modalities: tuple of strings with modality names. Must be in the same order as the images (first entry
        corresponds to _0000.nii.gz, etc). Example: ('T1', 'T2', 'FLAIR').
    :param labels: dict with int->str (key->value) mapping the label IDs to label names. Note that 0 is always
        supposed to be background! Example: {0: 'background', 1: 'edema', 2: 'enhancing tumor'}
    :param dataset_name: the name of the dataset. Can be anything you want
    :param sort_keys: whether to sort the keys in dataset.json
    :param license:
    :param dataset_description:
    :param dataset_reference: website of the dataset, if available
    :param dataset_release:
    :return:
    """
    train_identifiers = get_identifiers_from_splitted_files(imagesTr_dir)

    if imagesTs_dir is not None:
        test_identifiers = get_identifiers_from_splitted_files(imagesTs_dir)
    else:
        test_identifiers = []

    json_dict = {}
    json_dict['name'] = dataset_name
    json_dict['description'] = dataset_description
    json_dict['tensorImageSize'] = "3D"
    json_dict['reference'] = dataset_reference
    json_dict['licence'] = license
    json_dict['release'] = dataset_release
    json_dict['modality'] = {str(i): modalities[i] for i in range(len(modalities))}
    json_dict['labels'] = {str(i): labels[i] for i in labels.keys()}

    json_dict['numTraining'] = len(train_identifiers)
    json_dict['numTest'] = len(test_identifiers)
    json_dict['training'] = [
        {'image': "./imagesTr/%s.nii.gz" % i, "label": "./labelsTr/%s_mask.nii.gz" % i}
        for i in train_identifiers]
    # json_dict['test'] = ["./imagesTs/%s.nii.gz" % i for i in test_identifiers]
    json_dict['test'] = [
        {'image': "./imagesTs/%s.nii.gz" % i, "label": "./labelsTs/%s_mask.nii.gz" % i}
        for i in test_identifiers]

    save_json(json_dict, os.path.join(output_file, "dataset.json"), sort_keys=sort_keys)


if __name__ == "__main__":
    dataset_name = "Task66_Pneumonia"
    output_file = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia'
    imagesTr_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/imagesTr'
    imagesTs_dir = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/imagesTs'
    labelsTr = '/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia/labelsTr'

    modalities = ("CT",)
    labels = {
        "0": "background",
        "1": "Pneumonia"
    }
    generate_dataset_json(output_file,
                          imagesTr_dir,
                          imagesTs_dir,
                          modalities,
                          labels,
                          dataset_name
                          )
    print("finished")
The following environment variables have to be added to ~/.bashrc (in your home directory):
export nnUNet_raw_data_base="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw"
export nnUNet_preprocessed="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_preprocessed"
export RESULTS_FOLDER="/x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_trained_models"
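After editing .bashrc, run source ~/.bashrc (or open a new terminal) so that the variables take effect.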
3. Converting the dataset
nnUNet is very strict about dataset naming, so the dataset has to be converted to its standard format:
nnUNet_convert_decathlon_task -i /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task66_Pneumonia
The name consists of a task ID, a three-digit integer, plus an associated task name.
For example, Task066_Pneumonia has the task name "Pneumonia" and the task ID 66.
Within each task folder, the expected structure is:
Task066_Pneumonia/
├── dataset.json
├── imagesTr
├── (imagesTs)
└── labelsTr
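After the conversion, the image files in imagesTr and imagesTs are renamed to end in _0000.nii.gz (the modality index, here CT), which is the naming scheme nnUNet expects for its inputs.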
4. Preprocessing the dataset
This is one of the core parts of nnUNet; 066 is the dataset (task) ID.
nnUNet_plan_and_preprocess -t 066
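On the first run it can be worth appending --verify_dataset_integrity, which, as far as I know, makes nnUNet check the dataset for common formatting problems before planning and preprocessing.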
5. Training
066 is the task ID and 4 selects fold 4 of the 5-fold cross-validation (folds are numbered 0-4):
nnUNet_train 3d_fullres nnUNetTrainerV2 066 4
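To train the full cross-validation, repeat the command for each fold from 0 to 4. If a run is interrupted, it can usually be resumed by adding -c (continue training) to the same command.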
6. Inference
nnUNet_predict -i /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task066_Pneumonia/imagesTs/ -o /x32001067/nnUNetv1/nnUNetFrame/DATASET/nnUNet_raw/nnUNet_raw_data/Task066_Pneumonia/inferTs -t 066 -m 3d_fullres -f 4
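Here -f 4 restricts prediction to the fold-4 model; if all five folds have been trained, -f can be omitted, in which case nnUNet should automatically ensemble all available folds.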
7. Computing the Dice score
(with a Python script I wrote myself)
python Dice_cal.py
Dice_cal.py
# -*- coding: utf-8 -*-
"""
Created on 2023/9/30 9:40
@author: zhengjie
"""
from glob import glob

import SimpleITK as sitk
import torch


def dice(predict, soft_y):
    """
    Compute the Dice score for each class in predict and soft_y.
    """
    tensor_dim = len(predict.size())
    num_class = list(predict.size())[1]
    if tensor_dim == 5:
        soft_y = soft_y.permute(0, 2, 3, 4, 1)
        predict = predict.permute(0, 2, 3, 4, 1)
    elif tensor_dim == 4:
        soft_y = soft_y.permute(0, 2, 3, 1)
        predict = predict.permute(0, 2, 3, 1)
    else:
        raise ValueError("{0:}D tensor not supported".format(tensor_dim))

    soft_y = torch.reshape(soft_y, (-1, num_class))
    predict = torch.reshape(predict, (-1, num_class))

    y_vol = torch.sum(soft_y, dim=0)
    p_vol = torch.sum(predict, dim=0)
    intersect = torch.sum(soft_y * predict, dim=0)
    dice_score = (2.0 * intersect + 1e-5) / (y_vol + p_vol + 1e-5)
    return dice_score


if __name__ == "__main__":
    infer_path = r"F:\ComputerVision\nnUNet-nnunetv1\nnUNetFrame\DATASET\nnUNet_raw\nnUNet_raw_data\Task066_Pneumonia" \
                 r"\inferTs\*.nii.gz"   # predicted segmentations (only .nii.gz, ignoring plans/postprocessing files)
    label_path = r"F:\ComputerVision\nnUNet-nnunetv1\nnUNetFrame\DATASET\nnUNet_raw\nnUNet_raw_data\Task066_Pneumonia" \
                 r"\labelsTs\*.nii.gz"  # test-set labels

    infer = sorted(glob(infer_path))
    label = sorted(glob(label_path))
    score_avg = 0
    for i in range(len(label)):
        inf, lab = infer[i], label[i]
        inf, lab = sitk.ReadImage(inf, sitk.sitkFloat32), sitk.ReadImage(lab, sitk.sitkFloat32)
        inf, lab = sitk.GetArrayFromImage(inf), sitk.GetArrayFromImage(lab)
        inf, lab = torch.from_numpy(inf), torch.from_numpy(lab)
        inf, lab = inf.unsqueeze(0).unsqueeze(0), lab.unsqueeze(0).unsqueeze(0)
        score = dice(inf, lab)
        print(i, infer[i])
        print(score)
        score_avg += score
    score_avg /= len(label)
    print("avg dice is ", score_avg)
8. Changing the maximum number of epochs
After modifying max_num_epochs you have to rerun pip install . so that the package is reinstalled locally; otherwise the change will not take effect.
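For reference, on the nnunetv1 branch the default lives in nnunet/training/network_training/nnUNetTrainerV2.py, where nnUNetTrainerV2.__init__ sets self.max_num_epochs = 1000; lower that value to shorten training (please verify the exact location against your own checkout).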
The individual parameters can be checked afterwards by inspecting debug.json.
Tips:
Two ways to monitor GPU usage in real time on Ubuntu
1. nvidia-smi
However, this only shows the GPU status at a single point in time. To refresh the display continuously, wrap it in watch:
watch -n 5 nvidia-smi
This refreshes the usage information every 5 seconds; the interval of 5 can be changed as needed. As shown in the figure, the dashed box at the bottom lists the processes occupying the GPU. Each process has a unique PID; to kill one of them, use:
sudo kill -9 PID
2. gpustat
pip install gpustat
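Once installed, running gpustat prints a compact one-line summary per GPU; if I remember correctly it also supports a watch mode via gpustat -i, and in any case it can be combined with watch just like nvidia-smi above.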