数据预处理prepare_data

梦里梦见梦不见的497

已于 2023-12-16 13:09:46 修改

阅读量361

点赞数 8

于 2023-12-12 19:42:26 首次发布

本文链接：https://blog.csdn.net/weixin_73101319/article/details/134957302

版权

深度学习专栏收录该内容

40 篇文章 0 订阅

订阅专栏

MR乳腺图像的预处理：

单序列的数据预处理：

# 作者:  王文江
# 开发日期: 2023/12/12
import random
import os
import h5py
import pydicom
import numpy as np
import glob
from skimage import exposure

label_set = ["预后良好", "预后不良"]

class MedicalImageDeal:
    def __init__(self, img, path, percent=1):
        self.img = img
        self.percent = percent
        self.p = path

    @property
    def valid_img(self):
        cdf = exposure.cumulative_distribution(self.img)
        if len(cdf[0]) == 0:
            watershed = self.img.min()
        else:
            watershed = cdf[1][cdf[0] >= self.percent][0]
        return np.clip(self.img, self.img.min(), watershed)

    @property
    def norm_img(self):
        return ((self.img - self.img.min()) / (self.img.max() - self.img.min())).astype(np.float32)

    @property
    def width_center1(self):
        image = pydicom.dcmread(self.p).pixel_array
        cdf = exposure.cumulative_distribution(image)
        if len(cdf[0]) == 0:
            return np.zeros_like(image)

        cdf_min = np.min(cdf[1])
        a = np.diff(cdf)
        if len(a) == 0:
            return np.zeros_like(image)
        aa = a[0][500 - cdf_min:1000]
        if len(aa) < 3:
            return np.zeros_like(image)

        aa_max = np.max(aa)
        xx = []
        for i in range(aa.shape[0]):
            if aa[i] > aa_max * 0.005:
                xx.append(i + 500)
        if len(xx) < 3:
            return np.zeros_like(image)

        ds = np.clip(image, xx[0], xx[len(xx) - 3])
        max_val = np.max(ds)
        min_val = np.min(ds)
        if max_val == min_val:
            return np.zeros_like(ds)
        ds = (ds - min_val) / (max_val - min_val)
        return ds
def generate_data(root_data, index, save_dir):
    low_energy_fold = os.listdir(os.path.join(root_data, label_set[index], "T1"))
    r_num = 0
    missing_pixel_data_files = []

    for patient_name in low_energy_fold:
        low_energy_list = glob.glob(os.path.join(root_data, label_set[index], "T1", patient_name, "*.dcm"))
        low_energy_list = sorted(low_energy_list)

        for low_energy_file in low_energy_list:
            dicom_data = pydicom.dcmread(low_energy_file)

            # 检查DICOM文件是否包含像素数据
            if hasattr(dicom_data, 'PixelData') and dicom_data.PixelData is not None:
                image = dicom_data.pixel_array
                low_energy_image = MedicalImageDeal(image, low_energy_file).valid_img

                R = random.randint(1, 100)
                if R >= 0 and R <= 70:
                    subdir = "train"
                elif R > 70 and R <= 90:
                    subdir = "valid"
                else:
                    subdir = "test"

                # 创建子目录（如果不存在）
                subdir_path = os.path.join(save_dir, subdir)
                os.makedirs(subdir_path, exist_ok=True)

                file_name = '{}_{}.h5'.format(patient_name, r_num)
                file_path = os.path.join(subdir_path, file_name)
                with h5py.File(file_path, 'w') as f:
                    f.create_dataset('LOW_ENERGY', data=low_energy_image, compression="gzip")
                    f.create_dataset('label', data=index)

                r_num += 1
                print("Processed {} - UID: {} - Label: {}".format(r_num, patient_name, label_set[index]))
            else:
                missing_pixel_data_files.append(low_energy_file)

    if missing_pixel_data_files:
        print("The following DICOM files are missing pixel data and were skipped:")
        for missing_file in missing_pixel_data_files:
            print(missing_file)


root_data = r"H:\王文江DL"
save_dir = r"I:\代码\MR_fussion\MF\h5py"

# 读取dMMR数据
generate_data(root_data, 0, save_dir)
# 读取pMMR数据
generate_data(root_data, 1, save_dir)

多序列融合的数据预处理：

import random
from skimage import transform
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
import SimpleITK as sitk
import math
import glob
import os
import h5py
import pydicom
import cv2
from skimage.measure import label
# import gdcm

label_set = ["良性","恶性"]

def label_to_num(label):
    return label_set.index(label)
def num_to_label(n):
    return label_set[n]

import random

class MedicalImageDeal(object):
    def __init__(self, img, path, percent=1):
        self.img = img
        self.percent = percent
        self.p = path

    @property
    def valid_img(self):
        from skimage import exposure
        cdf = exposure.cumulative_distribution(self.img)
        watershed = cdf[1][cdf[0] >= self.percent][0]
        return np.clip(self.img, self.img.min(), watershed)

    @property
    def norm_img(self):
        return ((self.img - self.img.min()) / (self.img.max() - self.img.min())).astype(np.float32)

    @property
    def width_center1(self):
        from skimage import exposure
        image = pydicom.read_file(self.p).pixel_array
        cdf = exposure.cumulative_distribution(image)
        cdf_min = np.min(cdf[1])
        a = np.diff(cdf)
        aa = a[0][500 - cdf_min:1000]
        aa_max = np.max(aa)
        xx = []
        for i in range(aa.shape[0]):
            if aa[i] > aa_max * 0.005:
                xx.append(i + 500)
        ds = np.clip(image, xx[0], xx[len(xx) - 3])
        max = np.max(ds)
        min = np.min(ds)
        ds = (ds-min)/(max-min)
        return ds

def generate_data(root_data,index,save_dir):
    low_energy_fold = os.listdir(os.path.join(root_data,label_set[index],"T1"))
    high_energy_fold = os.listdir(os.path.join(root_data,label_set[index],"T2"))
    enhance_fold = os.listdir(os.path.join(root_data,label_set[index],"enhance"))
    
    patient_names = low_energy_fold
    images = []
    labs = []
    n = 0
    for i in range(len(patient_names)):  # len(patient_names)
        low_energy_list = glob.glob(os.path.join(root_data,label_set[index],"T1",patient_names[i],"*.dcm"))
        high_energy_list = glob.glob(os.path.join(root_data,label_set[index],"T2",patient_names[i],"*.dcm"))
        enhance_list = glob.glob(os.path.join(root_data,label_set[index],"enhance",patient_names[i],"*.dcm"))
        
        low_energy_list = sorted(low_energy_list)
        high_energy_list = sorted(high_energy_list)
        enhance_list = sorted(enhance_list)
        
        for j in range(len(low_energy_list)):
            item = {"low_energy":low_energy_list[j],
                    "high_energy":high_energy_list[j],
                    "enhance":enhance_list[j]}
            images.append(item)
            labs.append(index)
        r_num = 0
    for image_file,lab in zip(images,labs):
        (flepath, flename) = os.path.split(image_file["low_energy"])
        (rename, suffix) = os.path.splitext(flename)
        uid = rename
        low_energy_image = MedicalImageDeal(pydicom.dcmread(image_file["low_energy"]),
                                    image_file["low_energy"]).width_center1
        high_energy_image = MedicalImageDeal(pydicom.dcmread(image_file["high_energy"]),
                                    image_file["high_energy"]).width_center1
        enhance_image = MedicalImageDeal(pydicom.dcmread(image_file["enhance"]),
                                    image_file["enhance"]).width_center1
        R = random.randint(1, 100)
        if R >= 0 and R <= 70:
            f = h5py.File(save_dir+"/train" + '/{}_{}.h5'.format(uid, r_num), 'w')
        elif R > 70 and R <= 90:
            f = h5py.File(save_dir+"/valid" + '/{}_{}.h5'.format(uid, r_num), 'w')
        else:
            f = h5py.File(save_dir+"/test" + '/{}_{}.h5'.format(uid, r_num), 'w')
        f.create_dataset('LOW_ENERGY', data=low_energy_image, compression="gzip")
        f.create_dataset('HIGH_ENERGY', data=high_energy_image, compression="gzip")
        f.create_dataset('ENHANCE', data=enhance_image, compression="gzip")
        f.create_dataset('label', data=lab)
        f.close()
        r_num += 1
        print("process {} uid = {} label={}".format(r_num, uid,lab))

root_data = r"F:\300多的乳腺MR图像"
save_dir = r"F:\乳腺癌数据代码\MF\h5py"

# 读取恶性数据
generate_data(root_data,0,save_dir)
# 读取良性数据
generate_data(root_data,1,save_dir)

以上是处理MR乳腺癌的预处理过程。用于CT的肺并不适用，现在修改了代码，用于处理肺CT肺窗的图像预处理。加入了均值化。去掉了width_center1。

肺CT肺窗图像的预处理

import random
from skimage import transform
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
import SimpleITK as sitk
import math
import glob
import os
import h5py
import pydicom
import cv2
from skimage.measure import label

# import gdcm

label_set = ["预后良好", "预后不良"]


def label_to_num(label):
    return label_set.index(label)


def num_to_label(n):
    return label_set[n]


import random


class MedicalImageDeal(object):
    def __init__(self, img, path, percent=1):
        self.img = img
        self.percent = percent
        self.p = path

    @property
    def valid_img(self):
        from skimage import exposure
        cdf = exposure.cumulative_distribution(self.img)
        watershed = cdf[1][cdf[0] >= self.percent][0]
        return np.clip(self.img, self.img.min(), watershed)

    @property
    def norm_img(self):
        return ((self.img - self.img.min()) / (self.img.max() - self.img.min())).astype(np.float32)


    @property
    def Histogram_equalization(self):
        from skimage import exposure
        # Read the image
        img = pydicom.dcmread(self.p).pixel_array
        # Perform histogram equalization
        img_res = exposure.equalize_hist(img)
        return img_res

def generate_data(root_data, index, save_dir):
    low_energy_fold = os.listdir(os.path.join(root_data, label_set[index], "T1"))
    high_energy_fold = os.listdir(os.path.join(root_data, label_set[index], "T2"))
    enhance_fold = os.listdir(os.path.join(root_data, label_set[index], "enhance"))

    patient_names = low_energy_fold
    images = []
    labs = []
    n = 0
    for i in range(len(patient_names)):  # len(patient_names)
        low_energy_list = glob.glob(os.path.join(root_data, label_set[index], "T1", patient_names[i], "*.dcm"))
        high_energy_list = glob.glob(os.path.join(root_data, label_set[index], "T2", patient_names[i], "*.dcm"))
        enhance_list = glob.glob(os.path.join(root_data, label_set[index], "enhance", patient_names[i], "*.dcm"))

        low_energy_list = sorted(low_energy_list)
        high_energy_list = sorted(high_energy_list)
        enhance_list = sorted(enhance_list)

        for j in range(len(low_energy_list)):
            item = {"low_energy": low_energy_list[j],
                    "high_energy": high_energy_list[j],
                    "enhance": enhance_list[j]}
            images.append(item)
            labs.append(index)
        r_num = 0
    for image_file, lab in zip(images, labs):
        (flepath, flename) = os.path.split(image_file["low_energy"])
        (rename, suffix) = os.path.splitext(flename)
        uid = rename
        low_energy_image = MedicalImageDeal(pydicom.dcmread(image_file["low_energy"]),
                                            image_file["low_energy"]).Histogram_equalization
        high_energy_image = MedicalImageDeal(pydicom.dcmread(image_file["high_energy"]),
                                             image_file["high_energy"]).Histogram_equalization
        enhance_image = MedicalImageDeal(pydicom.dcmread(image_file["enhance"]),
                                         image_file["enhance"]).Histogram_equalization
        R = random.randint(1, 100)
        if R >= 0 and R <= 70:
            f = h5py.File(save_dir + "/train" + '/{}_{}.h5'.format(uid, r_num), 'w')
        elif R > 70 and R <= 90:
            f = h5py.File(save_dir + "/valid" + '/{}_{}.h5'.format(uid, r_num), 'w')
        else:
            f = h5py.File(save_dir + "/test" + '/{}_{}.h5'.format(uid, r_num), 'w')
        f.create_dataset('LOW_ENERGY', data=low_energy_image, compression="gzip")
        f.create_dataset('HIGH_ENERGY', data=high_energy_image, compression="gzip")
        f.create_dataset('ENHANCE', data=enhance_image, compression="gzip")
        f.create_dataset('label', data=lab)
        f.close()
        r_num += 1
        print("process {} uid = {} label={}".format(r_num, uid, lab))


root_data = r"F:\王文江DL"
save_dir = r"F:\h5py"

# 读取恶性数据
generate_data(root_data, 0, save_dir)
# 读取良性数据
generate_data(root_data, 1, save_dir)