(CNS复现)CLAM——Chapter_02

(CNS复现)CLAM——Chapter_02

CLAM: A Deep-Learning-based Pipeline for Data Efficient and Weakly Supervised Whole-Slide-level Analysis


前言

(CNS复现)CLAM——Chapter_00

(CNS复现)CLAM——Chapter_01

(CNS复现)CLAM——Chapter_02

(CNS复现)CLAM——Chapter_03

这一部分主要是针对数据预处理的了解 create_patches_fp.py 中获取 patch 的方法和具体流程

该步骤的目的是讲WSI分割成固定大小的patch,将这些patch作为一个WSI的候选特征图。

这样就解决了超大的WSI图片无法训练的问题

Step-01: imports

# imports
# 这一部分都是基础包,不需要讲解
import os
import numpy as np
import time
import pdb
import cv2
import matplotlib.pyplot as plt  
plt.rcParams['axes.unicode_minus'] = False
import pandas as pd
import warnings

# my imports
from wsi_core.batch_process_utils import initialize_df # 这个包的作用是将输入的参数转换为EXCEL,方便记录
from wsi_core.WholeSlideImage import WholeSlideImage # WSI对象,里面的功能到时候一一讲解
from wsi_core.wsi_utils import StitchCoords # 这个是一个可视化工具

# other options
warnings.filterwarnings("ignore")

Step-02:初始化配置信息

这个配置内容基本上和上次的教程完全一样,就不复制了

# 初始化配置信息
# 元路径
save_dir = '/media/yuansh/14THHD/CLAM/Result'
source = '/media/yuansh/14THHD/CLAM/DataSet/LUAD'

# 输出文件路径
patch_save_dir = os.path.join(save_dir, 'patches') # patch文件,以.H5形式保存,其中记录了各个patch 的四个坐标的位置
mask_save_dir = os.path.join(save_dir, 'masks') # mask文件,即去除背景色后的jpg文件,可以显著减少image大小
stitch_save_dir = os.path.join(save_dir, 'stitches') # 可视化文件

directories = {'source': source,
               'save_dir': save_dir,
               'patch_save_dir': patch_save_dir,
               'mask_save_dir': mask_save_dir,
               'stitch_save_dir': stitch_save_dir}

# 判断目标文件夹是否存在,如果不存在则创建文件夹
for key, val in directories.items():
    print("{} : {}".format(key, val))
    if key not in ['source']:
        os.makedirs(val, exist_ok=True)

# 设置基本参数
process_list = None # 一个文本文件,里面存放各个数据的处理参数
patch_size = 256 # 每个patch 的大小
step_size = 256 # 每个patch搜索框的步长,当与patch size相同的时候每个patch没有交集
patch_level = 0 # 下采样水平,水平越高,下采样程度越大,数据大小越小
use_default_params = False # 是否使用初始化参数
seg = True # 是否切割图片
save_mask = True # 保存切割后的数据
stitch = True # 是否对输出的patch数据可视化
patch = False # 是否输出patch文件
auto_skip = False # 是否跳过已经处理的数据

# 操作参数
seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
              'keep_ids': 'none', 'exclude_ids': 'none'}
filter_params = {'a_t': 100, 'a_h': 16, 'max_n_holes': 8}
vis_params = {'vis_level': -1, 'line_thickness': 500}
patch_params = {'use_padding': True, 'contour_fn': 'four_pt'}
parameters = {'seg_params': seg_params,
              'filter_params': filter_params,
              'patch_params': patch_params,
              'vis_params': vis_params}
# 如果有参数表,则输入参数表
if process_list:
    process_list = os.path.join(save_dir, process_list)

else:
    process_list = None
source : /media/yuansh/14THHD/CLAM/DataSet/LUAD
save_dir : /media/yuansh/14THHD/CLAM/Result
patch_save_dir : /media/yuansh/14THHD/CLAM/Result/patches
mask_save_dir : /media/yuansh/14THHD/CLAM/Result/masks
stitch_save_dir : /media/yuansh/14THHD/CLAM/Result/stitches

Step-03:调试代码

这里我们需要参考 WholeSlideImage 这个文件

这一次主要调试的对象是 WSI_object.process_contours 该部分主要由两函数构成:

  1. process_contours

  2. process_contour

怕篇幅占的太多,具体的函数内容就不展示了

Step-03.01 初始化配置文件

上次讲过,这次不讲了

注意,有些 False 的代码块我直接删掉,太占篇幅

# 初始化配置文件 & 函数
slides = sorted(os.listdir(source))
slides = [slide for slide in slides if os.path.isfile(
    os.path.join(source, slide))]
if process_list is None:
    df = initialize_df(slides, seg_params, filter_params,
                       vis_params, patch_params)
else:
    df = pd.read_csv(process_list)
    df = initialize_df(df, seg_params, filter_params,
                       vis_params, patch_params)


mask = df['process'] == 1
process_stack = df[mask]

total = len(process_stack)


def segment(WSI_object, seg_params, filter_params):
    # Start Seg Timer
    start_time = time.time()

    # Segment
    WSI_object.segmentTissue(**seg_params, filter_params=filter_params)

    # Stop Seg Timers
    seg_time_elapsed = time.time() - start_time
    return WSI_object, seg_time_elapsed


def patching(WSI_object, **kwargs):
    # Start Patch Timer
    start_time = time.time()

    # Patch
    file_path = WSI_object.process_contours(**kwargs)

    # Stop Patch Timer
    patch_time_elapsed = time.time() - start_time
    return file_path, patch_time_elapsed
Step-03.02 初始化WSI

注意,有些 False 的代码块我直接删掉,太占篇幅

# 初始化WSI对象
i = 50 # 输入索引
idx = process_stack.index[i] # 获取索引
slide = process_stack.loc[idx, 'slide_id'] # 获取slide id
print("\n\nprogress: {:.2f}, {}/{}".format(i/total, i, total))
print('processing {}'.format(slide))

df.loc[idx, 'process'] = 0 # 正在处理第 i 个文件
slide_id, _ = os.path.splitext(slide) # 获取id前缀

# 跳过已经处理过的数据,我这里点False
if auto_skip and os.path.isfile(os.path.join(patch_save_dir, slide_id + '.h5')):
    print('{} already exist in destination location, skipped'.format(slide_id))
    df.loc[idx, 'status'] = 'already_exist'
#    continue

# 初始化WSI对象
# 基本属性包括:
#         1. 样本名
#         2. wsi对象
#         3. level_dim 下采样对应维度
#         4. 可下采样水平
full_path = os.path.join(source, slide)
WSI_object = WholeSlideImage(full_path)
legacy_support = False

if use_default_params:
    current_vis_params = vis_params.copy()
    current_filter_params = filter_params.copy()
    current_seg_params = seg_params.copy()
    current_patch_params = patch_params.copy()

else:
    current_vis_params = {}
    current_filter_params = {}
    current_seg_params = {}
    current_patch_params = {}

    for key in vis_params.keys():
        if legacy_support and key == 'vis_level':
            df.loc[idx, key] = -1
        current_vis_params.update({key: df.loc[idx, key]})

    for key in filter_params.keys():
        if legacy_support and key == 'a_t':
            old_area = df.loc[idx, 'a']
            seg_level = df.loc[idx, 'seg_level']
            scale = WSI_object.level_downsamples[seg_level]
            adjusted_area = int(
                old_area * (scale[0] * scale[1]) / (512 * 512))
            current_filter_params.update({key: adjusted_area})
            df.loc[idx, key] = adjusted_area
        current_filter_params.update({key: df.loc[idx, key]})

    for key in seg_params.keys():
        if legacy_support and key == 'seg_level':
            df.loc[idx, key] = -1
        current_seg_params.update({key: df.loc[idx, key]})

    for key in patch_params.keys():
        current_patch_params.update({key: df.loc[idx, key]})

        
        
        
if current_vis_params['vis_level'] < 0:
    if len(WSI_object.level_dim) == 1:
        current_vis_params['vis_level'] = 0

    else:
        wsi = WSI_object.getOpenSlide()
        best_level = wsi.get_best_level_for_downsample(64)
        current_vis_params['vis_level'] = best_level

if current_seg_params['seg_level'] < 0:
    if len(WSI_object.level_dim) == 1:
        current_seg_params['seg_level'] = 0

    else:
        wsi = WSI_object.getOpenSlide()
        best_level = wsi.get_best_level_for_downsample(64)
        current_seg_params['seg_level'] = best_level  

keep_ids = str(current_seg_params['keep_ids'])
if keep_ids != 'none' and len(keep_ids) > 0:
    str_ids = current_seg_params['keep_ids']
    current_seg_params['keep_ids'] = np.array(
        str_ids.split(',')).astype(int)
else:
    current_seg_params['keep_ids'] = []

exclude_ids = str(current_seg_params['exclude_ids'])
if exclude_ids != 'none' and len(exclude_ids) > 0:
    str_ids = current_seg_params['exclude_ids']
    current_seg_params['exclude_ids'] = np.array(
        str_ids.split(',')).astype(int)
else:
    current_seg_params['exclude_ids'] = []

w, h = WSI_object.level_dim[current_seg_params['seg_level']]
if w * h > 1e8:
    print(
        'level_dim {} x {} is likely too large for successful segmentation, aborting'.format(w, h))
    df.loc[idx, 'status'] = 'failed_seg'
#    continue

df.loc[idx, 'vis_level'] = current_vis_params['vis_level']
df.loc[idx, 'seg_level'] = current_seg_params['seg_level']
WSI_object.level_dim
WSI_object.level_downsamples
use_default_params
legacy_support

current_seg_params
current_filter_params

这一部分我们已经复现出来了,所以直接运行就可以

WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params) 
mask = WSI_object.visWSI(**current_vis_params)
mask

在这里插入图片描述

从上图可以看出,这个wsi有三个轮廓(绿色)

Step-03.03 初始化分割patch参数

这里主要涉及到WSI类方法process_contoursprocess_contour

在下面的代码中,我把两部分合并了

还有运用到了一个patch的判定方法 isInContourV3_Easy

# 查看配置信息
current_patch_params.update({
    'patch_level': patch_level,
    'patch_size': patch_size,
    'step_size': step_size,
    'save_path': patch_save_dir
})
current_patch_params

初始化输入数据,若输入参数在字典中则使用字典中的参数否则使用默认参数

# 现在进行类方法调试
# 初始化输入数据
# 若输入参数在字典中则使用字典中的参数否则使用默认参数
use_padding= True
contour_fn= 'four_pt'
patch_level= 0
patch_size= 256
step_size= 256
save_path= '/media/yuansh/14THHD/CLAM/Result/patches'
bot_right=None
top_left=None
contour_holes = WSI_object.holes_tissue[0]

# import
import math
import multiprocessing as mp

# my imports
from wsi_core.util_classes import isInContourV3_Easy # 用于判定patchs 是否属于 轮廓内
from wsi_core.wsi_utils import  save_hdf5
Step-03.04 patch 分割步骤解读

着一个步骤,代码实现起来明显比上一个步骤简单,但是讲解的话用文字太过于抽象,所以我弄了个大概的图:

  1. 获取所有的候选轮廓在这里插入图片描述

  2. 获取轮廓的bbox在这里插入图片描述

  3. 由于有些轮廓处在图像的边缘,且bbox本身就应该要比图像边界略大,所以很有可能bbox会超出图像的大小在这里插入图片描述

  4. 这时候的解决方案一般有两种

    • 直接padding image
    • 是直接裁减到image 的边缘
      在这里插入图片描述
  5. 对bbox内的实例对象进行裁减分割成固定大小的小patch

    最后对patch进行筛选,并不是所有的patch都能使用,判定的表中一共有4种(上一期讲过),比如下面的代码使用的 四角简单判别法 。各个patch的四个角,只要有其中一个角落到轮廓内,则该patch可被使用。如上图:1,2,6 patch 没有被使用,3,4,5这些patch被纳入使用
    具体代码如下

save_path_hdf5 = os.path.join(save_path, str(WSI_object.name) + '.h5')
print("Creating patches for: ", WSI_object.name, "...",)
elapsed = time.time()
n_contours = len(WSI_object.contours_tissue) # 获取轮廓数量
print("Total number of contours to process: ", n_contours)

fp_chunk_size = math.ceil(n_contours * 0.05) # 每5% 输出一次进度

init = True
for idx, cont in enumerate(WSI_object.contours_tissue):
    if (idx + 1) % fp_chunk_size == fp_chunk_size:
        print('Processing contour {}/{}'.format(idx, n_contours))
    # 获取轮廓的直角坐标
    # 如果没有轮廓,则使用整张image
    start_x, start_y, w, h = cv2.boundingRect(cont) if cont is not None else (
        0, 0, WSI_object.level_dim[patch_level][0], WSI_object.level_dim[patch_level][1])
    # 下采样
    patch_downsample = (int(WSI_object.level_downsamples[patch_level][0]), int(
        WSI_object.level_downsamples[patch_level][1]))
    # patch 的尺寸
    ref_patch_size = (
        patch_size*patch_downsample[0], patch_size*patch_downsample[1])
    # 获取原始Image的大小
    img_w, img_h = WSI_object.level_dim[0]
    # 如果对图片进行padding操作的话,则终止点就是bbox的终止点
    # 这里注意以下,之所以这么做的原因是,如果组织块所处的位置比较边缘,那么bbox可能会超出图像大小
    # 所以要对bbox进行矫正
    # 矫正方法有两种
    # 第一种直接padding image
    # 第二种 是直接裁减到image 的边缘
    if use_padding:
        stop_y = start_y+h
        stop_x = start_x+w
    else:
        stop_y = min(start_y+h, img_h-ref_patch_size[1]+1)
        stop_x = min(start_x+w, img_w-ref_patch_size[0]+1)

    print("Bounding Box:", start_x, start_y, w, h)
    print("Contour Area:", cv2.contourArea(cont))
    
    # 如果有给出默认的左上角或者右下角的坐标的话,就对image 进行矫正
    if bot_right is not None:
        stop_y = min(bot_right[1], stop_y)
        stop_x = min(bot_right[0], stop_x)
    if top_left is not None:
        start_y = max(top_left[1], start_y)
        start_x = max(top_left[0], start_x)

    if bot_right is not None or top_left is not None:
        w, h = stop_x - start_x, stop_y - start_y
        if w <= 0 or h <= 0:
            print("Contour is not in specified ROI, skip")
            asset_dict = {}
            attr_dict = {}
        else:
            print("Adjusted Bounding Box:", start_x, start_y, w, h)
            
    # 根据不同的筛选标准过滤掉轮廓外的patch
    if isinstance(contour_fn, str):
        if contour_fn == 'four_pt':
            cont_check_fn = isInContourV3_Easy(
                contour=cont, patch_size=ref_patch_size[0], center_shift=0.5)
        elif contour_fn == 'four_pt_hard':
            cont_check_fn = isInContourV3_Hard(
                contour=cont, patch_size=ref_patch_size[0], center_shift=0.5)
        elif contour_fn == 'center':
            cont_check_fn = isInContourV2(
                contour=cont, patch_size=ref_patch_size[0])
        elif contour_fn == 'basic':
            cont_check_fn = isInContourV1(contour=cont)
        else:
            raise NotImplementedError
    else:
        assert isinstance(contour_fn, Contour_Checking_fn)
        cont_check_fn = contour_fn
        
    # 获取bbox的起始位置和终止位置后,根据步长提取patch
    step_size_x = step_size * patch_downsample[0]
    step_size_y = step_size * patch_downsample[1]

    x_range = np.arange(start_x, stop_x, step=step_size_x)
    y_range = np.arange(start_y, stop_y, step=step_size_y)
    
    # 生成patch的bbox坐标矩阵
    x_coords, y_coords = np.meshgrid(x_range, y_range, indexing='ij')
    coord_candidates = np.array(
        [x_coords.flatten(), y_coords.flatten()]).transpose()
    
    # 多线程处理 (这里可以根据自己的电脑性能进行获取)
    num_workers = mp.cpu_count()
    if num_workers > 4:
        num_workers = 4
    pool = mp.Pool(num_workers)

    iterable = [(coord, contour_holes, ref_patch_size[0], cont_check_fn)
                for coord in coord_candidates]
    results = pool.starmap(WholeSlideImage.process_coord_candidate, iterable)
    pool.close()
    results = np.array([result for result in results if result is not None])

    print('Extracted {} coordinates'.format(len(results)))
    
    # 最后输出patch 的属性和patch
    # patch的属性其中包括:
    # patch 的大小
    # patch 的下采样水平
    # 下采样后原始图片的大小
    # 样本名
    # 保存路径
    #  patch 的坐标信息
    if len(results) > 1:
        asset_dict = {'coords':          results}

        attr = {'patch_size':            patch_size,  # To be considered...
                'patch_level':           patch_level,
                'downsample':             WSI_object.level_downsamples[patch_level],
                'downsampled_level_dim': tuple(np.array(WSI_object.level_dim[patch_level])),
                'level_dim':              WSI_object.level_dim[patch_level],
                'name':                   WSI_object.name,
                'save_path':              save_path}

        attr_dict = {'coords': attr}

    else:
        asset_dict = {}
        attr_dict = {}

    if len(asset_dict) > 0:
        if init:
            save_hdf5(save_path_hdf5, asset_dict, attr_dict, mode='w')
            init = False
        else:
            save_hdf5(save_path_hdf5, asset_dict, mode='a')

最后所有的数据均以 .h5 形式保存。

这里的话想必大家也发现了一个重要的问题,就是由于每一个WSI的大小是不一样的,因此patch的个数也不一样。如果构建的模型要有通用性的情况,那么必须保证每一个WSI的特征图的个数是一样的(卷积操作与通道(特征)个数相同),所以显然要在做一次数据的处理,进一步的提取特征图,在下一章节将对这一部分进行讲解。

如果我的博客您经过深度思考后仍然看不懂,可以根据以下方式联系我:

Best Regards,
Yuan.SH
---------------------------------------
School of Basic Medical Sciences,
Fujian Medical University,
Fuzhou, Fujian, China.
please contact with me via the following ways:
(a) e-mail :yuansh3354@163.com
  • 4
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值