(CNS复现)CLAM——Chapter_02
CLAM: A Deep-Learning-based Pipeline for Data Efficient and Weakly Supervised Whole-Slide-level Analysis
文章目录
前言
这一部分主要是针对数据预处理的了解 create_patches_fp.py
中获取 patch
的方法和具体流程
该步骤的目的是将WSI分割成固定大小的patch,将这些patch作为一个WSI的候选特征图。
这样就解决了超大的WSI图片无法训练的问题
Step-01: imports
# imports
# Standard-library and third-party packages used throughout this walkthrough.
import os
import numpy as np
import time
import pdb
import cv2
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False
import pandas as pd
import warnings
# my imports
from wsi_core.batch_process_utils import initialize_df # builds a table (DataFrame) of the run parameters for record keeping
from wsi_core.WholeSlideImage import WholeSlideImage # the WSI wrapper object; its methods are walked through below
from wsi_core.wsi_utils import StitchCoords # stitching / visualisation helper
# other options
warnings.filterwarnings("ignore")
Step-02:初始化配置信息
这个配置内容基本上和上次的教程完全一样,就不复制了
# --- Basic configuration ---
# Root paths.
save_dir = '/media/yuansh/14THHD/CLAM/Result'
source = '/media/yuansh/14THHD/CLAM/DataSet/LUAD'
# Output locations:
#   patches  -> per-slide .h5 files recording the corner coordinates of each patch
#   masks    -> background-removed jpg files (much smaller than the raw WSI)
#   stitches -> visualisations of the extracted patches
patch_save_dir = os.path.join(save_dir, 'patches')
mask_save_dir = os.path.join(save_dir, 'masks')
stitch_save_dir = os.path.join(save_dir, 'stitches')
directories = {
    'source': source,
    'save_dir': save_dir,
    'patch_save_dir': patch_save_dir,
    'mask_save_dir': mask_save_dir,
    'stitch_save_dir': stitch_save_dir,
}
# Create every output directory that does not exist yet
# (the source directory is only printed, never created).
for name, path in directories.items():
    print("{} : {}".format(name, path))
    if name not in ['source']:
        os.makedirs(path, exist_ok=True)
# Basic switches.
process_list = None         # optional CSV listing per-slide processing parameters
patch_size = 256            # edge length of each patch
step_size = 256             # sliding-window stride; equal to patch_size -> no overlap
patch_level = 0             # pyramid level to patch at (higher level -> more downsampling, smaller data)
use_default_params = False  # use the default parameter dicts as-is
seg = True                  # run tissue segmentation
save_mask = True            # save the segmentation mask
stitch = True               # save a stitched visualisation of the patches
patch = False               # write the patch file
auto_skip = False           # skip slides that were already processed
# Operation parameters.
seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
              'keep_ids': 'none', 'exclude_ids': 'none'}
filter_params = {'a_t': 100, 'a_h': 16, 'max_n_holes': 8}
vis_params = {'vis_level': -1, 'line_thickness': 500}
patch_params = {'use_padding': True, 'contour_fn': 'four_pt'}
parameters = {'seg_params': seg_params,
              'filter_params': filter_params,
              'patch_params': patch_params,
              'vis_params': vis_params}
# Resolve the parameter CSV to a full path when one is given.
process_list = os.path.join(save_dir, process_list) if process_list else None
source : /media/yuansh/14THHD/CLAM/DataSet/LUAD
save_dir : /media/yuansh/14THHD/CLAM/Result
patch_save_dir : /media/yuansh/14THHD/CLAM/Result/patches
mask_save_dir : /media/yuansh/14THHD/CLAM/Result/masks
stitch_save_dir : /media/yuansh/14THHD/CLAM/Result/stitches
Step-03:调试代码
这里我们需要参考 WholeSlideImage
这个文件
这一次主要调试的对象是 WSI_object.process_contours
该部分主要由两个函数构成:
-
process_contours
-
process_contour
怕篇幅占的太多,具体的函数内容就不展示了
Step-03.01 初始化配置文件
上次讲过,这次不讲了
注意,有些 False
的代码块我直接删掉,太占篇幅
# Build the list of slide files and the bookkeeping DataFrame.
# Directories inside `source` are ignored; only plain files count as slides.
slides = sorted(
    entry for entry in os.listdir(source)
    if os.path.isfile(os.path.join(source, entry))
)
if process_list is None:
    # No parameter CSV: initialise every slide with the default params.
    df = initialize_df(slides, seg_params, filter_params,
                       vis_params, patch_params)
else:
    # Parameter CSV given: initialise from its per-slide rows.
    df = initialize_df(pd.read_csv(process_list), seg_params, filter_params,
                       vis_params, patch_params)
# Slides whose 'process' flag is 1 still need processing.
mask = df['process'] == 1
process_stack = df[mask]
total = len(process_stack)
def segment(WSI_object, seg_params, filter_params):
    """Run tissue segmentation on a WSI object and time it.

    Returns the (mutated) WSI object and the elapsed wall-clock seconds.
    """
    t0 = time.time()
    WSI_object.segmentTissue(**seg_params, filter_params=filter_params)
    return WSI_object, time.time() - t0
def patching(WSI_object, **kwargs):
    """Extract patch coordinates from a segmented WSI and time it.

    Returns the path of the produced patch file and the elapsed seconds.
    """
    t0 = time.time()
    h5_path = WSI_object.process_contours(**kwargs)
    return h5_path, time.time() - t0
Step-03.02 初始化WSI
注意,有些 False
的代码块我直接删掉,太占篇幅
# Initialise the WSI object for one slide (the notebook demo picks a single index).
i = 50 # position within process_stack
idx = process_stack.index[i] # DataFrame index of that slide
slide = process_stack.loc[idx, 'slide_id'] # slide file name
print("\n\nprogress: {:.2f}, {}/{}".format(i/total, i, total))
print('processing {}'.format(slide))
df.loc[idx, 'process'] = 0 # mark slide i as being processed
slide_id, _ = os.path.splitext(slide) # file name without extension
# Optionally skip slides whose patch file already exists (auto_skip is False here,
# and `continue` is commented out because this is a flattened loop body).
if auto_skip and os.path.isfile(os.path.join(patch_save_dir, slide_id + '.h5')):
    print('{} already exist in destination location, skipped'.format(slide_id))
    df.loc[idx, 'status'] = 'already_exist'
    # continue
# Build the WSI object. Its basic attributes include:
# 1. sample name
# 2. the wsi handle
# 3. level_dim: image dimensions at each downsample level
# 4. the available downsample levels
full_path = os.path.join(source, slide)
WSI_object = WholeSlideImage(full_path)
legacy_support = False  # support for a legacy CSV format; off in this walkthrough
if use_default_params:
    # Use the hard-coded default parameter dicts directly.
    current_vis_params = vis_params.copy()
    current_filter_params = filter_params.copy()
    current_seg_params = seg_params.copy()
    current_patch_params = patch_params.copy()
else:
    # Build per-slide parameters from this slide's DataFrame row.
    current_vis_params = {}
    current_filter_params = {}
    current_seg_params = {}
    current_patch_params = {}
    for key in vis_params.keys():
        if legacy_support and key == 'vis_level':
            df.loc[idx, key] = -1  # legacy rows: force auto level selection
        current_vis_params.update({key: df.loc[idx, key]})
    for key in filter_params.keys():
        if legacy_support and key == 'a_t':
            # Rescale the legacy area threshold to the current seg level.
            old_area = df.loc[idx, 'a']
            seg_level = df.loc[idx, 'seg_level']
            scale = WSI_object.level_downsamples[seg_level]
            adjusted_area = int(
                old_area * (scale[0] * scale[1]) / (512 * 512))
            current_filter_params.update({key: adjusted_area})
            df.loc[idx, key] = adjusted_area
        current_filter_params.update({key: df.loc[idx, key]})
    for key in seg_params.keys():
        if legacy_support and key == 'seg_level':
            df.loc[idx, key] = -1  # legacy rows: force auto level selection
        current_seg_params.update({key: df.loc[idx, key]})
    for key in patch_params.keys():
        current_patch_params.update({key: df.loc[idx, key]})
# Resolve vis_level / seg_level: -1 means "auto-pick" the best level for a
# 64x downsample (or level 0 when the slide pyramid has a single level).
if current_vis_params['vis_level'] < 0:
    if len(WSI_object.level_dim) == 1:
        current_vis_params['vis_level'] = 0
    else:
        wsi = WSI_object.getOpenSlide()
        best_level = wsi.get_best_level_for_downsample(64)
        current_vis_params['vis_level'] = best_level
if current_seg_params['seg_level'] < 0:
    if len(WSI_object.level_dim) == 1:
        current_seg_params['seg_level'] = 0
    else:
        wsi = WSI_object.getOpenSlide()
        best_level = wsi.get_best_level_for_downsample(64)
        current_seg_params['seg_level'] = best_level
# Parse keep_ids / exclude_ids: 'none' (or empty) means no contour filtering;
# otherwise a comma-separated string of contour ids is turned into an int array.
keep_ids = str(current_seg_params['keep_ids'])
if keep_ids != 'none' and len(keep_ids) > 0:
    str_ids = current_seg_params['keep_ids']
    current_seg_params['keep_ids'] = np.array(
        str_ids.split(',')).astype(int)
else:
    current_seg_params['keep_ids'] = []
exclude_ids = str(current_seg_params['exclude_ids'])
if exclude_ids != 'none' and len(exclude_ids) > 0:
    str_ids = current_seg_params['exclude_ids']
    current_seg_params['exclude_ids'] = np.array(
        str_ids.split(',')).astype(int)
else:
    current_seg_params['exclude_ids'] = []
# Abort when the chosen seg level is still too large to segment (> 1e8 pixels).
# (`continue` is commented out because this is a flattened loop body.)
w, h = WSI_object.level_dim[current_seg_params['seg_level']]
if w * h > 1e8:
    print(
        'level_dim {} x {} is likely too large for successful segmentation, aborting'.format(w, h))
    df.loc[idx, 'status'] = 'failed_seg'
    # continue
# Record the resolved levels back into the bookkeeping DataFrame.
df.loc[idx, 'vis_level'] = current_vis_params['vis_level']
df.loc[idx, 'seg_level'] = current_seg_params['seg_level']
# Notebook echoes for inspection only:
WSI_object.level_dim
WSI_object.level_downsamples
use_default_params
legacy_support
current_seg_params
current_filter_params
这一部分我们已经复现出来了,所以直接运行就可以
# Run tissue segmentation, then render the slide with the detected contours drawn on it.
WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params)
mask = WSI_object.visWSI(**current_vis_params)  # image with tissue contours overlaid
mask  # notebook echo
从上图可以看出,这个wsi有三个轮廓(绿色)
Step-03.03 初始化分割patch参数
这里主要涉及到WSI类方法process_contours
和 process_contour
在下面的代码中,我把两部分合并了
还有运用到了一个patch的判定方法 isInContourV3_Easy
# Merge the patching arguments into the per-slide parameter dict.
current_patch_params['patch_level'] = patch_level   # pyramid level to patch at
current_patch_params['patch_size'] = patch_size     # patch edge length
current_patch_params['step_size'] = step_size       # sliding-window stride
current_patch_params['save_path'] = patch_save_dir  # .h5 output directory
current_patch_params  # notebook echo
初始化输入数据,若输入参数在字典中则使用字典中的参数否则使用默认参数
# Now step through the class method manually.
# Initialise the inputs: values from the params dict when present,
# otherwise the defaults.
use_padding= True
contour_fn= 'four_pt'
patch_level= 0
patch_size= 256
step_size= 256
save_path= '/media/yuansh/14THHD/CLAM/Result/patches'
bot_right=None
top_left=None
# NOTE(review): only the holes of contour 0 are taken here, but the loop below
# reuses them for every contour — upstream CLAM pairs each contour with its own
# holes; confirm against wsi_core.WholeSlideImage.process_contours.
contour_holes = WSI_object.holes_tissue[0]
# import
import math
import multiprocessing as mp
# my imports
from wsi_core.util_classes import isInContourV3_Easy # predicate: is a patch inside the contour?
from wsi_core.wsi_utils import save_hdf5
Step-03.04 patch 分割步骤解读
这一个步骤,代码实现起来明显比上一个步骤简单,但是讲解的话用文字太过于抽象,所以我弄了个大概的图:
-
获取所有的候选轮廓
-
获取轮廓的bbox
-
由于有些轮廓处在图像的边缘,且bbox本身就应该要比图像边界略大,所以很有可能bbox会超出图像的大小
-
这时候的解决方案一般有两种
- 直接padding image
- 是直接裁减到image 的边缘
-
对bbox内的实例对象进行裁减分割成固定大小的小patch
最后对patch进行筛选,并不是所有的patch都能使用,判定的表中一共有4种(上一期讲过),比如下面的代码使用的四角简单判别法
。各个patch的四个角,只要有其中一个角落到轮廓内,则该patch可被使用。如上图:1,2,6 patch 没有被使用,3,4,5这些patch被纳入使用
具体代码如下
# Extract patch coordinates from every tissue contour and save them to HDF5.
# This is a flattened, annotated version of WholeSlideImage.process_contours /
# process_contour from the CLAM pipeline.
save_path_hdf5 = os.path.join(save_path, str(WSI_object.name) + '.h5')
print("Creating patches for: ", WSI_object.name, "...",)
elapsed = time.time()
n_contours = len(WSI_object.contours_tissue)  # number of candidate tissue contours
print("Total number of contours to process: ", n_contours)
fp_chunk_size = math.ceil(n_contours * 0.05)  # report progress roughly every 5%
init = True  # first write opens the HDF5 file in 'w' mode; later writes append
for idx, cont in enumerate(WSI_object.contours_tissue):
    # BUGFIX: the original test `(idx + 1) % fp_chunk_size == fp_chunk_size`
    # can never be true (a modulo result is always smaller than the modulus),
    # so the progress line was never printed. `== 0` fires every ~5%.
    if (idx + 1) % fp_chunk_size == 0:
        print('Processing contour {}/{}'.format(idx, n_contours))
    # Bounding rectangle of the contour; with no contour, fall back to the
    # whole image at the chosen patch level.
    start_x, start_y, w, h = cv2.boundingRect(cont) if cont is not None else (
        0, 0, WSI_object.level_dim[patch_level][0], WSI_object.level_dim[patch_level][1])
    # Downsample factors of the chosen patch level relative to level 0.
    patch_downsample = (int(WSI_object.level_downsamples[patch_level][0]), int(
        WSI_object.level_downsamples[patch_level][1]))
    # Patch size expressed in level-0 reference coordinates.
    ref_patch_size = (
        patch_size*patch_downsample[0], patch_size*patch_downsample[1])
    # Size of the slide at level 0.
    img_w, img_h = WSI_object.level_dim[0]
    # A contour near the slide border can have a bbox extending past the image.
    # Two correction strategies:
    #   1. pad the image (keep the raw bbox end points), or
    #   2. clip the bbox so every patch stays fully inside the image.
    if use_padding:
        stop_y = start_y+h
        stop_x = start_x+w
    else:
        stop_y = min(start_y+h, img_h-ref_patch_size[1]+1)
        stop_x = min(start_x+w, img_w-ref_patch_size[0]+1)
    print("Bounding Box:", start_x, start_y, w, h)
    print("Contour Area:", cv2.contourArea(cont))
    # Optional ROI clamp when an explicit top-left / bottom-right is given
    # (both are None in this walkthrough).
    if bot_right is not None:
        stop_y = min(bot_right[1], stop_y)
        stop_x = min(bot_right[0], stop_x)
    if top_left is not None:
        start_y = max(top_left[1], start_y)
        start_x = max(top_left[0], start_x)
    if bot_right is not None or top_left is not None:
        w, h = stop_x - start_x, stop_y - start_y
        if w <= 0 or h <= 0:
            print("Contour is not in specified ROI, skip")
            # BUGFIX: skip this contour entirely (upstream CLAM returns empty
            # dicts here); previously execution fell through and patches were
            # still extracted for a contour outside the ROI.
            continue
        else:
            print("Adjusted Bounding Box:", start_x, start_y, w, h)
    # Build the patch-in-contour predicate.
    # NOTE(review): only isInContourV3_Easy ('four_pt') is imported in this
    # walkthrough; the other branches (isInContourV3_Hard / isInContourV2 /
    # isInContourV1 / Contour_Checking_fn) would raise NameError if reached.
    if isinstance(contour_fn, str):
        if contour_fn == 'four_pt':
            cont_check_fn = isInContourV3_Easy(
                contour=cont, patch_size=ref_patch_size[0], center_shift=0.5)
        elif contour_fn == 'four_pt_hard':
            cont_check_fn = isInContourV3_Hard(
                contour=cont, patch_size=ref_patch_size[0], center_shift=0.5)
        elif contour_fn == 'center':
            cont_check_fn = isInContourV2(
                contour=cont, patch_size=ref_patch_size[0])
        elif contour_fn == 'basic':
            cont_check_fn = isInContourV1(contour=cont)
        else:
            raise NotImplementedError
    else:
        assert isinstance(contour_fn, Contour_Checking_fn)
        cont_check_fn = contour_fn
    # Candidate patch origins: walk the bbox with the given step size
    # (in level-0 reference coordinates).
    step_size_x = step_size * patch_downsample[0]
    step_size_y = step_size * patch_downsample[1]
    x_range = np.arange(start_x, stop_x, step=step_size_x)
    y_range = np.arange(start_y, stop_y, step=step_size_y)
    # Coordinate grid of every candidate patch's top-left corner.
    x_coords, y_coords = np.meshgrid(x_range, y_range, indexing='ij')
    coord_candidates = np.array(
        [x_coords.flatten(), y_coords.flatten()]).transpose()
    # Filter the candidates in parallel (worker count capped at 4;
    # adjust to your machine).
    num_workers = mp.cpu_count()
    if num_workers > 4:
        num_workers = 4
    pool = mp.Pool(num_workers)
    iterable = [(coord, contour_holes, ref_patch_size[0], cont_check_fn)
                for coord in coord_candidates]
    results = pool.starmap(WholeSlideImage.process_coord_candidate, iterable)
    pool.close()
    results = np.array([result for result in results if result is not None])
    print('Extracted {} coordinates'.format(len(results)))
    # Collect the surviving coordinates plus their metadata:
    # patch size, patch level, downsample factors, level dimensions,
    # sample name, and the save path.
    # NOTE(review): `> 1` matches the original snippet — a contour yielding
    # exactly one coordinate is discarded; confirm against upstream CLAM.
    if len(results) > 1:
        asset_dict = {'coords': results}
        attr = {'patch_size': patch_size,  # To be considered...
                'patch_level': patch_level,
                'downsample': WSI_object.level_downsamples[patch_level],
                'downsampled_level_dim': tuple(np.array(WSI_object.level_dim[patch_level])),
                'level_dim': WSI_object.level_dim[patch_level],
                'name': WSI_object.name,
                'save_path': save_path}
        attr_dict = {'coords': attr}
    else:
        asset_dict = {}
        attr_dict = {}
    # Write (or append) this contour's coordinates to the slide's .h5 file.
    if len(asset_dict) > 0:
        if init:
            save_hdf5(save_path_hdf5, asset_dict, attr_dict, mode='w')
            init = False
        else:
            save_hdf5(save_path_hdf5, asset_dict, mode='a')
最后所有的数据均以 .h5
形式保存。
这里的话想必大家也发现了一个重要的问题,就是由于每一个WSI的大小是不一样的,因此patch的个数也不一样。如果构建的模型要有通用性的情况,那么必须保证每一个WSI的特征图的个数是一样的(卷积操作与通道(特征)个数相同),所以显然要在做一次数据的处理,进一步的提取特征图,在下一章节将对这一部分进行讲解。
如果我的博客您经过深度思考后仍然看不懂,可以根据以下方式联系我:
Best Regards,
Yuan.SH
---------------------------------------
School of Basic Medical Sciences,
Fujian Medical University,
Fuzhou, Fujian, China.
please contact with me via the following ways:
(a) e-mail :yuansh3354@163.com