- This post implements the Silicon_pathological_region_selection_strategy part of the paper "Ligand-receptor interactions combined with histopathology for improved prognostic modeling in HPV-negative head and neck squamous cell carcinoma".
- For an interpretation of the paper itself, see another blog post: https://blog.csdn.net/qq_44505899/article/details/147482032?spm=1001.2014.3001.5501
- Silicon_pathological_region_selection_strategy annotates pathological regions by clustering (the paper concludes that clusters 0 and 2 represent stromal regions close to the tumor tissue; cluster 1 is tumor tissue; clusters 3 and 4 are classified as stroma and muscle tissue; cluster 5 is classified as the tumor-invasion margin), and then compares the impact of different pathological-region selections on model performance.
The final result looks like this:
1. Silicon_pathological_region_selection_strategy workflow
The workflow of this post is as follows:
Step 1: Cut the WSI into N patches of 512x512
Step 2: For each patch, extract a 1024-dimensional feature with ResNet50
Step 3: Reduce the patch features to 32 dimensions with PCA
Step 4: Cluster the patches with K-means and visualize with t-SNE
Step 5: Map the patches with cluster labels back onto the original WSI
2. Code implementation
Step 1: Cut the WSI into N patches of 512x512
Step 2: For each patch, extract a 1024-dimensional feature with ResNet50
WSI segmentation and patch feature extraction are done with CLAM (its default truncated ResNet50 extractor outputs 1024-dimensional features). The result is shown below: each file corresponds to one WSI and contains a feature matrix of shape (number of patches) x 1024.
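Before running PCA it is worth a quick sanity check that the feature files and coordinate files line up. A minimal sketch (the slide name in the paths is hypothetical; point them at your own CLAM output directory):

import torch
import h5py

# Hypothetical slide name; adjust to your CLAM output directory
pt_path = "../Extracted_feature/pt_files/slide_001.pt"
h5_path = "../Extracted_feature/h5_files/slide_001.h5"

features = torch.load(pt_path)          # feature matrix: (num_patches, feature_dim)
with h5py.File(h5_path, "r") as f:
    coords = f["coords"][:]             # patch coordinates: (num_patches, 2)

print("features:", features.shape)
print("coords:", coords.shape)
assert features.shape[0] == coords.shape[0], "expected one feature vector per patch"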
Step3: Patch特征PCA降维到32维
import shutil
import torch
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

directory_path = "PCA_feature"
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)
    print(f"Directory {directory_path} removed.")
os.makedirs(directory_path)

## Reduce the ResNet50 features of each slide to 32 dimensions with PCA
for file in os.listdir("../Extracted_feature/pt_files/"):
    file_path = '../Extracted_feature/pt_files/' + file
    features = torch.load(file_path)
    features = pd.DataFrame(features.numpy())
    pca = PCA(n_components=32)               # keep 32 principal components
    reduced_x = pca.fit_transform(features)  # project the patch features
    reduced_x = pd.DataFrame(reduced_x)
    # reduced_x = np.dot(reduced_x, pca.components_) + pca.mean_  # reconstruct the original features
    # print("Reduced features:\n", reduced_x.shape, reduced_x)
    output_path = os.path.join(directory_path, os.path.basename(file_path).replace(".pt", ".csv"))
    reduced_x.to_csv(output_path, index=False)

## Merge the per-slide reduced features into a single file
file = os.path.join(directory_path, os.listdir(directory_path)[0])
merge_pca_feature = pd.read_csv(file)
merge_pca_feature["slide_id"] = os.listdir(directory_path)[0].replace(".csv", "")
for file in os.listdir(directory_path)[1:]:
    pca_feature = pd.read_csv(os.path.join(directory_path, file))
    pca_feature["slide_id"] = file.replace(".csv", "")
    merge_pca_feature = pd.concat([merge_pca_feature, pca_feature])
merge_pca_feature.to_csv("pca_feature_in_patch.csv", index=False)
merge_pca_feature.head(2)
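An optional check on how much information the 32 components retain: after the loop above, pca holds the fit of the last slide, so its explained variance ratio can be inspected (repeat inside the loop for a per-slide breakdown). A minimal sketch:

# `pca` still holds the fit of the last slide processed in the loop above
cum_var = np.cumsum(pca.explained_variance_ratio_)
print(f"32 components explain {cum_var[-1]:.1%} of that slide's variance")
plt.figure(figsize=(5, 4))
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()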
Step 4: Cluster the patches with K-means and visualize with t-SNE
- Clustering and choosing the number of clusters
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

df_feature = pd.read_csv("./pca_feature_in_patch.csv")
slide_id = df_feature["slide_id"]
df_feature = df_feature.drop(["slide_id"], axis=1)
scaler = StandardScaler().fit(df_feature.values)
df_feature = pd.DataFrame(scaler.transform(df_feature.values))
df_feature.head(2)

# Evaluate the number of clusters
SSE = []
for cluster in range(1, 20):
    kmeans = KMeans(n_clusters=cluster, init='k-means++')
    kmeans.fit(df_feature)
    SSE.append(kmeans.inertia_)

# Elbow plot: convert the results into a dataframe and plot them
frame = pd.DataFrame({'Cluster': range(1, 20), 'SSE': SSE})
plt.figure(figsize=(6, 5))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
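The elbow plot alone can be ambiguous; as a complementary check (not part of the original paper), the silhouette score can be computed on a random subsample of patches, since its cost grows quadratically with the sample count. A minimal sketch:

from sklearn.metrics import silhouette_score

# Complementary check (not in the paper): silhouette score on a random subsample
rng = np.random.default_rng(42)
idx = rng.choice(len(df_feature), size=min(20000, len(df_feature)), replace=False)
sample = df_feature.iloc[idx]
for k in range(2, 8):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42).fit(sample)
    print(k, silhouette_score(sample, km.labels_))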
- Assign cluster labels
best_cluster = 5
# Re-train the K-Means model with the chosen number of clusters
kmeans_best = KMeans(n_clusters=best_cluster, init='k-means++')
kmeans_best.fit(df_feature)
# Get the cluster label of each patch
labels = kmeans_best.labels_
df_feature['Labels'] = labels
df_feature["slide_id"] = slide_id
df_feature.to_csv("./pca_feature_in_patch_addLabel.csv", index=False)
df_feature.head()
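If the same cluster definitions need to be applied to patches from a new slide later, the fitted scaler and K-Means model can be reused directly. A minimal sketch (the new-slide CSV path is hypothetical; it should be a 32-dimensional PCA feature file like those produced in Step 3):

# Hypothetical new slide: reuse the fitted scaler and K-Means model
new_feature = pd.read_csv("PCA_feature/new_slide.csv")
new_scaled = scaler.transform(new_feature.values)
new_labels = kmeans_best.predict(new_scaled)
print(pd.Series(new_labels).value_counts())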
- Cluster label distribution
import seaborn as sns
import matplotlib.pyplot as plt

# Note: value_counts().reset_index() yields a 'count' column in pandas >= 2.0
df_stat = pd.DataFrame(df_feature['Labels'].value_counts()).reset_index()
# Bar chart of label counts
plt.figure(figsize=(4, 3))
sns.barplot(x='Labels', y='count', data=df_stat, palette='viridis', edgecolor="black")
plt.title('Bar Chart of Label Counts')
plt.show()
- t-SNE visualization
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Split off features and labels
df = df_feature.drop(["slide_id"], axis=1)
features = df.drop('Labels', axis=1).values
labels = df['Labels'].values

# Reduce to 2D with t-SNE
tsne = TSNE(n_components=2, random_state=42)  # fix the random seed for reproducibility
tsne_result = tsne.fit_transform(features)

# Combine the embedding and the labels into a new DataFrame
tsne_df = pd.DataFrame(tsne_result, columns=['x', 'y'])
tsne_df['label'] = labels

# Plot
plt.figure(figsize=(7, 6))
sns.scatterplot(x='x', y='y', hue='label', palette='viridis', data=tsne_df, legend='full')
plt.title('Patch Clustering')
plt.xlabel('t-SNE_1')
plt.ylabel('t-SNE_2')
plt.legend(title='Clusters', frameon=False)
plt.savefig('tsne_kmeans_label.pdf', bbox_inches='tight', pad_inches=0)
plt.show()
# 902,831 patches; the t-SNE visualization takes roughly 40 min
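t-SNE on roughly 900k patches is slow (about 40 minutes above); for a faster look, the embedding can be computed on a random subsample first. A minimal sketch reusing the features and labels arrays defined above:

# Optional: embed a random subsample instead of all patches to cut runtime
n_sample = 50000
rng = np.random.default_rng(42)
idx = rng.choice(len(features), size=min(n_sample, len(features)), replace=False)
tsne_sub = TSNE(n_components=2, random_state=42).fit_transform(features[idx])
sub_df = pd.DataFrame(tsne_sub, columns=['x', 'y'])
sub_df['label'] = labels[idx]
plt.figure(figsize=(7, 6))
sns.scatterplot(x='x', y='y', hue='label', palette='viridis', data=sub_df, legend='full')
plt.show()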
Step 5: Map the patches with cluster labels back onto the original WSI
import h5py
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import os
import shutil

directory_path = "Patch_mapping_WSI"
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)
    print(f"Directory {directory_path} removed.")
os.makedirs(directory_path)

df_label = pd.read_csv("./pca_feature_in_patch_addLabel.csv")
df_label = df_label[["slide_id", "Labels"]]
wsi_lst = list(set(df_label.slide_id))
df_label.head()

for wsi_file in wsi_lst:
    print(wsi_file)
    # Open the h5 file that stores the patch coordinates
    file_path = '../Extracted_feature/h5_files/' + wsi_file + '.h5'  # replace with your h5 file path
    with h5py.File(file_path, 'r') as f:
        # Inspect the file structure to locate the coordinate dataset
        # print("Keys in the h5 file:", list(f.keys()))
        coords = f['coords'][:]  # read the patch coordinates
    print("Coordinates shape:", coords.shape)

    slide_labels = df_label[df_label.slide_id.isin([wsi_file])]
    print(slide_labels.Labels.value_counts())
    labels = slide_labels.Labels.values

    label_colors = {
        0: (236, 130, 114),  # red
        1: (112, 190, 151),  # green
        2: (243, 188, 23),   # yellow
        3: (206, 89, 22),    # orange
        4: (142, 73, 153),   # purple
        # 5: (152, 152, 152)
    }
    # Convert the colors from 0-255 to the 0-1 range (needed for the matplotlib legend)
    label_colors_normalized = {label: (r / 255.0, g / 255.0, b / 255.0) for label, (r, g, b) in label_colors.items()}

    # Create a blank image covering all patch coordinates
    max_x = np.max(coords[:, 0])
    max_y = np.max(coords[:, 1])
    image = Image.new('RGB', (max_x + 256, max_y + 256), (255, 255, 255))  # white background
    pixels = image.load()

    # Draw one colored square per patch
    block_size = 256
    for (x, y), label in zip(coords, labels):
        color = label_colors[label]
        # Square extent centered on the patch coordinate
        x_start = x - block_size // 2
        x_end = x + block_size // 2
        y_start = y - block_size // 2
        y_end = y + block_size // 2
        # Fill the square
        for i in range(x_start, x_end):
            for j in range(y_start, y_end):
                if 0 <= i < image.width and 0 <= j < image.height:  # stay inside the image bounds
                    pixels[i, j] = color

    image_np = np.array(image)
    legend_elements = [Patch(facecolor=color, label=f' {label}') for label, color in label_colors_normalized.items()]
    plt.figure(figsize=(10, 8))
    plt.imshow(image_np)
    plt.axis('off')
    plt.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.1, 1.1), frameon=False)
    outfile = directory_path + "/" + wsi_file + ".png"
    plt.savefig(outfile, bbox_inches='tight', pad_inches=0)
    plt.show()
The output is shown below:
HoVer-Net can additionally be used to annotate the pathological slides at the cell level, which helps interpret the clusters.
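To actually compare region-selection strategies downstream (e.g. keeping only tumor-cluster patches when training the prognostic model), it helps to export one coordinate-to-cluster table per slide. A minimal sketch, relying on the same assumption as the mapping loop above, namely that the rows of df_label for a slide are in the same order as the coordinates in its .h5 file (the output folder name Patch_coord_labels is hypothetical):

# Export one (x, y, cluster) table per slide for later region selection
coord_dir = "Patch_coord_labels"
os.makedirs(coord_dir, exist_ok=True)
for wsi_file in wsi_lst:
    with h5py.File('../Extracted_feature/h5_files/' + wsi_file + '.h5', 'r') as f:
        coords = f['coords'][:]
    slide_labels = df_label[df_label.slide_id.isin([wsi_file])].Labels.values
    out = pd.DataFrame({'x': coords[:, 0], 'y': coords[:, 1], 'cluster': slide_labels})
    out.to_csv(os.path.join(coord_dir, wsi_file + '.csv'), index=False)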