1. Thesis: ResNet-50 Model — Main Code
# Bottleneck residual block used by the ResNet-50 model.
class Bottleneck(nn.Module):
    def __init__(self, in_channels, filters, stride=1):
        """1x1 -> 3x3 -> 1x1 bottleneck with an optional projection shortcut.

        Args:
            in_channels: channels of the incoming feature map.
            filters: (F1, F2, F3) — output channels of the three convolutions.
            stride: stride of the first 1x1 convolution (spatial downsampling).
        """
        super(Bottleneck, self).__init__()
        self.stride = stride
        self.in_channels = in_channels
        reduce_ch, mid_ch, expand_ch = filters
        self.out_channels = expand_ch
        # Main path: 1x1 reduce -> 3x3 -> 1x1 expand, BatchNorm after each conv.
        self.block = nn.Sequential(
            nn.Conv2d(in_channels, reduce_ch, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(reduce_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduce_ch, mid_ch, 3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(mid_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_ch, expand_ch, 1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(expand_ch),
        )
        # 1x1 projection shortcut: matches channel count / spatial size when needed.
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=expand_ch, kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(expand_ch),
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.block(x)
        # Project the identity branch whenever the shapes would not match.
        shortcut = x
        if self.stride != 1 or self.in_channels != self.out_channels:
            shortcut = self.downsample(x)
        out += shortcut
        return self.relu(out)
# ResNet-50 style classifier: conv stem followed by four bottleneck stages.
class Resnet50(nn.Module):
    def __init__(self, n_class):
        """Build the network.

        Args:
            n_class: number of output classes.
        """
        super(Resnet50, self).__init__()
        # Stage 1: 7x7 conv stem + max pool (overall 4x spatial downsampling).
        self.stage1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(3, 2, padding=1),
        )
        # Stage 2: three bottlenecks, 256 output channels.
        self.stage2 = nn.Sequential(
            Bottleneck(64, [64, 64, 256], stride=1),
            Bottleneck(256, [64, 64, 256]),
            Bottleneck(256, [64, 64, 256])
        )
        # Stage 3: four bottlenecks, 512 output channels.
        self.stage3 = nn.Sequential(
            Bottleneck(256, [128, 128, 512], stride=2),
            Bottleneck(512, [128, 128, 512]),
            Bottleneck(512, [128, 128, 512]),
            Bottleneck(512, [128, 128, 512])
        )
        # Stage 4: four bottlenecks, 1024 output channels.
        # BUG FIX: the last two blocks used [128, 128, 512], which shrank the
        # stage output to 512 channels and crashed stage5 (expects 1024).
        self.stage4 = nn.Sequential(
            Bottleneck(512, [256, 256, 1024], stride=2),
            Bottleneck(1024, [256, 256, 1024]),
            Bottleneck(1024, [256, 256, 1024]),
            Bottleneck(1024, [256, 256, 1024])
        )
        # Stage 5: three bottlenecks, 2048 output channels.
        self.stage5 = nn.Sequential(
            Bottleneck(1024, [512, 512, 2048], stride=2),
            Bottleneck(2048, [512, 512, 2048]),
            Bottleneck(2048, [512, 512, 2048])
        )
        # Global average pool to 1x1, then a linear classifier head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Linear(2048, n_class)
        )

    def forward(self, X):
        """Return class logits of shape (batch, n_class).

        The per-stage debug `print(out.shape)` calls were removed.
        """
        out = self.stage1(X)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        out = self.stage5(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), 2048)  # flatten (batch, 2048, 1, 1)
        out = self.fc(out)
        return out
2. Thesis: Model-A — Main Code
# Model-A residual block: like the ResNet bottleneck, but the 3x3 conv is
# grouped (groups=F1) and carries the stride instead of the first 1x1 conv.
class Bottleneck_A(nn.Module):
    def __init__(self, in_Channels, filters, stride=1):
        """Args:
            in_Channels: channels of the incoming feature map.
            filters: (F1, F2, F3) output channels; requires F2 % F1 == 0
                because the middle conv uses groups=F1.
            stride: stride of the 3x3 convolution (spatial downsampling).
        """
        super(Bottleneck_A, self).__init__()
        self.stride = stride
        self.in_channels = in_Channels
        F1, F2, F3 = filters
        self.out_channels = F3
        self.block = nn.Sequential(
            nn.Conv2d(in_channels=self.in_channels, out_channels=F1, kernel_size=1, bias=False),
            nn.BatchNorm2d(num_features=F1),
            nn.ReLU(inplace=True),
            # Grouped 3x3 convolution: groups=F1, stride applied here.
            nn.Conv2d(in_channels=F1, out_channels=F2, kernel_size=3, stride=self.stride, padding=1, groups=F1, bias=False),
            nn.BatchNorm2d(num_features=F2),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=F2, out_channels=F3, kernel_size=1, bias=False),
            nn.BatchNorm2d(num_features=F3),
        )
        # Projection shortcut for shape-changing blocks.
        self.downsample_A = nn.Sequential(
            nn.Conv2d(in_channels=in_Channels, out_channels=F3, kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(num_features=F3),
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        identity = x
        out = self.block(x)
        # BUG FIX: the original read `self.in_Channels`, but the attribute is
        # stored as `self.in_channels` — every forward pass raised AttributeError.
        if self.stride != 1 or self.in_channels != self.out_channels:
            identity = self.downsample_A(x)
        out += identity
        out = self.relu(out)
        return out
# Model-A: same stage layout as ResNet-50, built from Bottleneck_A blocks.
class Model_A(nn.Module):
    def __init__(self, n_class):
        """Args:
            n_class: number of output classes.
        """
        super(Model_A, self).__init__()
        # Stage 1: 7x7 conv stem + max pool.
        self.stage1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2, padding=1)
        )
        # Stage 2: three blocks, 256 output channels.
        self.stage2 = nn.Sequential(
            Bottleneck_A(64, [64, 64, 256], stride=1),
            Bottleneck_A(256, [64, 64, 256]),
            Bottleneck_A(256, [64, 64, 256])
        )
        # Stage 3: four blocks, 512 output channels.
        self.stage3 = nn.Sequential(
            Bottleneck_A(256, [128, 128, 512], stride=2),
            Bottleneck_A(512, [128, 128, 512]),
            Bottleneck_A(512, [128, 128, 512]),
            Bottleneck_A(512, [128, 128, 512])
        )
        # Stage 4: four blocks, 1024 output channels.
        self.stage4 = nn.Sequential(
            Bottleneck_A(512, [256, 256, 1024], stride=2),
            Bottleneck_A(1024, [256, 256, 1024]),
            Bottleneck_A(1024, [256, 256, 1024]),
            Bottleneck_A(1024, [256, 256, 1024])
        )
        # Stage 5: three blocks, 2048 output channels.
        self.stage5 = nn.Sequential(
            Bottleneck_A(1024, [512, 512, 2048], stride=2),
            Bottleneck_A(2048, [512, 512, 2048]),
            Bottleneck_A(2048, [512, 512, 2048])
        )
        # Global average pool to 1x1, then the classifier head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, n_class)

    def forward(self, X):
        """Return class logits of shape (batch, n_class).

        The per-stage debug `print(out.shape)` calls were removed.
        """
        out = self.stage1(X)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        out = self.stage5(out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)  # flatten (batch, 2048, 1, 1)
        out = self.fc(out)
        return out
3. Thesis: Model-B — Main Code
# Residual bottleneck block used by Model-B (same wiring as the ResNet block).
class Bottleneck_B(nn.Module):
    def __init__(self, in_Channels, filters, stride=1):
        """Args:
            in_Channels: incoming channels.
            filters: (F1, F2, F3) output channels of the three convolutions.
            stride: stride of the first 1x1 convolution.
        """
        super(Bottleneck_B, self).__init__()
        self.stride = stride
        self.in_channels = in_Channels
        reduce_ch, mid_ch, expand_ch = filters
        self.out_channels = expand_ch
        # Main branch: 1x1 (strided) -> 3x3 -> 1x1, each followed by BatchNorm.
        layers = [
            nn.Conv2d(in_Channels, reduce_ch, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(reduce_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduce_ch, mid_ch, 3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(mid_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_ch, expand_ch, 1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(expand_ch)
        ]
        self.block = nn.Sequential(*layers)
        # 1x1 projection for the shortcut when the output shape changes.
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels=in_Channels, out_channels=expand_ch,
                      kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(expand_ch)
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        needs_projection = self.stride != 1 or self.in_channels != self.out_channels
        residual = self.downsample(x) if needs_projection else x
        out = self.block(x)
        out += residual
        return self.relu(out)
# Model-B: ResNet-50 style network built from Bottleneck_B blocks.
class Model_B(nn.Module):
    def __init__(self, n_class):
        """Args:
            n_class: number of output classes.
        """
        super(Model_B, self).__init__()
        # Stage 1: 7x7 conv stem + max pool for early feature extraction.
        self.stage1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(3, 2, padding=1),
        )
        # Stage 2: three bottlenecks, 256 output channels.
        self.stage2 = nn.Sequential(
            Bottleneck_B(64, [64, 64, 256], stride=1),
            Bottleneck_B(256, [64, 64, 256]),
            Bottleneck_B(256, [64, 64, 256])
        )
        # Stage 3: four bottlenecks, 512 output channels.
        self.stage3 = nn.Sequential(
            Bottleneck_B(256, [128, 128, 512], stride=2),
            Bottleneck_B(512, [128, 128, 512]),
            Bottleneck_B(512, [128, 128, 512]),
            Bottleneck_B(512, [128, 128, 512])
        )
        # Stage 4: four bottlenecks, 1024 output channels.
        self.stage4 = nn.Sequential(
            Bottleneck_B(512, [256, 256, 1024], stride=2),
            Bottleneck_B(1024, [256, 256, 1024]),
            Bottleneck_B(1024, [256, 256, 1024]),
            Bottleneck_B(1024, [256, 256, 1024])
        )
        # Stage 5: three bottlenecks, 2048 output channels.
        self.stage5 = nn.Sequential(
            Bottleneck_B(1024, [512, 512, 2048], stride=2),
            Bottleneck_B(2048, [512, 512, 2048]),
            Bottleneck_B(2048, [512, 512, 2048])
        )
        # Global average pool to 1x1, then the classifier head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Linear(2048, n_class)
        )

    def forward(self, X):
        """Return class logits of shape (batch, n_class).

        The per-stage debug `print(out.shape)` calls were removed.
        """
        out = self.stage1(X)
        out = self.stage2(out)
        out = self.stage3(out)
        out = self.stage4(out)
        out = self.stage5(out)
        out = self.avgpool(out)  # -> (batch, 2048, 1, 1)
        out = out.view(out.size(0), 2048)
        out = self.fc(out)
        return out
4. Thesis: Model-C — Main Code
# Squeeze-and-Excitation gate: produces per-channel weights from globally
# pooled features; the caller multiplies the gates into its feature map.
class SE_module(nn.Module):
    def __init__(self, in_channels, hidden=128):
        """Args:
            in_channels: number of channels to gate.
            hidden: width of the two hidden FC layers (default 128 matches the
                previously hard-coded value; now generalizable).
        """
        super(SE_module, self).__init__()
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Linear(in_channels, hidden)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden, hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden, in_channels)
        self.relu3 = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return channel gates of shape (batch, in_channels, 1, 1).

        NOTE(review): the ReLU before the sigmoid restricts the gates to
        [0.5, 1); standard SE omits relu3 — kept to preserve the thesis design.
        """
        x = self.global_avg_pool(x)       # (B, C, 1, 1)
        x = x.view(x.size(0), -1)         # (B, C)
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.sigmoid(x)
        x = x.unsqueeze(2).unsqueeze(3)   # back to (B, C, 1, 1) for broadcasting
        return x
# Residual bottleneck block for Model-C (same wiring as the ResNet-50 block).
class Bottleneck(nn.Module):
    def __init__(self, in_channels, filters, stride=1):
        """filters is (F1, F2, F3); the block maps in_channels -> F3 with the
        stride applied on the first 1x1 convolution."""
        super(Bottleneck, self).__init__()
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = filters[-1]  # the last filter is the block output width
        width_in, width_mid, width_out = filters
        # 1x1 reduce -> 3x3 -> 1x1 expand, BatchNorm after every convolution.
        self.block = nn.Sequential(
            nn.Conv2d(in_channels, width_in, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(width_in),
            nn.ReLU(inplace=True),
            nn.Conv2d(width_in, width_mid, 3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(width_mid),
            nn.ReLU(inplace=True),
            nn.Conv2d(width_mid, width_out, 1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(width_out)
        )
        # Projection shortcut, used only when the input/output shapes differ.
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels, width_out, 1, stride=stride, bias=False),
            nn.BatchNorm2d(width_out)
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.block(x)
        if self.stride == 1 and self.in_channels == self.out_channels:
            out += x                      # identity shortcut
        else:
            out += self.downsample(x)     # projected shortcut
        return self.relu(out)
# Model-C: ResNet-50 backbone with a Squeeze-and-Excitation gate per stage.
class Model_C(nn.Module):
    def __init__(self, n_class):
        """Args:
            n_class: number of output classes.
        """
        super(Model_C, self).__init__()
        # Stage 1: 7x7 conv stem + max pool (outputs 64 channels).
        self.stage1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2, padding=1),
        )
        self.stage2 = nn.Sequential(
            Bottleneck(64, [64, 64, 256], stride=1),
            Bottleneck(256, [64, 64, 256]),
            Bottleneck(256, [64, 64, 256])
        )
        self.stage3 = nn.Sequential(
            Bottleneck(256, [128, 128, 512], stride=2),
            Bottleneck(512, [128, 128, 512]),
            Bottleneck(512, [128, 128, 512]),
            Bottleneck(512, [128, 128, 512])
        )
        self.stage4 = nn.Sequential(
            Bottleneck(512, [256, 256, 1024], stride=2),
            Bottleneck(1024, [256, 256, 1024]),
            Bottleneck(1024, [256, 256, 1024]),
            Bottleneck(1024, [256, 256, 1024])
        )
        self.stage5 = nn.Sequential(
            Bottleneck(1024, [512, 512, 2048], stride=2),
            Bottleneck(2048, [512, 512, 2048]),
            Bottleneck(2048, [512, 512, 2048])
        )
        # One SE gate per stage output.
        # BUG FIX: stage1 outputs 64 channels, so the first SE module must be
        # built for 64 (the original's 256 crashes its first Linear layer).
        self.se_modules = nn.ModuleList([SE_module(c) for c in [64, 256, 512, 1024, 2048]])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, n_class)

    def forward(self, X):
        """Return class logits of shape (batch, n_class)."""
        out = self.stage1(X)
        # BUG FIX: the original called the ModuleList itself
        # (`self.se_modules(out)`), which raises; index the stage-1 gate.
        out = out * self.se_modules[0](out)
        out = self.stage2(out)
        out = out * self.se_modules[1](out)
        out = self.stage3(out)
        out = out * self.se_modules[2](out)
        out = self.stage4(out)
        out = out * self.se_modules[3](out)
        out = self.stage5(out)
        out = out * self.se_modules[4](out)
        out = self.avgpool(out)
        out = out.view(out.size(0), -1)  # flatten (batch, 2048, 1, 1)
        out = self.fc(out)
        return out
5. Libraries
Importing the required libraries
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CyclicLR
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from glob import glob
from skimage.io import imread
from os import listdir
import time
import copy
from tqdm import tqdm_notebook as tqdm
6. Data Preprocessing & Exploratory Data Analysis
# Locate the IDC dataset and list the per-patient folders.
base_path = "../input/breast-histopathology-images/IDC_regular_ps50_idx5/"
# BUG FIX: `files` was sliced before it was ever assigned; list the
# directory first, then peek at the first ten entries (notebook display).
files = listdir(base_path)
files[0:10]
# Number of entries in the dataset directory (notebook display).
len(files)
# Each top-level folder corresponds to one patient.
folder = listdir(base_path)
len(folder)
# Count the total number of image patches over all patients and both classes.
total_images = 0
for patient_id in folder:
    # Hoisted out of the class loop: the patient path is class-independent.
    patient_path = base_path + patient_id
    for c in [0, 1]:
        # Patches are stored under <patient>/<class>/ with class in {0, 1}.
        class_path = patient_path + "/" + str(c) + "/"
        total_images += len(listdir(class_path))
# Collect one row per patch: (patient_id, path, target).
# BUG FIX / PERF: the original filled the DataFrame cell-by-cell via chained
# `data.iloc[k][col] = ...`, which is very slow and relies on fragile
# chained-assignment semantics; build a row list and construct once instead.
rows = []
for patient_id in folder:
    patient_path = base_path + patient_id
    for c in [0, 1]:
        class_path = patient_path + "/" + str(c) + "/"
        for image_name in listdir(class_path):
            rows.append({"patient_id": patient_id,
                         "path": class_path + image_name,
                         "target": c})
data = pd.DataFrame(rows, columns=["patient_id", "path", "target"])
# Peek at the table and its dimensions (notebook display).
data.head()
data.shape
# Per-patient cancer proportion and overall class balance plots.
cancer_perc = data.groupby("patient_id").target.value_counts() / data.groupby("patient_id").target.size()
# Unstack so the two target values become columns.
cancer_perc = cancer_perc.unstack()

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
# BUG FIX: the first subplot must be addressed as ax[0]; `ax` itself is an
# array of Axes and has no plotting methods.
sns.distplot(data.groupby("patient_id").size(), ax=ax[0], color="Orange", kde=False, bins=30)
ax[0].set_xlabel("Number of patches")
ax[0].set_ylabel("Frequency")
ax[0].set_title("How many patches do we have per patient?")
sns.distplot(cancer_perc.loc[:, 1] * 100, ax=ax[1], color="Tomato", kde=False, bins=30)
ax[1].set_title("How much percentage of an image is covered by IDC?")
ax[1].set_ylabel("Frequency")
ax[1].set_xlabel("% of patches with IDC")
sns.countplot(data.target, palette="Set2", ax=ax[2])
ax[2].set_xlabel("no(0) versus yes(1)")
ax[2].set_title("How many patches show IDC?")
# BUG FIX: np.int was removed from NumPy; use the builtin int.
data.target = data.target.astype(int)
# Show 50 random positive and 50 random negative patches.
def _show_patch_grid(selection):
    """Plot a 5x10 grid of the patches whose DataFrame indices are in `selection`."""
    fig, ax = plt.subplots(5, 10, figsize=(20, 10))
    for n in range(5):
        for m in range(10):
            idx = selection[m + 10 * n]
            ax[n, m].imshow(imread(data.loc[idx, "path"]))
            ax[n, m].grid(False)

# Sample 50 distinct patches from each class.
pos_selection = np.random.choice(data[data.target == 1].index.values, size=50, replace=False)
neg_selection = np.random.choice(data[data.target == 0].index.values, size=50, replace=False)
# Deduplicated via the helper: the original repeated the grid-plot loop twice.
_show_patch_grid(pos_selection)
_show_patch_grid(neg_selection)
# Parse patch coordinates out of the image file names.
def extract_coords(df):
    """Add integer "x" and "y" columns to `df` (in place) parsed from `df.path`.

    Paths look like ".../<id>_idx5_x1001_y1501_class0.png"; the function
    returns the mutated DataFrame.
    """
    # Split off the last four underscore-separated fields of the path.
    coord = df.path.str.rsplit("_", n=4, expand=True)
    coord = coord.drop([0, 1, 4], axis=1)
    coord = coord.rename({2: "x", 3: "y"}, axis=1)
    # BUG FIX: np.int was removed from NumPy, and `case=` requires regex
    # matching in recent pandas; use regex=True explicitly and the builtin int.
    coord.loc[:, "x"] = coord.loc[:, "x"].str.replace("x", "", case=False, regex=True).astype(int)
    coord.loc[:, "y"] = coord.loc[:, "y"].str.replace("y", "", case=False, regex=True).astype(int)
    df.loc[:, "x"] = coord.x.values
    df.loc[:, "y"] = coord.y.values
    return df
# Load all patches of one patient/class as a DataFrame.
def get_cancer_dataframe(patient_id, cancer_id):
    """Return a DataFrame with columns x, y, target, path for one patient
    and one class folder ("0" or "1")."""
    path = base_path + patient_id + "/" + cancer_id
    files = listdir(path)
    dataframe = pd.DataFrame(files, columns=["filename"])
    # Full path of every patch file.
    path_names = path + "/" + dataframe.filename.values
    # Split the file names into their underscore-separated fields.
    dataframe = dataframe.filename.str.rsplit("_", n=4, expand=True)
    # BUG FIX: np.int was removed from NumPy; use the builtin int.
    dataframe.loc[:, "target"] = int(cancer_id)
    dataframe.loc[:, "path"] = path_names
    dataframe = dataframe.drop([0, 1, 4], axis=1)
    dataframe = dataframe.rename({2: "x", 3: "y"}, axis=1)
    # BUG FIX: the original never returned the result, so callers got None.
    return dataframe
def get_patient_dataframe(patient_id):
    """Concatenate the class-0 and class-1 patch DataFrames of one patient."""
    df_0 = get_cancer_dataframe(patient_id, "0")
    df_1 = get_cancer_dataframe(patient_id, "1")
    # BUG FIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
    patient_df = pd.concat([df_0, df_1])
    return patient_df

# BUG FIX: the original passed the whole array of patient ids; the function
# expects a single id — use the first one as the example.
example = get_patient_dataframe(data.patient_id.values[0])
example.head()
# Spatial layout of the patches for the first 15 patients, colored by target.
fig, ax = plt.subplots(5, 3, figsize=(20, 27))
patient_ids = data.patient_id.unique()
for n in range(5):
    for m in range(3):
        patient_id = patient_ids[m + 3 * n]
        example_df = get_patient_dataframe(patient_id)
        # Scatter x against y, colored by the patch label.
        ax[n, m].scatter(example_df.x.values, example_df.y.values,
                         c=example_df.target.values, cmap="coolwarm", s=20)
        ax[n, m].set_title("patient " + patient_id)
        # BUG FIX: the axis labels were swapped relative to the scatter call
        # (x values go on the x-axis, y values on the y-axis).
        ax[n, m].set_xlabel("x coord")
        ax[n, m].set_ylabel("y coord")
# Stitch all patches of one patient back into a full tissue image.
def visualise_breast_tissue(patient_id, pred_df=None):
    """Reconstruct a patient's slide from its 50x50 patches.

    Returns (grid, mask, broken_patches, mask_proba):
      grid — the reconstructed RGB slide,
      mask — overlay with cancerous (target == 1) patches colored red,
      broken_patches — paths of patches that could not be read/placed,
      mask_proba — per-pixel predicted probability, or None if pred_df is None.
    """
    example_df = get_patient_dataframe(patient_id)
    max_point = [example_df.y.max() - 1, example_df.x.max() - 1]
    # BUG FIX: the original computed `max_point + 50` (list + int ->
    # TypeError); the canvas height/width come from max_point[0]/[1].
    height = max_point[0] + 50
    width = max_point[1] + 50
    # White canvases for the image and the class overlay.
    grid = 255 * np.ones(shape=(height, width, 3)).astype(np.uint8)
    mask = 255 * np.ones(shape=(height, width, 3)).astype(np.uint8)
    # BUG FIX: mask_proba was only created when pred_df was given, but it is
    # always returned; default it to None.
    mask_proba = None
    if pred_df is not None:
        patient_df = pred_df[pred_df.patient_id == patient_id].copy()
        mask_proba = np.zeros(shape=(height, width, 1)).astype(float)
    broken_patches = []
    for n in range(len(example_df)):
        try:
            image = imread(example_df.path.values[n])
            target = example_df.target.values[n]
            # BUG FIX: np.int/np.float were removed from NumPy; use builtins.
            x_coord = int(example_df.x.values[n])
            y_coord = int(example_df.y.values[n])
            # Patch coordinates are 1-based; each patch covers 50x50 pixels.
            x_start = x_coord - 1
            y_start = y_coord - 1
            x_end = x_start + 50
            y_end = y_start + 50
            grid[y_start:y_end, x_start:x_end] = image
            if target == 1:
                # Mark cancerous patches red in the overlay mask.
                mask[y_start:y_end, x_start:x_end, 0] = 250
                mask[y_start:y_end, x_start:x_end, 1] = 0
                mask[y_start:y_end, x_start:x_end, 2] = 0
            if pred_df is not None:
                proba = patient_df[(patient_df.x == x_coord) & (patient_df.y == y_coord)].proba
                mask_proba[y_start:y_end, x_start:x_end, 0] = float(proba)
        except ValueError:
            # Keep going when a patch cannot be read or placed.
            broken_patches.append(example_df.path.values[n])
    return grid, mask, broken_patches, mask_proba
# Visualise one example patient's reconstructed slide.
example = "13616"
grid, mask, broken_patches, _ = visualise_breast_tissue(example)
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
# BUG FIX: the first subplot must be addressed as ax[0]; `ax` is an array.
ax[0].imshow(grid, alpha=0.9)
# Overlay the red cancer mask under a translucent copy of the slide.
ax[1].imshow(mask, alpha=0.8)
ax[1].imshow(grid, alpha=0.7)
ax[0].grid(False)
ax[1].grid(False)
for m in range(2):
    # BUG FIX: both labels said "y-coord"; label the axes correctly.
    ax[m].set_xlabel("x-coord")
    ax[m].set_ylabel("y-coord")
# BUG FIX: `patient_id` here was a stale loop variable from an earlier cell;
# use the patient actually plotted (`example`).
ax[0].set_title("Breast tissue slice of patient: " + example)
ax[1].set_title("Cancer tissue colored red \n of patient: " + example);
# Show which patches failed to load (notebook display).
broken_patches
# Training configuration constants.
BATCH_SIZE = 32
NUM_CLASSES = 2
# Output / pretrained-model locations.
OUTPUT_PATH = ""
MODEL_PATH = "../input/breastcancermodel/"
LOSSES_PATH = "../input/breastcancermodel/"
# Seed both torch and numpy for reproducibility.
torch.manual_seed(0)
np.random.seed(0)
# Peek at the patch table (notebook display).
data.head()
# BUG FIX: np.str was removed from NumPy; use the builtin str.
data.loc[:, "target"] = data.target.astype(str)
data.info()
# Patient-level train / dev / test split (70 / 15 / 15) so patches of the same
# patient never leak across splits.
patients = data.patient_id.unique()
train_ids, sub_test_ids = train_test_split(patients, test_size=0.3, random_state=0)
test_ids, dev_ids = train_test_split(sub_test_ids, test_size=0.5, random_state=0)
# BUG FIX: `patients.shape` is a tuple — dividing by it raises; use the count.
print(len(train_ids) / len(patients) * 100,
      len(dev_ids) / len(patients) * 100,
      len(test_ids) / len(patients) * 100)
print(len(train_ids), len(dev_ids), len(test_ids))
# Select the patch rows for each split and attach patch coordinates.
train_df = data.loc[data.patient_id.isin(train_ids), :].copy()
test_df = data.loc[data.patient_id.isin(test_ids), :].copy()
dev_df = data.loc[data.patient_id.isin(dev_ids), :].copy()
train_df = extract_coords(train_df)
test_df = extract_coords(test_df)
dev_df = extract_coords(dev_df)
# Class balance per split.
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
# BUG FIX: the first subplot must be addressed as ax[0].
sns.countplot(train_df.target, ax=ax[0], palette="Reds")
ax[0].set_title("Train data")
sns.countplot(dev_df.target, ax=ax[1], palette="Blues")
ax[1].set_title("Dev data")
sns.countplot(test_df.target, ax=ax[2], palette="Greens")
ax[2].set_title("Test data")
def my_transform(key="train", plot=False):
    """Return the torchvision transform pipeline for `key` ("train" or "val").

    Train adds random horizontal/vertical flips on top of the 50x50 resize;
    with plot=True the ToTensor/Normalize steps are skipped so the images
    stay directly plottable.
    """
    train_sequence = [transforms.Resize((50, 50)),
                      transforms.RandomHorizontalFlip(),
                      transforms.RandomVerticalFlip()]
    val_sequence = [transforms.Resize((50, 50))]
    # Idiomatic truthiness test instead of `plot == False`.
    if not plot:
        # ImageNet channel statistics for normalization.
        train_sequence.extend([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        val_sequence.extend([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    data_transforms = {'train': transforms.Compose(train_sequence), 'val': transforms.Compose(val_sequence)}
    return data_transforms[key]
# Dataset yielding one patch image plus its label and patch metadata.
class BreastCancerDataset(Dataset):
    def __init__(self, df, transform=None):
        """Args:
            df: patch table with patient_id, x, y, path (and optionally target).
            transform: optional callable applied to the PIL image.
        """
        self.states = df
        self.transform = transform

    def __len__(self):
        """Number of patches in the table."""
        return len(self.states)

    def __getitem__(self, idx):
        """Return a dict with the (transformed) image, label (None when the
        table has no target column), patient id and patch coordinates."""
        patient_id = self.states.patient_id.values[idx]
        x_coord = self.states.x.values[idx]
        y_coord = self.states.y.values[idx]
        image_path = self.states.path.values[idx]
        image = Image.open(image_path)
        image = image.convert('RGB')
        if self.transform:
            image = self.transform(image)
        if "target" in self.states.columns.values:
            # BUG FIX: np.int was removed from NumPy; use the builtin int.
            target = int(self.states.target.values[idx])
        else:
            target = None
        return {"image": image,
                "label": target,
                "patient_id": patient_id,
                "x": x_coord,
                "y": y_coord}
# Build the three datasets and their loaders from the split DataFrames.
train_dataset = BreastCancerDataset(train_df, transform=my_transform(key="train"))
dev_dataset = BreastCancerDataset(dev_df, transform=my_transform(key="val"))
test_dataset = BreastCancerDataset(test_df, transform=my_transform(key="val"))
image_datasets = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
# Sizes per split, keyed like image_datasets.
dataset_sizes = {split: len(ds) for split, ds in image_datasets.items()}

# Show a few training patches before/after the train and val preprocessing.
fig, ax = plt.subplots(3, 6, figsize=(20, 11))
train_transform = my_transform(key="train", plot=True)
val_transform = my_transform(key="val", plot=True)
for col in range(6):
    raw = Image.open(train_df.path.values[col])
    ax[0, col].imshow(raw)                    # original patch
    ax[1, col].imshow(train_transform(raw))   # train-time augmentation
    ax[2, col].imshow(val_transform(raw))     # val-time preprocessing
    for row in range(3):
        ax[row, col].grid(False)
    ax[0, col].set_title(train_df.patient_id.values[col] + "\n target: " + train_df.target.values[col])
    ax[1, col].set_title("Preprocessing for train")
    ax[2, col].set_title("Preprocessing for val")

# Loaders: shuffle only the training data; keep every test batch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
dataloaders = {"train": train_dataloader, "dev": dev_dataloader, "test": test_dataloader}
# Number of batches per split.
print(len(dataloaders["train"]), len(dataloaders["dev"]), len(dataloaders["test"]))