Pytorch基础知识(5)图像二分类

图像分类又叫图像识别,是计算机视觉中的重要任务。在这个任务中,我们假设每张图像只包含一个对象。图像分类分为两种,一种叫二分类,一种叫多分类。
我们将覆盖以下内容:

  • 探索数据集
  • 创建一个数据集
  • 划分数据集
  • 数据预处理
  • 创建数据读取器
  • 构建分类模型
  • 定义损失函数
  • 定义优化器
  • 模型训练与评估
  • 模型部署
  • 在测试集进行模型推理

探索数据集

数据准备

Histopathologic Cancer Detection数据集
下载以后,解压到文件夹名为data中。
在data文件夹下,有两个文件夹:train和test。train文件夹下包含96x96大小的220025张.tif的图像。图像文件名就是图像的ID。train_labels.csv文件提供了train文件夹中图片的真实值。

  1. 读取train_labels.csv并且打印它的头:
import pandas as pd
path2csv = "./data/train_labels.csv"
labels_df = pd.read_csv(path2csv)
labels_df.head()

在这里插入图片描述

  1. 打印各类别的数目
print(labels_df['label'].value_counts())
# 0 130908
# 1 89117
# Name: label, dtype:int64
  1. 查看标签直方图
labels_df['label'].hist()

在这里插入图片描述

  1. 可视化图像
import matplotlib.pylab as plt
from PIL import Image, ImageDraw
import numpy as np
import os
# %matplotlib inline

# Preview the first nrows*ncols images labelled 1 from the training folder.
# get IDs for malignant images (rows of labels_df whose label equals 1)
malignantIds = labels_df.loc[labels_df['label']==1]['id'].values

# data is stored here
path2train = "./data/train/"

# show images in grayscale, if you want color change it to True
color = False

# set figure size
plt.rcParams['figure.figsize'] = (10.0, 10.0)
plt.subplots_adjust(wspace=0, hspace=0)
nrows,ncols=3,3

# display the images
for i, id_ in enumerate(malignantIds[:nrows*ncols]):
	# the image file name is the ID plus the .tif extension
	full_filenames = os.path.join(path2train, id_ + '.tif')
	# load image
	img = Image.open(full_filenames)
	# outline the square between (32, 32) and (64, 64) in green
	draw = ImageDraw.Draw(img)
	draw.rectangle(((32, 32), (64, 64)), outline="green")
	# subplot positions are 1-based
	plt.subplot(nrows, ncols, i+1)
	if color is True:
		plt.imshow(np.array(img))
	else:
		# grayscale mode shows only the first channel of the image array
		plt.imshow(np.array(img)[:,:,0], cmap="gray")
	plt.axis('off')
plt.show()

在这里插入图片描述

  1. 获取图片形状大小以及最大最小像素值
print("image shape:", np.array(img).shape)
print("pixel values range from %s to %s" % (np.min(img), np.max(img)))

# image shape: (96, 96, 3)
# pixel values range from 0 to 255

创建自己的数据集

我们可以通过PyTorch Dataset类来创建自定义Dataset类。创建自定义数据集类时,请确保定义两个基本函数:__len__和__getitem__。__len__函数返回数据集的长度,这个函数可以通过Python的len函数调用;__getitem__函数返回指定索引的图像。

  1. 首先,导入相关包,定义histCancerDataset类:
from PIL import Image
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchvision.transforms as transforms
import os

# dont forget to fix the random seed for reproducibility
# fix torch random seed
torch.manual_seed(0)

class histCancerDataset(Dataset):
	"""Image dataset backed by a folder of image files and a CSV file of labels.

	Images are read from ``<data_dir>/<data_type>/`` and labels from
	``<data_dir>/<data_type>_labels.csv``. The CSV must have an ``id`` column;
	each file name minus its last four characters is looked up in that column,
	and the first remaining column of the matching row is used as the label.

	Attributes set by the constructor:
		full_filenames: path of every file found in the image folder.
		labels: label of each file, in the same order as ``full_filenames``.
		transform: callable applied to each opened PIL image.
	"""

	def __init__(self, data_dir, transform, data_type="train"):
		# folder holding the images of the requested split
		image_dir = os.path.join(data_dir, data_type)
		names = os.listdir(image_dir)
		self.full_filenames = [os.path.join(image_dir, name) for name in names]

		# label table, indexed by image id for direct lookup
		label_table = pd.read_csv(os.path.join(data_dir, data_type + "_labels.csv"))
		label_table.set_index("id", inplace=True)

		self.labels = []
		for name in names:
			# strip the 4-character extension (e.g. ".tif") to get the image id
			image_id = name[:-4]
			self.labels.append(label_table.loc[image_id].values[0])
		self.transform = transform

	def __len__(self):
		"""Return the number of image files in the dataset."""
		return len(self.full_filenames)

	def __getitem__(self, idx):
		"""Return ``(transformed_image, label)`` for position ``idx``."""
		pil_image = Image.open(self.full_filenames[idx])
		return self.transform(pil_image), self.labels[idx]

  1. 图片变换
import torchvision.transforms as transforms
data_transformer = transforms.Compose([
	transforms.ToTensor()
])
  1. 定义一个普通数据集的对象
data_dir = "./data/"
histo_dataset = histCancerDataset(data_dir, data_transformer, "train")
print(len(histo_dataset))

# 220025
  1. 使用自定义的类读取一张图片
#load an image
img,label = histo_dataset[9]
print(img.shape, torch.min(img), torch.max(img))

# torch.Size([3, 96, 96]) tensor(0.) tensor(1.)

拆分数据集

我们将把数据集分为训练集和验证集,并分别显示其中的一些样本。

  1. 拆分histo_dataset
from torch.utils.data import random_split
len_histo = len(histo_dataset)
len_train = int(0.8 * len_histo)
len_val = len_histo - len_train
train_ds, val_ds = random_split(histo_dataset, [len_train, len_val])
print("train dataset length: ", len(train_ds))
print("val dataset length: ", len(val_ds))

# train dataset length: 176020
# val dataset length: 44005
  1. 获取训练集中的一张图片
for x, y in train_ds:
	print(x.shape, y)
	break

# torch.Size([3, 96, 96]) 1
  1. 获取验证集中的一张图片
for x, y in val_ds:
	print(x.shape, y)
	break

# torch.Size([3, 96, 96]) 1
  1. 显示训练集中的一些样本图片
# import the required package
from torchvision import utils
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
np.random.seed(0)

# define a function to show image:
def show(img, y, color=False):
	"""Display an image tensor with matplotlib, titled with its label(s).

	Args:
		img: tensor laid out as C*H*W (it is transposed to H*W*C for plotting).
		y: label or list of labels; rendered in the title via ``str(y)``.
		color: when equal to False only the first channel is drawn, using a
			gray colormap; otherwise all channels are drawn.
	"""
	# tensor -> numpy array, reordered from C*H*W to H*W*C
	hwc = img.numpy().transpose(1, 2, 0)
	if color == False:
		plt.imshow(hwc[:, :, 0], interpolation="nearest", cmap="gray")
	else:
		plt.imshow(hwc, interpolation="nearest")
	plt.title("label: " + str(y))
	plt.show()

# create a grid of sample images:
# Show a few randomly chosen training samples side by side.
grid_size = 4
# sample grid_size positions in [0, len(train_ds)); repeats are possible
rnd_inds = np.random.randint(0, len(train_ds), grid_size)
print("image indices: ", rnd_inds)
# each dataset item is an (image, label) pair
x_grid_train = [train_ds[i][0] for i in rnd_inds]
y_grid_train = [train_ds[i][1] for i in rnd_inds]

# tile the images into a single image tensor, 4 per row with 2 pixels of padding
x_grid_train = utils.make_grid(x_grid_train, nrow=4, padding=2)
print(x_grid_train.shape)

# display the grid
plt.rcParams["figure.figsize"] = (10.0, 5)
show(x_grid_train, y_grid_train)

# image indices: [43567 173685 117852 152315]
# torch.Size([3, 100, 394])

在这里插入图片描述

  1. 显示val_ds中的一些样例
# Show a few randomly chosen validation samples side by side.
grid_size = 4
rnd_inds = np.random.randint(0, len(val_ds), grid_size)
print("image indices:", rnd_inds)
# index with the sampled positions so the displayed images match the printed indices
x_grid_val = [val_ds[i][0] for i in rnd_inds]
y_grid_val = [val_ds[i][1] for i in rnd_inds]

# tile the images into a single image tensor, 4 per row with 2 pixels of padding
x_grid_val = utils.make_grid(x_grid_val, nrow=4, padding=2)
print(x_grid_val.shape)

show(x_grid_val, y_grid_val)

# image indices: [four random indices in the range 0 .. len(val_ds)-1]
# torch.Size([3, 100, 394])

在这里插入图片描述

数据变换与数据增强

图像变换和图像增强是深度学习模型训练所必须的。通过使用图像变换,我们可以扩展我们的数据集并且通过规范化以实现更好的模型性能。典型的变换包括水平和垂直翻转、旋转和调整大小。我们可以对二分类模型使用各种图像变换,而不需要更改标签。例如,如果我们旋转或翻转一个恶性肿瘤图像,它仍然是恶性肿瘤。在本教程中,您将学习如何使用torchvision包在训练期间执行实时图像变换。

  1. 首先,为训练集定义如下变换函数
train_transformer = transforms.Compose([
	transforms.RandomHorizontalFlip(p=0.5),
	transforms.RandomVerticalFlip(p=0.5),
	transforms.RandomRotation(45),
	transforms.RandomResizedCrop(96, scale=(0.8, 1.0), ratio=(1.0, 1.0)),
	transforms.ToTensor()
])
  1. 对于验证集,我们不需要任何数据增强。所以我们只需要把图像转换为tensors
val_transformer = transforms.Compose([transforms.ToTensor()])
  1. 更新train_ds和val_ds变换函数
# overwrite the transform functions
# NOTE(review): train_ds and val_ds come from random_split, i.e. they are Subset
# wrappers that fetch items through the shared histo_dataset. Assigning
# `.transform` here only sets an attribute on the wrappers; items appear to keep
# going through histo_dataset.transform (data_transformer), so the augmentations
# above would not be applied. Confirm, and if so give each split its own
# transform-applying dataset.
train_ds.transform = train_transformer
val_ds.transform = val_transformer 

创建dataloaders

我们准备创建一个PyTorch数据加载器。如果我们不使用数据加载器,我们必须编写代码来循环数据集并提取一批数据。这个过程可以使用PyTorch数据加载器自动完成。

  1. 首先,分别定义训练集和验证集的数据加载器
from torch.utils.data import DataLoader
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)
  1. 然后,从训练集的数据加载器中获取一批数据
# extract a batch from training data
for x, y in train_dl:
	print(x.shape)
	print(y.shape)
	break
# torch.Size([32, 3, 96, 96])
# torch.Size([32])
  1. 最后,从验证集的数据加载器中获取一批数据
# extract a batch from val data
for x, y in val_dl:
	print(x.shape)
	print(y.shape)
	break
# torch.Size([32, 3, 96, 96])
# torch.Size([32])

构建分类模型

我们将定义一个模型,把模型移到GPU设备上,并且获得模型概要。

  1. 为验证集创建baseline
# get labels for validation dataset
y_val = [ y for _, y in val_ds]
def accuracy(labels, out):
	"""Return the fraction of entries in ``out`` that equal ``labels``.

	Args:
		labels: sequence of ground-truth labels.
		out: predictions compared element-wise against ``labels``.
	"""
	n_match = np.sum(out == labels)
	n_total = float(len(labels))
	return n_match / n_total
# accuracy all zeros predictions
acc_all_zeros = accuracy(y_val, np.zeros_like(y_val))
print("accuracy all zero prediction: %.2f" % acc_all_zeros)

# accuracy all zero prediction: 0.60

# accuracy all ones predictions
acc_all_ones = accuracy(y_val, np.ones_like(y_val))
print("accuracy all one prediction: %.2f" % acc_all_ones )
# accuracy all one prediction: 0.40

# accuracy random predictions
acc_random = accuracy(y_val, np.random.randint(2, size=len(y_val)))
print("accuracy random prediction:%.2f"%acc_random)
# accuracy random prediction: 0.50
  1. 构建函数实现计算CNN layer的输出尺寸的功能
import torch.nn as nn
import numpy as np
def findConv2dOutShape(H_in, W_in, conv, pool=2):
	"""Compute the spatial output size of a conv layer followed by optional pooling.

	Args:
		H_in: input height.
		W_in: input width.
		conv: object exposing ``kernel_size``, ``stride``, ``padding`` and
			``dilation`` as (height, width) pairs, e.g. an ``nn.Conv2d``.
		pool: factor the conv output is divided by; a falsy value (None, 0)
			skips the pooling step.

	Returns:
		``(H_out, W_out)`` as Python ints.
	"""
	def _out_size(size_in, axis):
		# standard convolution arithmetic along one axis (0 = height, 1 = width)
		span = conv.dilation[axis] * (conv.kernel_size[axis] - 1)
		return np.floor((size_in + 2 * conv.padding[axis] - span - 1) / conv.stride[axis] + 1)

	H_out = _out_size(H_in, 0)
	# the width must be derived from W_in; using H_in here is only right for square inputs
	W_out = _out_size(W_in, 1)
	if pool:
		H_out /= pool
		W_out /= pool
	# int() truncates, matching the floor behaviour of pooling for positive sizes
	return int(H_out), int(W_out)

# for example
conv1 = nn.Conv2d(3, 8, kernel_size=3)
h,w = findConv2dOutShape(96, 96, conv1)
print(h, w)

# 47 47
  1. 下一步,我们实现CNN模型
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
	"""CNN classifier: four conv+max-pool stages followed by two linear layers.

	``params`` is a dict with the keys ``input_shape`` (C, H, W),
	``initial_filters``, ``num_fc1``, ``num_classes`` and ``dropout_rate``.
	The number of filters doubles at every conv stage, starting from
	``initial_filters``. ``forward`` returns log-probabilities (log_softmax).
	"""

	def __init__(self, params):
		super().__init__()
		C_in, H_in, W_in = params["input_shape"]
		init_f = params["initial_filters"]
		num_fc1 = params["num_fc1"]
		num_classes = params["num_classes"]
		self.dropout_rate = params["dropout_rate"]

		# each stage: 3x3 conv, then track the feature-map size after conv + pooling
		self.conv1 = nn.Conv2d(C_in, init_f, kernel_size=3)
		h, w = findConv2dOutShape(H_in, W_in, self.conv1)

		self.conv2 = nn.Conv2d(init_f, 2 * init_f, kernel_size=3)
		h, w = findConv2dOutShape(h, w, self.conv2)

		self.conv3 = nn.Conv2d(2 * init_f, 4 * init_f, kernel_size=3)
		h, w = findConv2dOutShape(h, w, self.conv3)

		self.conv4 = nn.Conv2d(4 * init_f, 8 * init_f, kernel_size=3)
		h, w = findConv2dOutShape(h, w, self.conv4)

		# number of features after flattening the last feature map
		self.num_flatten = h * w * 8 * init_f
		self.fc1 = nn.Linear(self.num_flatten, num_fc1)
		self.fc2 = nn.Linear(num_fc1, num_classes)

	def forward(self, x):
		# conv -> ReLU -> 2x2 max-pool, repeated for the four conv layers
		for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
			x = F.max_pool2d(F.relu(conv(x)), 2, 2)
		x = x.view(-1, self.num_flatten)
		hidden = F.relu(self.fc1(x))
		# dropout is active only while the module is in training mode
		hidden = F.dropout(hidden, self.dropout_rate, training=self.training)
		logits = self.fc2(hidden)
		return F.log_softmax(logits, dim=1)
  1. 创建一个Net对象
# dict to define model parameters
params_model = {"input_shape": (3, 96, 96),
				"initial_filters": 8,
				"num_fc1": 100,
				"dropout_rate": 0.25,
				"num_classes": 2
				}
# create model
cnn_model = Net(params_model)
  1. 把模型移到GPU上
# move model to the cuda/gpu device when one is available, otherwise stay on the cpu.
# `device` is always defined, so later code that reads it also works on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = cnn_model.to(device)
  1. 打印模型
print(cnn_model)

在这里插入图片描述

  1. 验证模型设备
print(next(cnn_model.parameters()).device)
# cuda:0
  1. 获取模型摘要
from torchsummary import summary
summary(cnn_model, input_size=(3, 96, 96), device=device.type)

在这里插入图片描述

定义损失函数

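分类任务的标准损失函数是交叉熵损失或logloss。但是,在定义损失函数时,我们需要考虑模型输出及其激活函数。对于二分类任务,我们可以选择一个或两个输出。下表为不同激活函数对应的损失函数:

  输出层激活函数 | 输出个数 | 损失函数
  无             | 1        | nn.BCEWithLogitsLoss
  Sigmoid        | 1        | nn.BCELoss
  无             | 2        | nn.CrossEntropyLoss
  log_softmax    | 2        | nn.NLLLoss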

推荐使用log_softmax函数,因为它更容易扩展到多分类。由于数值稳定性和速度,PyTorch将log和softmax操作合并到一个函数中。

# 首先定义损失函数
loss_func = nn.NLLLoss(reduction="sum")
# 简单样例
# fix random seed
torch.manual_seed(0)

n,c = 8,2
y = torch.randn(n, c, requires_grad=True)
ls_F = nn.LogSoftmax(dim=1)
y_out = ls_F(y)
print(y_out.shape)

target = torch.randint(c, size=(n,))
print(target.shape)

loss = loss_func(y_out, target)
print(loss.item())

# torch.Size([8, 2])
# torch.Size([8])
# 5.266995429992676

# then compute the gradient of the loss with respect to y
loss.backward()
# NOTE: y.data holds the values of y itself; the gradient produced by
# backward() is stored in y.grad
print(y.data)

# tensor([[-1.1258, -1.1524],
#       [-0.2506, -0.4339],
#        [ 0.8487,  0.6920],
#        [-0.3160, -2.1152],
#        [ 0.3223, -1.2633],
#        [ 0.3500,  0.3081],
#        [ 0.1198,  1.2377],
#        [ 1.1168, -0.2473]])

定义优化器

torch.optim包提供了通用优化器的实现。优化器将保存当前状态,并根据计算出的梯度更新参数。对于二分类任务,SGD和Adam优化器使用最多。torch.optim包中另一个有用的工具就是(learning schedule)学习计划。学习计划(learning schedule)是在训练过程中自动调整学习率以提高模型性能的有效工具。
这里,将会定义一个优化器,获取当前学习率并且定义一个学习计划。

# 1.首先定义一个学习率为3e-4的Adam优化器对象
from torch import optim
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)

# 2.获取当前的学习率
def get_lr(opt):
	"""Return the learning rate of the optimizer's first parameter group.

	Returns None when ``opt.param_groups`` is empty.
	"""
	rates = (group['lr'] for group in opt.param_groups)
	return next(rates, None)
current_lr = get_lr(opt)
print("current lr={}".format(current_lr))

# current lr=0.0003
# 3.定义ReduceLROnPlateau学习计划
from torch.optim.lr_scheduler import ReduceLROnPlateau
# define learning rate schedule
# mode参数定义了度量量在训练期间是增加还是减少。例如,如果我们监视loss值,我们设置mode='min'。
# 如果我们监控accuracy,我们应该设置mode='max'。
lr_scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=20, verbose=1)

# 4.我们将使用下面的例子学习学习速率计划是如何工作的
for i in range(100):
	lr_scheduler.step(1)

# Epoch 21: reducing learning rate of group 0 to 1.5000e-04.
# Epoch 42: reducing learning rate of group 0 to 7.5000e-05.
# Epoch 63: reducing learning rate of group 0 to 3.7500e-05.
# Epoch 84: reducing learning rate of group 0 to 1.8750e-05.

模型训练与评估

到目前为止,我们已经创建了数据集,建立了模型,并定义了损失函数和优化器。在本教程中,我们将实现训练和验证脚本。训练和验证脚本可能很长且有重复的内容。为了更好的代码可读性和避免代码重复,我们将首先构建几个函数。

# 1.首先实现计算小批量准确个数的函数
def metrics_batch(output, target):
	"""Count the correct predictions in one batch.

	Args:
		output: per-class scores; the predicted class is the argmax along dim 1.
		target: ground-truth class indices, one per sample.

	Returns:
		Number of samples whose predicted class equals the target (Python int).
	"""
	top_class = output.argmax(dim=1, keepdim=True)
	# reshape the targets like the predictions before comparing element-wise
	hits = top_class.eq(target.view_as(top_class))
	return hits.sum().item()
# 2.然后实现计算小批量损失的函数
def loss_batch(loss_func, output, target, opt=None):
	"""Compute loss and correct-prediction count for a batch; optionally take a step.

	Args:
		loss_func: callable ``loss_func(output, target)`` returning a scalar tensor.
		output: model output for the batch.
		target: ground-truth labels for the batch.
		opt: optimizer; when given, gradients are reset, the loss is
			back-propagated and one optimization step is taken.

	Returns:
		``(loss_value, n_correct)`` where ``loss_value`` is ``loss.item()``.
	"""
	loss = loss_func(output, target)
	# the metric needs no gradient tracking
	with torch.no_grad():
		n_correct = metrics_batch(output, target)
	if opt is None:
		# evaluation only: no parameter update
		return loss.item(), n_correct
	opt.zero_grad()
	loss.backward()
	opt.step()
	return loss.item(), n_correct
# 3.下一步实现计算每个epoch的损失值和性能
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False,opt=None):
	"""Run one pass over a data loader and return the average loss and metric.

	Args:
		model: network called on each batch.
		loss_func: loss callable forwarded to ``loss_batch``.
		dataset_dl: data loader yielding ``(inputs, targets)`` batches; its
			``dataset`` attribute provides the sample count used for averaging.
		sanity_check: when True, stop after the first batch.
		opt: optimizer forwarded to ``loss_batch``; None means evaluation only.

	Returns:
		``(loss, metric)``: summed batch losses and summed batch metrics, each
		divided by ``len(dataset_dl.dataset)``. The divisor is the full dataset
		size even when ``sanity_check`` stops after one batch.
	"""
	running_loss=0.0
	running_metric=0.0
	len_data=len(dataset_dl.dataset)
	# Run on the device the model lives on instead of relying on a global
	# `device`, which may be undefined (CPU-only machine) or differ from the model's.
	first_param = next(model.parameters(), None)
	device = first_param.device if first_param is not None else torch.device("cpu")
	for xb, yb in dataset_dl:
		# move batch to device
		xb = xb.to(device)
		yb = yb.to(device)
		# get model output
		output = model(xb)
		# get loss per batch
		loss_b,metric_b=loss_batch(loss_func,output,yb,opt)
		# update running loss
		running_loss += loss_b
		# update running metric
		if metric_b is not None:
			running_metric+=metric_b
		# break the loop in case of sanity check
		if sanity_check is True:
			break
	# average loss value
	loss=running_loss/float(len_data)
	# average metric value
	metric=running_metric/float(len_data)
	return loss,metric
# 4.实现train_val函数
def train_val(model, params):
	"""Train and validate ``model``, keeping the weights with the lowest validation loss.

	Args:
		model: network to train.
		params: dict with the keys ``num_epochs``, ``loss_func``, ``optimizer``,
			``train_dl``, ``val_dl``, ``sanity_check``, ``lr_scheduler`` and
			``path2weights`` (where the best weights are saved).

	Returns:
		``(model, loss_history, metric_history)``; the model carries the best
		weights, and each history is a dict with per-epoch ``"train"`` and
		``"val"`` lists.
	"""
	# local imports: the function does not depend on module-level imports
	# that appear after its definition
	import copy
	import os

	# extract model params
	num_epochs = params["num_epochs"]
	loss_func = params["loss_func"]
	opt=params["optimizer"]
	train_dl=params["train_dl"]
	val_dl=params["val_dl"]
	sanity_check=params["sanity_check"]
	lr_scheduler=params["lr_scheduler"]
	path2weights=params["path2weights"]
	# make sure the checkpoint folder exists, otherwise torch.save fails
	if isinstance(path2weights, (str, os.PathLike)):
		weights_dir = os.path.dirname(path2weights)
		if weights_dir:
			os.makedirs(weights_dir, exist_ok=True)
	# history of loss values in each epoch
	loss_history = {
		"train":[],
		"val":[]
	}
	# history of metric values in each epoch
	metric_history={
		"train":[],
		"val":[]
	}
	# a deep copy of weights for the best performing model
	best_model_wts = copy.deepcopy(model.state_dict())
	# initialize best loss to a large value
	best_loss = float("inf")

	# main loop
	for epoch in range(num_epochs):
		# get current learning rate
		current_lr = get_lr(opt)
		print("Epoch {}/{}, current lr={}".format(epoch, num_epochs-1, current_lr))
		# train model on training dataset
		model.train()
		train_loss, train_metric=loss_epoch(model, loss_func, train_dl, sanity_check, opt)
		# collect loss and metric for training dataset
		loss_history["train"].append(train_loss)
		metric_history["train"].append(train_metric)
		# evaluate model on validation dataset
		model.eval()
		with torch.no_grad():
			val_loss, val_metric=loss_epoch(model, loss_func, val_dl, sanity_check)
		# collect loss and metric for validation dataset
		loss_history["val"].append(val_loss)
		metric_history["val"].append(val_metric)
		# store the best weights
		if val_loss < best_loss:
			best_loss = val_loss
			best_model_wts = copy.deepcopy(model.state_dict())
			# store weights into a local file
			torch.save(model.state_dict(), path2weights)
			print("Copied best model weights!")
		# the learning rate scheduler monitors the validation loss
		lr_scheduler.step(val_loss)
		# whenever the learning rate was reduced, resume from the best weights
		if current_lr != get_lr(opt):
			print("Loading best model weights")
			model.load_state_dict(best_model_wts)
		print("train loss: %.6f, dev loss: %.6f, accuracy: %.2f" %(train_loss, val_loss,100*val_metric))
		print("-"*10)
	# load best model weights
	model.load_state_dict(best_model_wts)
	return model, loss_history, metric_history
#5.设置sanity_check为True并运行代码
import copy
loss_func = nn.NLLLoss(reduction="sum")
opt=optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=20, verbose=1)
# define traing parameters and call train_val function
# training configuration consumed by train_val (sanity check: one batch per epoch)
params_train={
	"num_epochs": 100,
	"optimizer": opt,
	"loss_func": loss_func,
	"train_dl": train_dl,
	"val_dl": val_dl,
	"sanity_check": True,
	"lr_scheduler": lr_scheduler,
	"path2weights": "./models/weights.pt",
	}
# train and validate the model
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train)

# Epoch 0/99, current lr=0.0003
# Copied best model weights!
# train loss: 0.000129, dev loss: 0.001024, accuracy: 0.05
# ----------
# Epoch 1/99, current lr=0.0003
# Copied best model weights!
# train loss: 0.000125, dev loss: 0.001021, accuracy: 0.05
# ...
#6. plot loss_his and metric_hist
# train-validation progress
num_epochs = params_train["num_epochs"]
# plot loss progress
plt.title("Train-Val Loss")
plt.plot(range(1, num_epochs+1), loss_hist["train"], label="train")
plt.plot(range(1, num_epochs+1), loss_hist["val"], label="val")
plt.ylabel("Loss")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()

# plot accuracy progress
plt.title("Train-Val Accuracy")
plt.plot(range(1, num_epochs+1), metric_hist["train"], label="train")
plt.plot(range(1, num_epochs+1), metric_hist["val"], label="val")
plt.ylabel("Accuracy")
plt.xlabel("Training Epochs")
plt.legend()
plt.grid()
plt.show()

#7. 已经确信代码都正确,设置sanity_check:False并且运行代码:
# define traing parameters and call train_val function
params_train={
	"num_epochs": 100,
	"optimizer": opt,
	"loss_func": loss_func,
	"train_dl": train_dl,
	"val_dl": val_dl,
	"sanity_check": False,
	"lr_scheduler": lr_scheduler,
	"path2weights": "./models/weights.pt",
	}
# train and validate the model
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train)

模型部署

定义模型,导入权重并且部署模型

  1. 首先,创建Net对象,导入模型权重
# 模型参数
params_model = {
	"input_shape": (3, 96, 96),
	"initial_filters": 8,
	"num_fc1": 100,
	"dropout_rate": 0.25,
	"num_classes": 2,
}
# 初始化模型
cnn_model = Net(params_model)
  1. 导入模型权重
# load state_dict into model
path2weights="./models/weights.pt"
# map_location="cpu" lets weights saved from a GPU run load on a machine without CUDA;
# the model is moved to the target device afterwards
cnn_model.load_state_dict(torch.load(path2weights, map_location="cpu"))
  1. 设置模型为eval模式
# set model in evaluation mode
cnn_model.eval()
  1. 模型移到GPU上
# move model to the cuda/gpu device when one is available, otherwise stay on the cpu.
# `device` is always defined, so the deploy_model calls that pass it also work on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model=cnn_model.to(device)
  1. 实现deploy_model函数
def deploy_model(model, dataset, device,num_classes=2,sanity_check=False):
	"""Run the model on every item of a dataset, one item at a time.

	Args:
		model: network to evaluate; it is moved to ``device``.
		dataset: indexable collection of ``(input_tensor, label)`` pairs.
		device: device the model and each input are moved to.
		num_classes: number of model outputs per item.
		sanity_check: when True, stop after the first item (remaining rows of
			the returned arrays stay zero).

	Returns:
		``(y_out, y_gt)``: a ``(len(dataset), num_classes)`` numpy array of
		model outputs and a uint8 numpy array of the labels. The mean
		per-item inference time in milliseconds is printed.
	"""
	# local import: `time` is used below but not imported elsewhere in the file
	import time

	len_data = len(dataset)
	# initialize output tensor on CPU: due to GPU memory limits
	y_out=torch.zeros(len_data, num_classes)
	# initialize ground truth on CPU: due to GPU memory limits
	y_gt=np.zeros((len_data), dtype="uint8")
	# move model to device
	model = model.to(device)

	elapsed_times=[]
	with torch.no_grad():
		for i in range(len_data):
			x,y=dataset[i]
			y_gt[i]=y
			# time only the forward pass (including the transfer of the input)
			start=time.time()
			# add a batch dimension of size 1 before calling the model
			y_out[i]=model(x.unsqueeze(0).to(device))
			elapsed=time.time()-start
			elapsed_times.append(elapsed)
			if sanity_check is True:
				break
	inference_time = np.mean(elapsed_times)*1000
	print("average inference time per image on %s:%.2f ms"%(device, inference_time))
	return y_out.numpy(), y_gt
  1. 在验证集上部署模型
y_out, y_gt = deploy_model(cnn_model, val_ds, device=device, sanity_check=False)
print(y_out.shape, y_gt.shape)

# average inference time per image on cuda:0: 0.74 ms
# (44005, 2) (44005,)
  1. 计算模型在验证集上的准确率
from sklearn.metrics import accuracy_score
# get predictions
y_pred = np.argmax(y_out, axis=1)
print(y_pred.shape, y_gt.shape)
# compute accuracy
acc = accuracy_score(y_pred, y_gt)
print("accuracy: %.2f" % acc)

# (44005,) (44005,)
# accuracy:0.91
  1. 计算在CPU上的推理时间
device_cpu = torch.device("cpu")
y_out,y_gt = deploy_model(cnn_model, val_ds, device=device_cpu, sanity_check=False)
print(y_out.shape, y_gt.shape)

# average inference time per image on cpu: 2.21ms
# (44005, 2) (44005,)

在测试集上进行模型推理

# 1. 首先,导入test_labels.csv并打印头部
path2csv = "./data/test_labels.csv"
labels_df = pd.read_csv(path2csv)
labels_df.head()

# 2.创建测试集的datset对象
# the dataset class is named histCancerDataset and the data folder variable is data_dir
histo_test = histCancerDataset(data_dir, val_transformer, data_type="test")
print(len(histo_test))
# 57458

#3.在测试集上进行模型推理
y_test_out, _ = deploy_model(cnn_model, histo_test,device,sanity_check=False)
# average inference time per image on cuda:0: 0.74 ms

y_test_pred=np.argmax(y_test_out, axis=1)
print(y_test_pred.shape)
# (57458,)
#4. 显示预测结果
# Show a few randomly chosen test images together with their predicted labels.
grid_size=4
rnd_inds = np.random.randint(0, len(histo_test), grid_size)
print("image indices:", rnd_inds)

# index with the sampled positions so the displayed images match the printed indices
x_grid_test = [histo_test[i][0] for i in rnd_inds]
y_grid_test = [y_test_pred[i] for i in rnd_inds]

# tile the images into a single image tensor, 4 per row with 2 pixels of padding
x_grid_test = utils.make_grid(x_grid_test, nrow=4, padding=2)
print(x_grid_test.shape)

plt.rcParams["figure.figsize"]=(10.0, 5)
show(x_grid_test, y_grid_test)

# image indices: [2732 43567 43567 12346]
# torch.Size([3, 100, 394])

创建一个提交文件

print(y_test_out.shape)
cancer_preds = np.exp(y_test_out[:,1])
print(cancer_preds.shape)

# (57458, 2)
# (57458,)

# 将概率形式的预测结果转换为DataFrame格式并且存储在CSV文件中
# Write the predicted probabilities to a CSV file, in the row order of the sample submission.
path2sampleSub = "./data/" + "sample_submission.csv"
sample_df = pd.read_csv(path2sampleSub)
ids_list = list(sample_df.id)
pred_list = [p for p in cancer_preds]
# The dataset stores full paths in `full_filenames` (it has no `filenames` attribute);
# the image id is the file name without its extension.
test_ids = [os.path.splitext(os.path.basename(path))[0] for path in histo_test.full_filenames]
pred_dict = dict(zip(test_ids, pred_list))
# reorder the predictions to follow the ids of the sample submission
pred_list_sub=[pred_dict[id_] for id_ in ids_list]
submission_df = pd.DataFrame({"id":ids_list, "label":pred_list_sub})
if not os.path.exists("./submissions/"):
	os.makedirs("submissions/")
	print("submission folder created!")
path2submission="./submissions/submission.csv"
submission_df.to_csv(path2submission, header=True, index=False)
submission_df.head()

你可以将CSV文件提交到Histopathologic Cancer Detection competition(已结束)

  • 25
    点赞
  • 93
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值