TensorBoard is mainly used to record the various quantities produced during training; it can surface essentially every detail of a training run, which is a great help when improving a model.
from datetime import datetime
from tensorboardX import SummaryWriter

# TIMESTAMP creates a fresh subdirectory, so results from runs started
# at different times are stored separately
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
tb_writer = SummaryWriter(logdir='./runs/' + TIMESTAMP)
The main methods are the following:
1. add_text
add_text(
    tag: str,
    text_string: str,
    global_step: Optional[int] = None,
    walltime: Optional[float] = None)
This method is presumably aimed at text-processing tasks, but it is also convenient for recording the hyperparameters used in the current run.
tb_writer.add_text(tag="hyperparameters", text_string=str(args), global_step=0)
2. add_scalar
add_scalar(
    self,
    tag: str,
    scalar_value: Union[float, numpy_compatible],
    global_step: Optional[int] = None,
    walltime: Optional[float] = None,
    display_name: Optional[str] = "",
    summary_description: Optional[str] = "")
This is the method you will use most; it records how scalar values (loss, learning rate, accuracy, and so on) change over time.
tb_writer.add_scalar(tag="loss", scalar_value=total_loss/(i+1),global_step=epoch)
tb_writer.add_scalar(tag="lr", scalar_value=current_lr,global_step=epoch)
3. add_image
add_image(
    self,
    tag: str,
    img_tensor: numpy_compatible,
    global_step: Optional[int] = None,
    walltime: Optional[float] = None,
    dataformats: Optional[str] = 'CHW')
img_tensor: an array of shape [channel, height, width], of dtype uint8 or float. Element values should lie in [0, 1] for float data or [0, 255] for uint8.
This method is very flexible in what it can display: convolution kernels, intermediate feature maps, or the model's prediction on a test image at every epoch. It can be combined with torchvision.utils.make_grid() or with plt.figure(), and you can even save a Grad-CAM heatmap of each prediction to watch how the model's attention shifts from epoch to epoch. Of the two, plt.figure() is the more flexible.
grid = make_grid(img, normalize=True, scale_each=True, nrow=4)
tb_writer.add_image(tag="val original images", img_tensor=grid)
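As a sketch of the plt.figure() route: tensorboardX also provides add_figure(), which renders a matplotlib figure into the image dashboard. The raw_img and pred names below are assumptions standing in for an un-normalized image batch and the model's predicted classes, such as those produced in the evaluation loop of the full example:
import matplotlib.pyplot as plt

# raw_img: assumed un-normalized image batch of shape [N, C, H, W]
# pred: assumed tensor of predicted class indices for that batch
fig = plt.figure(figsize=(8, 8))
for idx in range(4):
    ax = fig.add_subplot(2, 2, idx + 1)
    ax.imshow(raw_img[idx].permute(1, 2, 0).cpu().numpy())  # CHW -> HWC for matplotlib
    ax.set_title(f"pred: {pred[idx].item()}")
    ax.axis("off")
tb_writer.add_figure(tag="val predictions", figure=fig, global_step=epoch)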
4. add_histogram
add_histogram(
    self,
    tag: str,
    values: numpy_compatible,
    global_step: Optional[int] = None,
    bins: Optional[str] = 'tensorflow',
    walltime: Optional[float] = None,
    max_bins=None)
Typically used to inspect histograms of the weights and catch potential problems during training.
tb_writer.add_histogram(tag="conv1",
                        values=model.conv1.weight,
                        global_step=epoch)
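To log every layer rather than just conv1, one can iterate over the model's named parameters. A short sketch, assuming the model variable from the full example below:
# one histogram per parameter tensor, keyed by the parameter's name
for name, param in model.named_parameters():
    tb_writer.add_histogram(tag=name, values=param, global_step=epoch)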
5. add_graph
add_graph(
    self,
    model,
    input_to_model=None,
    verbose=False)
To use it you have to construct a dummy input, and it requires the tensorboard package to be installed; the other methods above do not depend on tensorboard.
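A minimal sketch of the call. The dummy input only fixes the tensor shapes that get traced, so its values do not matter; the 1x3x28x28 shape is an assumption matching the crop size used in the full example:
import torch
from torchvision import models

model = models.resnet18(num_classes=10)
dummy_input = torch.zeros(1, 3, 28, 28)  # shape must match what the model expects
tb_writer.add_graph(model, input_to_model=dummy_input)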
The complete code is as follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch.optim as optim
import torch.nn as nn
import math
import torch
import argparse
import torchvision.transforms.functional as F
from tensorboardX import SummaryWriter
from torchvision import datasets, transforms, models
from torchvision.utils import make_grid
from torch.utils.data import DataLoader
from datetime import datetime
from tqdm import tqdm
parser = argparse.ArgumentParser(description='Tensorboard Tutorial Demo Code')
parser.add_argument('-batch_size', '-b', type=int, help='batch size', default=32)
parser.add_argument('-cuda', '-g', type=int, help='cuda id', default=0)
parser.add_argument('-Epoch', '-e', type=int, default=5)
# learning rate
parser.add_argument('-lambda_lr', '-llr',type=str, default='cos_lr')
parser.add_argument('-learning_rate', '-lr', type=float, help='learning rate', default=1e-4)
parser.add_argument('-warm_up_epochs', '-w', type=int, help='warm up epoch for Cosine Schedule', default=1)
parser.add_argument('-weight_decay', '-wd', type=float, default=4e-5,
                    help='weight decay for SGD')
# dataset
parser.add_argument('-dataset_name', '-data', type=str, default='cifar10')
parser.add_argument('-img_size', '-is', type=int, default=32)
parser.add_argument('-crop_size', '-cs', type=int, default=28)
args = parser.parse_args()
transform = transforms.Compose(
    [transforms.Resize([args.img_size, args.img_size]),
     transforms.RandomResizedCrop([args.crop_size, args.crop_size]),
     transforms.RandomHorizontalFlip(),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
     ])
CIFAR10 = datasets.CIFAR10(root='./cifar10', download=True, transform=transform)
data_loader = DataLoader(CIFAR10, batch_size=args.batch_size, shuffle=True, num_workers=4)
model = models.resnet18(num_classes=10).cuda()
optimizer = optim.SGD(model.parameters(), lr=args.learning_rate,
                      momentum=0.9,  # assumed value; a common SGD default
                      weight_decay=args.weight_decay)
warm_up_epochs = args.warm_up_epochs
# linear warm-up for the first epochs, then cosine annealing to zero
warm_up_with_cosine_lr = lambda epoch: (epoch + 1) / warm_up_epochs if epoch < warm_up_epochs \
    else 0.5 * (math.cos((epoch - warm_up_epochs) / (args.Epoch - warm_up_epochs) * math.pi) + 1)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warm_up_with_cosine_lr)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1).cuda()
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
tb_writer = SummaryWriter(logdir='./runs/'+TIMESTAMP)
tb_writer.add_text(tag="super paramters", text_string=str(args),global_step=0)
# train
for epoch in range(args.Epoch):
    total_loss = 0
    length = len(data_loader)
    with tqdm(total=length, mininterval=0.3) as pbar:
        for i, (img, label) in enumerate(data_loader):
            img, label = img.cuda(), label.cuda()
            output = model(img)
            loss = criterion(output, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            current_lr = optimizer.state_dict()['param_groups'][0]['lr']
            pbar.set_description(f'epoch:{epoch+1}/{args.Epoch}, iter:{i + 1}/{length}')
            pbar.set_postfix(**{'avg_loss': total_loss / (i + 1),
                                'lr': current_lr})
            pbar.update(1)
    # tensorboard log
    tb_writer.add_scalar(tag="loss", scalar_value=total_loss / (i + 1), global_step=epoch)
    tb_writer.add_scalar(tag="lr", scalar_value=current_lr, global_step=epoch)
    tb_writer.add_histogram(tag="conv1",
                            values=model.conv1.weight,
                            global_step=epoch)
    lr_scheduler.step()
# evaluate
transform_val = transforms.Compose(
    [transforms.Resize([args.img_size, args.img_size]),
     transforms.CenterCrop([args.crop_size, args.crop_size]),
     transforms.ToTensor(),
     # Normalize is left out here so the images logged below keep their
     # natural colors; it is applied manually after add_image
     ])
CIFAR10_val = datasets.CIFAR10(root='./cifar10', train=False, download=True, transform=transform_val)
data_loader_val = DataLoader(CIFAR10_val, batch_size=8, shuffle=False, num_workers=4)
with torch.no_grad():
    model.eval()
    for img, label in data_loader_val:
        # add the original (un-normalized) images into tensorboard
        grid = make_grid(img, normalize=True, scale_each=True, nrow=4)
        tb_writer.add_image(tag="val original images", img_tensor=grid)
        # normalize after logging so the logged images stay viewable
        img = F.normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        img, label = img.cuda(), label.cuda()
        output = model(img)
        pred = torch.max(output, 1)[1]  # predicted class index per image
        break
tb_writer.close()
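Once the run has produced logs, they can be viewed by starting TensorBoard with tensorboard --logdir=./runs and opening the printed URL in a browser.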