Code
import os
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import sys
hello_pytorch_DIR = os.path.abspath(os.path.dirname(__file__)+os.path.sep+".."+os.path.sep+"..")
sys.path.append(hello_pytorch_DIR)

from tools.common_tools import set_seed

set_seed(1)  # set the random seed

# ----------------------------------- 0 SummaryWriter -----------------------------------
flag = 0
# flag = 1
if flag:
    log_dir = "./train_log/test_log_dir"
    writer = SummaryWriter(log_dir=log_dir, comment='_scalars', filename_suffix="12345678")
    # writer = SummaryWriter(comment='_scalars', filename_suffix="12345678")

    for x in range(100):
        writer.add_scalar('y=pow_2_x', 2 ** x, x)
    writer.close()

# ----------------------------------- 1 scalar and scalars -----------------------------------
flag = 0
# flag = 1
if flag:
    max_epoch = 100
    writer = SummaryWriter(comment='test_comment', filename_suffix="test_suffix")

    for x in range(max_epoch):
        writer.add_scalar('y=2x', x * 2, x)
        writer.add_scalar('y=pow_2_x', 2 ** x, x)
        writer.add_scalars('data/scalar_group', {"xsinx": x * np.sin(x),
                                                 "xcosx": x * np.cos(x)}, x)
    writer.close()

# ----------------------------------- 2 histogram -----------------------------------
# flag = 0
flag = 1
if flag:
    writer = SummaryWriter(comment='test_comment', filename_suffix="test_suffix")

    for x in range(2):
        np.random.seed(x)

        data_union = np.arange(100)
        data_normal = np.random.normal(size=1000)

        writer.add_histogram('distribution union', data_union, x)
        writer.add_histogram('distribution normal', data_normal, x)

        plt.subplot(121).hist(data_union, label="union")
        plt.subplot(122).hist(data_normal, label="normal")
        plt.legend()
        plt.show()
    writer.close()
SummaryWriter
class SummaryWriter(object):
    def __init__(self, log_dir=None, comment='', purge_step=None, max_queue=10, flush_secs=120, filename_suffix='')
Purpose:
Provides a high-level interface for creating event files.
Main arguments:
- log_dir: output folder for the event files
- comment: suffix for the run folder name, used only when log_dir is not specified
- filename_suffix: suffix for the event file name
If we run the SummaryWriter part of the code above, we can see that the event file name ends with 12345678, i.e. the value of the filename_suffix argument. However, _scalars appears nowhere in the path: since we set log_dir, the comment argument has no effect. If we instead rerun with writer = SummaryWriter(comment='_scalars', filename_suffix="12345678"), we get the result in the second screenshot: a runs folder is created, and its run subfolder carries the _scalars suffix. In practice we usually use the log_dir option.
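To see this log_dir/comment interaction without digging through the filesystem, here is a minimal sketch (the log_dir path is just an example) that prints the resolved run folder via get_logdir(), which is part of the SummaryWriter API:

from torch.utils.tensorboard import SummaryWriter

w1 = SummaryWriter(log_dir="./train_log/test_log_dir", comment='_scalars')
print(w1.get_logdir())  # ./train_log/test_log_dir -> comment is ignored

w2 = SummaryWriter(comment='_scalars')
print(w2.get_logdir())  # runs/<datetime>_<hostname>_scalars

w1.close()
w2.close()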
add_scalar()
add_scalar(tag, scalar_value, global_step=None, walltime=None)
Purpose: record a scalar. Its limitation is that each tag tracks only a single curve.
- tag: label of the chart; its unique identifier
- scalar_value: the scalar value to record
- global_step: the x-axis value
add_scalars()
add_scalars(main_tag, tag_scalar_dict, global_step=None, walltime=None)
- main_tag: label of the chart, analogous to tag
- tag_scalar_dict: dict whose keys are the tags of the individual curves and whose values are the scalar values; this is what lets one call draw multiple curves (see the sketch below)
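One behavioral detail worth knowing: add_scalars creates a separate event-file writer per dict key, so each key gets its own subfolder on disk even though the curves overlay on a single chart in tensorboard. A minimal sketch (the log_dir path is just an example) to observe this:

import os
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./train_log/scalars_demo")
writer.add_scalars('group', {"a": 1.0, "b": 2.0}, 0)
writer.close()

# expect subfolders named like 'group_a' and 'group_b' next to the main event file
print(os.listdir("./train_log/scalars_demo"))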
Run the "1 scalar and scalars" part of the code above and open tensorboard. We can see how the two functions evolve.
Of the three buttons below each chart, the first two toggle an expanded view and a logarithmic y-axis, respectively.
For example, applying the first button to the data chart expands the curve to fill the view.
Applying the logarithmic axis to the power function 2 ** x gives the effect below: although the curve is displayed on a log scale, the recorded values themselves are unchanged.
add_histogram()
add_histogram(tag, values, global_step=None, bins='tensorflow', walltime=None)
Purpose: plot histograms and multi-quantile line charts of a set of values
- tag: label of the chart; its unique identifier
- values: the values to summarize
- global_step: the step index, drawn along the depth (y) axis of the histogram view
- bins: the binning rule for the histogram (see the sketch below)
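The bins argument need not stay at the default: besides 'tensorflow', numpy binning rules such as 'auto' or 'fd' are accepted. A minimal sketch (the log_dir path is just an example) comparing two rules on the same data:

import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./train_log/hist_demo")
data = np.random.normal(size=1000)

writer.add_histogram('normal_tf', data, 0, bins='tensorflow')  # default binning
writer.add_histogram('normal_auto', data, 0, bins='auto')      # numpy 'auto' rule
writer.close()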
Running the "2 histogram" part, we first get the matplotlib plots,
followed by the tensorboard results. Looking at the histograms tab first: in the first chart, global_step is shown along the rightmost (depth) axis, the x-axis holds the bins, and the height gives the count per bin.
Next, the distributions tab shows the multi-quantile line chart. Because the data drawn at every step come from the same distributions, these quantile lines stay essentially flat.
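To make the quantile idea concrete, here is a minimal sketch (illustrative only; the percentile levels below are chosen for the example, not taken from tensorboard's source) computing per-step percentiles with numpy, which is what the distributions tab plots over time:

import numpy as np

for step in range(2):
    np.random.seed(step)
    data = np.random.normal(size=1000)
    # one point per percentile level per step; plotted over steps these
    # give flat lines here, since the distribution never changes
    qs = np.percentile(data, [0, 16, 50, 84, 100])
    print(step, np.round(qs, 2))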
RMB Binary Classification Experiment
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from matplotlib import pyplot as plt
import sys
hello_pytorch_DIR = os.path.abspath(os.path.dirname(__file__)+os.path.sep+".."+os.path.sep+"..")
sys.path.append(hello_pytorch_DIR)

from model.lenet import LeNet
from tools.my_dataset import RMBDataset
from tools.common_tools import set_seed

set_seed()  # set the random seed
rmb_label = {"1": 0, "100": 1}

# hyper-parameters
MAX_EPOCH = 10
BATCH_SIZE = 16
LR = 0.01
log_interval = 10
val_interval = 1

# ============================ step 1/5 data ============================
split_dir = os.path.join("..", "..", "data", "rmb_split")
train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "valid")

norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomGrayscale(p=0.8),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# build the Dataset instances
train_data = RMBDataset(data_dir=train_dir, transform=train_transform)
valid_data = RMBDataset(data_dir=valid_dir, transform=valid_transform)

# build the DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

# ============================ step 2/5 model ============================
net = LeNet(classes=2)
net.initialize_weights()

# ============================ step 3/5 loss function ============================
criterion = nn.CrossEntropyLoss()  # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay policy

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()

iter_count = 0

# build the SummaryWriter
writer = SummaryWriter(comment='test_your_comment', filename_suffix="_test_your_filename_suffix")

for epoch in range(MAX_EPOCH):

    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):

        iter_count += 1

        # forward
        inputs, labels = data
        outputs = net(inputs)

        # backward
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()

        # update weights
        optimizer.step()

        # accumulate classification statistics
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).squeeze().sum().numpy()

        # print training information
        loss_mean += loss.item()
        train_curve.append(loss.item())
        if (i+1) % log_interval == 0:
            loss_mean = loss_mean / log_interval
            print("Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, i+1, len(train_loader), loss_mean, correct / total))
            loss_mean = 0.

        # log data to the event file
        writer.add_scalars("Loss", {"Train": loss.item()}, iter_count)
        writer.add_scalars("Accuracy", {"Train": correct / total}, iter_count)

    # once per epoch: log gradients and weights
    for name, param in net.named_parameters():
        writer.add_histogram(name + '_grad', param.grad, epoch)
        writer.add_histogram(name + '_data', param, epoch)

    scheduler.step()  # update the learning rate

    # validate the model
    if (epoch+1) % val_interval == 0:

        correct_val = 0.
        total_val = 0.
        loss_val = 0.
        net.eval()
        with torch.no_grad():
            for j, data in enumerate(valid_loader):
                inputs, labels = data
                outputs = net(inputs)
                loss = criterion(outputs, labels)

                _, predicted = torch.max(outputs.data, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).squeeze().sum().numpy()

                loss_val += loss.item()

            loss_val_mean = loss_val / len(valid_loader)  # average over validation batches
            valid_curve.append(loss_val_mean)
            print("Valid:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, j+1, len(valid_loader), loss_val_mean, correct_val / total_val))

            # log data to the event file
            writer.add_scalars("Loss", {"Valid": np.mean(valid_curve)}, iter_count)
            writer.add_scalars("Accuracy", {"Valid": correct_val / total_val}, iter_count)

train_x = range(len(train_curve))
train_y = train_curve

train_iters = len(train_loader)
valid_x = np.arange(1, len(valid_curve)+1) * train_iters*val_interval  # valid logs one point per epoch, so map its record points onto iterations
valid_y = valid_curve

plt.plot(train_x, train_y, label='Train')
plt.plot(valid_x, valid_y, label='Valid')

plt.legend(loc='upper right')
plt.ylabel('loss value')
plt.xlabel('Iteration')
plt.show()
Here we log loss and accuracy per iteration, using add_scalars to compare how the training and validation sets evolve.
The figure below shows the data and gradients of every learnable parameter in the network.
Taking the conv1 weights as an example: the data follow a normal distribution, while the gradients become almost zero at epochs 7, 8, and 9, because the loss is already very small by then and the gradient is the derivative of the loss.
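To confirm the near-zero conv1 gradients numerically, here is a minimal sketch (assuming it runs right after the training loop above, while param.grad still holds the last backward pass) that prints the statistics the histogram view summarizes:

for name, param in net.named_parameters():
    if param.grad is not None:
        print("{}: grad mean={:.3e}, std={:.3e}".format(
            name, param.grad.mean().item(), param.grad.std().item()))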