pytorch学习笔记

最新推荐文章于 2022-09-27 03:02:35 发布

Jany的

最新推荐文章于 2022-09-27 03:02:35 发布

阅读量260

点赞数

文章标签： pytorch 深度学习神经网络

本文链接：https://blog.csdn.net/Aphrodte/article/details/120157876

版权

pytorch手册

模型的保存与加载

import shutil

import torch

# Save everything needed to resume training into checkpoint.pth.tar.
# `state` is a user-defined dict, so extra entries can be stored alongside the
# weights and reused later.
state = {
    'epoch': epoch + 1,                    # epoch to resume from
    'state_dict': mymodel.state_dict(),    # trained model parameters
    'optimizer': optimizer.state_dict(),   # optimizer state, needed to resume
    'best_pred': best_pred,                # best accuracy reached so far
    # ... add any other entries you need here
}

# Write the checkpoint to checkpoint.pth.tar
filename = 'checkpoint.pth.tar'
torch.save(state, filename)
# If this is the best model so far, keep a separate copy of it.
# NOTE: `directory` is expected to be defined by the surrounding training
# script and to end with a path separator.
if is_best:
    shutil.copyfile(filename, directory + 'model_best.pth.tar')
###########################################################################
# map_location selects the device the stored tensors are loaded onto.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load from the same location the best checkpoint was copied to above.
checkpoint = torch.load(directory + 'model_best.pth.tar', map_location=device)
model.load_state_dict(checkpoint['state_dict'])      # model parameters live under 'state_dict'
optimizer.load_state_dict(checkpoint['optimizer'])   # optimizer state lives under 'optimizer'
epoch = checkpoint['epoch']                          # can be used to restore the lr schedule, etc.

# With the state restored, training can simply continue; interrupting the
# program no longer means starting from scratch.
# train / eval
# ...
# ...

参数加载常出现的问题

# After loading an optimizer, the tensors inside its state are often not on
# the training device, which leads to a runtime error.
# Workaround: move every tensor found in the optimizer state to the GPU.
optimizer.load_state_dict(checkpoint['optimizer'])
for state in optimizer.state.values():
    gpu_tensors = {
        name: value.cuda()
        for name, value in state.items()
        if torch.is_tensor(value)
    }
    # Only existing keys are overwritten; non-tensor entries stay untouched.
    state.update(gpu_tensors)

学习率的调整

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision.models import AlexNet
import matplotlib.pyplot as plt

model = AlexNet(num_classes=2)
optimizer = optim.SGD(params=model.parameters(), lr=0.05)

# lr_scheduler.StepLR()
# Assuming optimizer uses lr = 0.05 for all groups
# lr = 0.05     if epoch < 30
# lr = 0.005    if 30 <= epoch < 60
# lr = 0.0005   if 60 <= epoch < 90
#########################################
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
plt.figure()
x = list(range(100))
y = []
for epoch in range(100):
    # get_last_lr() returns the lr actually in effect for this epoch.
    # get_lr() is only meant to be called from inside step(); calling it
    # directly applies gamma a second time at every decay boundary.
    current_lr = scheduler.get_last_lr()[0]
    print(epoch, current_lr)
    y.append(current_lr)
    # The optimizer must step before the scheduler, otherwise PyTorch warns
    # and the first value of the schedule is skipped.
    optimizer.step()
    scheduler.step()

plt.plot(x, y)
plt.show()
#########################################
# Start from a fresh optimizer: the previous one still carries the learning
# rate left behind by the StepLR run above, so a scheduler attached to it
# would start its curve from that decayed value instead of 0.05.
optimizer = optim.SGD(params=model.parameters(), lr=0.05)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
plt.figure()
y.clear()
for epoch in range(100):
    # Read the lr in effect for this epoch via get_last_lr(); get_lr() is an
    # internal helper and is not reliable when called outside step().
    current_lr = scheduler.get_last_lr()[0]
    print(epoch, 'lr={:.6f}'.format(current_lr))
    y.append(current_lr)
    optimizer.step()   # optimizer first, then scheduler
    scheduler.step()

plt.plot(x, y)
plt.show()
#######################################
import math


def lf(x):
    """Return the cosine lr multiplier for epoch index ``x``.

    The multiplier goes from 1 at ``x == 0`` down to ``args.lrf`` at
    ``x == args.epochs`` along half a cosine period.  ``args`` is expected to
    be provided by the surrounding script and to expose ``epochs`` (total
    number of epochs) and ``lrf`` (final lr as a fraction of the initial lr).
    """
    return ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine


# LambdaLR multiplies the optimizer's initial lr by lf(epoch).
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

余弦退火曲线

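$\eta_{t}=\eta_{\min}+\frac{1}{2}\left(\eta_{\max}-\eta_{\min}\right)\left(1+\cos\left(\frac{T_{cur}}{T_{\max}}\pi\right)\right)$

递推形式：

$\eta_{t+1}=\eta_{\min}+\left(\eta_{t}-\eta_{\min}\right)\frac{1+\cos\left(\frac{T_{cur}+1}{T_{\max}}\pi\right)}{1+\cos\left(\frac{T_{cur}}{T_{\max}}\pi\right)},\quad T_{cur}\neq(2k+1)T_{\max}$

$\eta_{t+1}=\eta_{t}+\frac{1}{2}\left(\eta_{\max}-\eta_{\min}\right)\left(1-\cos\left(\frac{1}{T_{\max}}\pi\right)\right),\quad T_{cur}=(2k+1)T_{\max}$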

参数	含义
T_max	余弦周期的一半所对应的迭代次数，即学习率从初始值下降到 eta_min 需要 T_max 个 epoch。
eta_min	最小学习率，即在一个周期中，学习率最小会下降到 eta_min，默认值为 0。
last_epoch	最后一个EPOCH 默认-1，可不设置

多GPU训练

请添加图片描述

图1: 多GPU训练可分为模型并行训练和数据并行训练

同步Batch Normalization

请添加图片描述

同步Batch Normalization可以提升模型的精度，但是会降低并行速度

单机多卡DataParallel

############################################################################################################
import os

import torch
import torch.nn as nn

# os.environ['CUDA_VISIBLE_DEVICES'] restricts which GPUs the process can see.
# It must be set before anything touches CUDA (including the torch.cuda.*
# queries below), so it is normally done at the very start of the program.
# Example: use only GPUs number 0 and 3.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,3'  # if unset, all GPUs on the machine are used
############################################################################################################
# Check whether a GPU is available
print(torch.cuda.is_available())
# Number of visible GPUs
print(torch.cuda.device_count())
# Name of GPU 0
print(torch.cuda.get_device_name(0))
# Index of the currently selected device
print(torch.cuda.current_device())
############################################################################################################
model = nn.DataParallel(model)
model = model.cuda()		# move the model to GPU memory
inputs = inputs.cuda()		# move the data to GPU memory
labels = labels.cuda()
# The above is how DataParallel is used. DataParallel only works on a single
# machine; it is single-process, multi-threaded.
# Next comes DistributedDataParallel, which is multi-process, can span several
# machines, and is faster than DataParallel.
############################################################################################################
import os
import re
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
# 1. Read the process layout from the SLURM environment
slurm_env = os.environ
rank = int(slurm_env['SLURM_PROCID'])
world_size = int(slurm_env['SLURM_NTASKS'])
local_rank = int(slurm_env['SLURM_LOCALID'])
node_list = str(slurm_env['SLURM_NODELIST'])
# Derive the master IP from the node list.
# NOTE(review): this takes the 2nd to 5th numeric groups of SLURM_NODELIST as
# the four octets, which presumably matches one specific cluster's node naming
# scheme -- confirm for your cluster.
node_parts = re.findall('[0-9]+', node_list)
octets = [node_parts[1], node_parts[2], node_parts[3], node_parts[4]]
host_ip = '.'.join(octets)
# The port must not already be in use
port = "23456"
# Use the TCP initialisation method
init_method = 'tcp://' + host_ip + ':' + port
# Initialise the process group, i.e. the communication environment
dist.init_process_group(
    backend="nccl",
    init_method=init_method,
    world_size=world_size,
    rank=rank,
)
# Select the device this process uses on its node
torch.cuda.set_device(local_rank)
model = model.cuda()
# Wrap the model; device_ids names the GPU of the current process
model = DDP(model, device_ids=[local_rank])
input = input.cuda()
output = model(input)
# From here on the training loop is the same as for an ordinary model

模型参数初始化

from torch.nn import functional as F, init
 
   def init_params(self):
        """Initialise the parameters of all Conv2d, BatchNorm2d and Linear sub-modules.

        * ``nn.Conv2d``: Kaiming-normal weights (``mode='fan_out'``), zero bias.
        * ``nn.BatchNorm2d``: weight set to 1, bias set to 0.
        * ``nn.Linear``: normal weights with ``std=0.001``, zero bias.

        Parameters that are ``None`` (e.g. ``bias=False`` layers or
        ``BatchNorm2d(affine=False)``) are skipped instead of raising.
        Other module types are left untouched.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                # With affine=False the layer has no learnable weight/bias.
                if m.weight is not None:
                    init.constant_(m.weight, 1)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

hook机制

# PyTorch discards the intermediate results of the computation graph, so hook
# functions are needed to get at those values.
# Hooks let you read or modify the value and gradient of a variable in the
# middle of a network without changing the network structure.

# .register_hook()
# Hooks on torch.Tensor
#
# Tensor.requires_grad_() marks a tensor as requiring gradients.
# Before .backward() has run, its .grad is None.
# After .backward(), only leaf tensors keep their .grad; the gradient of an
# intermediate tensor is dropped unless .retain_grad() was called on it.
import torch
import torch.nn as nn

x = torch.Tensor([1, 2]).requires_grad_()
y = torch.Tensor([3, 4]).requires_grad_()
z = ((y-x) ** 2).mean()
# z.retain_grad()

# A hook function receives the gradient of the tensor it is attached to.
# It can display the gradient and/or store it in a variable.
saved_grads = []


def hook_fn(grad):
    print(grad)
    saved_grads.append(grad)


# The hook has to be registered BEFORE backward() runs, otherwise it is
# never called.
z.register_hook(hook_fn)
z.backward()
print(f"x.requires_grad: {x.requires_grad}")
print(f"y.requires_grad: {y.requires_grad}")
print(f"z.requires_grad: {z.requires_grad}\n")

print(f"x.grad: {x.grad}")
print(f"y.grad: {y.grad}")
# z is not a leaf tensor, so z.grad is None here (PyTorch also emits a warning).
print(f"z.grad: {z.grad}")

##########################################
#对torch.nn.Module的Hook，包括register_forward_hook和register_backward_hook
#PyTorch中使用register_forward_hook和register_backward_hook获取Module输入和输出的feature_map和grad
import torch as t
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    """Small LeNet-style network: two 3x3 convolutions followed by three linear layers.

    Both convolutions use ``padding=1``, so the spatial size is preserved.
    ``fc1`` takes ``16 * 4 * 4`` features, which corresponds to a
    single-channel 4x4 input image.  The network outputs 10 values per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the raw ``fc3`` outputs."""
        feature_map = F.relu(self.conv2(F.relu(self.conv1(x))))
        # Flatten everything except the batch dimension.
        batch_size = feature_map.size(0)
        flattened = feature_map.view(batch_size, -1)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flattened))))
        return self.fc3(hidden)
# Build the network and feed one all-ones image of shape (1, 1, 4, 4) through it.
net = LeNet()
flat_ones = t.ones(4 * 4 * 1)
img = t.Tensor(flat_ones.view(1, 1, 4, 4))
net(img)


def hook(module, inputdata, output):
    """Forward hook: print the module's input and the data of its output.

    Returns None, so the module's output is left unchanged.
    """
    print(inputdata)
    produced = output.data
    print(produced)
def hook_backward_fn(module, grad_input, grad_output):
    """Backward hook: print the module, then grad_output, then grad_input.

    A line of 20 asterisks is printed afterwards as a separator.
    Returns None, so the gradients are left unchanged.
    """
    separator = "*" * 20
    print("module: {}".format(module))
    print("grad_output: {}".format(grad_output))
    print("grad_input: {}".format(grad_input))
    print(separator)
"""
register_forward_hook使用方式为module.register_forward_hook(hook_fn)，其中hook_fn的声明为
hook_fn(module, input, output) -> Tensor or None

register_backward_hook使用方式为module.register_backward_hook(hook_fn)，其中hook_fn的声明为
hook_fn(module, grad_input, grad_output) -> Tensor or None
"""
# Attach the forward hook to conv1; it fires on every forward pass of conv1.
# The returned handle can later be used to detach it via handle.remove().
handle = net.conv1.register_forward_hook(hook)
# register_backward_hook is deprecated; register_full_backward_hook is its
# replacement and takes a hook with the same signature,
# hook_fn(module, grad_input, grad_output) -> tuple of Tensors or None.
# NOTE: conv1 is the first layer and img does not require grad, so the
# grad_input passed to the hook is expected to contain None.
handle2 = net.conv1.register_full_backward_hook(hook_backward_fn)
# The forward pass triggers `hook`; the backward pass triggers `hook_backward_fn`.
y = net(img).mean()
y.backward()

Jany的

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
pytorch学习笔记

pytorch手册模型的保存与加载#保存模型到checkpoint.pth.tar,这种方式保存模型的所有信息，state是个自定义的字典#保存模型的状态，可以设置一些参数，后续可以使用state = {'epoch': epoch + 1,#保存的当前轮数 'state_dict': mymodel.state_dict(),#训练好的参数 'optimizer': optimizer.state_dict(),#优化器参数,为了后续的resume
复制链接

扫一扫