Dynamically Adjusting the Learning Rate in PyTorch
- Overview of lr_scheduler
- I. Adjusting the learning rate based on the number of training epochs
- 1. torch.optim.lr_scheduler.LambdaLR (multiplies the initial learning rate by a factor; the decay is relatively slow)
- 2. torch.optim.lr_scheduler.StepLR (the epoch drives an exponent; the decay is moderate, and faster than LambdaLR because of the exponent)
- 3. torch.optim.lr_scheduler.MultiStepLR (the epoch also drives an exponent, via the bisect_right function; the decay speed is similar to StepLR)
- 4. torch.optim.lr_scheduler.ExponentialLR (a leading factor combined with an exponent)
- II. Adjusting the learning rate based on a monitored metric
- III. A custom learning rate function
PyTorch can adjust the learning rate dynamically during training. A gradual warm-up scheduler is shown below:
import torch
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.optim.sgd import SGD
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau


class GradualWarmupScheduler(_LRScheduler):
    """ Gradually warm-up(increasing) learning rate in optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. if multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
        total_epoch: target learning rate is reached at total_epoch, gradually
        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier < 1.:
            raise ValueError('multiplier should be greater than or equal to 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super(GradualWarmupScheduler, self).__init__(optimizer)

    def get_lr(self):
        # After the warm-up phase, hand control over to the wrapped scheduler (if any).
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_last_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        # During warm-up, ramp linearly up to base_lr (multiplier == 1.0) or base_lr * multiplier.
        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)


if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    # scheduler_warmup is chained with scheduler_steplr
    scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
    scheduler_warmup = GradualWarmupScheduler(optim, multiplier=1, total_epoch=5, after_scheduler=scheduler_steplr)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    for epoch in range(1, 20):
        scheduler_warmup.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])

        optim.step()    # backward pass (update network)
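With multiplier=1 and total_epoch=5, the wrapper ramps the learning rate linearly from 0 up to the base value of 0.1 over the first five epochs, after which the wrapped StepLR takes over. Newer PyTorch releases can express the same warm-up-then-decay pattern with built-in schedulers; the following is only a minimal sketch, assuming LinearLR and SequentialLR are available (PyTorch >= 1.10), and is not part of the GradualWarmupScheduler recipe above:

import torch
from torch.optim.sgd import SGD
from torch.optim.lr_scheduler import LinearLR, StepLR, SequentialLR

params = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optim = SGD(params, 0.1)

# Ramp from 0.1 * 0.2 up to 0.1 over 5 epochs (LinearLR requires start_factor > 0),
# then decay by 10x every 10 epochs.
warmup = LinearLR(optim, start_factor=0.2, end_factor=1.0, total_iters=5)
decay = StepLR(optim, step_size=10, gamma=0.1)
scheduler = SequentialLR(optim, schedulers=[warmup, decay], milestones=[5])

for epoch in range(20):
    optim.step()                        # placeholder for the real training step
    scheduler.step()
    print(epoch, optim.param_groups[0]['lr'])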
Overview of learning rate scheduling (lr_scheduler) in torch
I. Adjusting the learning rate based on the number of training epochs
1. torch.optim.lr_scheduler.LambdaLR (multiplies the initial learning rate by a factor; the decay is relatively slow)
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR

initial_lr = 0.1

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

net_1 = model()
optimizer_1 = torch.optim.Adam(net_1.parameters(), lr=initial_lr)
scheduler_1 = LambdaLR(optimizer_1, lr_lambda=lambda epoch: 1 / (epoch + 1))

print("Initial learning rate:", optimizer_1.defaults['lr'])

for epoch in range(1, 11):
    # train
    optimizer_1.zero_grad()
    optimizer_1.step()

    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step()
The learning rate is updated by calling scheduler_1.step() once per epoch. Note that scheduler_1 is constructed from the optimizer optimizer_1 together with the lr_lambda function that describes how the learning rate scales with the epoch.
Let's look more closely at the corresponding update formula:

new\_lr = \lambda * initial\_lr

where new_lr is the newly obtained learning rate, \lambda is the factor returned by lr_lambda for the current epoch, and initial_lr is the initial learning rate.
Initial learning rate: 0.1
Learning rate at epoch 1: 0.100000
Learning rate at epoch 2: 0.050000
Learning rate at epoch 3: 0.033333
Learning rate at epoch 4: 0.025000
Learning rate at epoch 5: 0.020000
Learning rate at epoch 6: 0.016667
Learning rate at epoch 7: 0.014286
Learning rate at epoch 8: 0.012500
Learning rate at epoch 9: 0.011111
Learning rate at epoch 10: 0.010000
This kind of epoch-dependent schedule is fairly recommendable, because the learning rate does not drop too quickly: the value printed at epoch k is 0.1/k, so even after 10 epochs the learning rate is still 0.01.
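As a side note, lr_lambda can also be a list with one function per parameter group, so different parts of the network can follow different schedules. A minimal sketch (the two-group split below is invented purely for illustration):

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR

# Two hypothetical parameter groups with separate schedules.
backbone = nn.Linear(4, 4)
head = nn.Linear(4, 2)
optimizer = torch.optim.Adam([
    {'params': backbone.parameters(), 'lr': 0.01},
    {'params': head.parameters(), 'lr': 0.1},
])
# One lambda per group: keep the backbone lr constant, decay the head lr as 1/(epoch+1).
scheduler = LambdaLR(optimizer, lr_lambda=[lambda epoch: 1.0, lambda epoch: 1 / (epoch + 1)])

for epoch in range(1, 6):
    optimizer.step()
    scheduler.step()
    print(epoch, [g['lr'] for g in optimizer.param_groups])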
2. torch.optim.lr_scheduler.StepLR (the epoch drives an exponent; the decay is moderate, and faster than LambdaLR above because of the exponent)
class torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1)
The learning rate is updated once every step_size epochs:
new\_lr = initial\_lr * \gamma^{epoch // step\_size}
Once the corresponding parameters are specified, the learning rate is updated automatically.
Parameters:
optimizer (Optimizer): the optimizer whose learning rate will be changed;
step_size (int): the learning rate is updated once every step_size training epochs;
gamma (float): the multiplicative factor applied to the lr;
last_epoch (int): the index of the last epoch. If training is resumed after an interruption, set this to the epoch of the loaded checkpoint; the default of -1 means training starts from scratch.
The corresponding code:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR

initial_lr = 0.1

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

net_1 = model()
optimizer_1 = torch.optim.Adam(net_1.parameters(), lr=initial_lr)
scheduler_1 = StepLR(optimizer_1, step_size=3, gamma=0.1)

print("Initial learning rate:", optimizer_1.defaults['lr'])

for epoch in range(1, 11):
    # train
    optimizer_1.zero_grad()
    optimizer_1.step()

    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step()
The printed learning rates:
Initial learning rate: 0.1
Learning rate at epoch 1: 0.100000
Learning rate at epoch 2: 0.100000
Learning rate at epoch 3: 0.100000
Learning rate at epoch 4: 0.010000
Learning rate at epoch 5: 0.010000
Learning rate at epoch 6: 0.010000
Learning rate at epoch 7: 0.001000
Learning rate at epoch 8: 0.001000
Learning rate at epoch 9: 0.001000
Learning rate at epoch 10: 0.000100
3. torch.optim.lr_scheduler.MultiStepLR (the epoch also drives an exponent, via the bisect_right function; the decay speed is similar to StepLR)
new\_lr = initial\_lr * \gamma^{bisect\_right(milestones, epoch)}

Here bisect_right(milestones, epoch) is simply the number of milestones that are less than or equal to the current epoch; with milestones=[3, 7] it evaluates to 0 for epoch < 3, 1 for 3 <= epoch < 7, and 2 for epoch >= 7.
Parameters:
optimizer (Optimizer): the optimizer whose learning rate will be changed;
milestones (list): an increasing list of the epochs at which the lr should be updated;
gamma (float): the multiplicative factor applied to the lr;
last_epoch (int): the index of the last epoch. If training is resumed after an interruption, set this to the epoch of the loaded checkpoint; the default of -1 means training starts from scratch.
The corresponding code:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import MultiStepLR

initial_lr = 0.1

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

net_1 = model()
optimizer_1 = torch.optim.Adam(net_1.parameters(), lr=initial_lr)
scheduler_1 = MultiStepLR(optimizer_1, milestones=[3, 7], gamma=0.1)

print("Initial learning rate:", optimizer_1.defaults['lr'])

for epoch in range(1, 11):
    # train
    optimizer_1.zero_grad()
    optimizer_1.step()

    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step()
Initial learning rate: 0.1
Learning rate at epoch 1: 0.100000
Learning rate at epoch 2: 0.100000
Learning rate at epoch 3: 0.100000
Learning rate at epoch 4: 0.010000
Learning rate at epoch 5: 0.010000
Learning rate at epoch 6: 0.010000
Learning rate at epoch 7: 0.010000
Learning rate at epoch 8: 0.001000
Learning rate at epoch 9: 0.001000
Learning rate at epoch 10: 0.001000
4. torch.optim.lr_scheduler.ExponentialLR (a leading factor combined with an exponent)
The learning rate is decayed exponentially at every epoch:
new\_lr = initial\_lr * \gamma^{epoch}
The corresponding code:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ExponentialLR

initial_lr = 0.1

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

net_1 = model()
optimizer_1 = torch.optim.Adam(net_1.parameters(), lr=initial_lr)
scheduler_1 = ExponentialLR(optimizer_1, gamma=0.1)

print("Initial learning rate:", optimizer_1.defaults['lr'])

for epoch in range(1, 11):
    # train
    optimizer_1.zero_grad()
    optimizer_1.step()

    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step()
Here the learning rate drops extremely fast: gamma is raised to the power of the epoch, so with gamma=0.1 the lr shrinks by a factor of 10 every single epoch.
Initial learning rate: 0.1
Learning rate at epoch 1: 0.100000
Learning rate at epoch 2: 0.010000
Learning rate at epoch 3: 0.001000
Learning rate at epoch 4: 0.000100
Learning rate at epoch 5: 0.000010
Learning rate at epoch 6: 0.000001
Learning rate at epoch 7: 0.000000
Learning rate at epoch 8: 0.000000
Learning rate at epoch 9: 0.000000
Learning rate at epoch 10: 0.000000
II. Adjusting the learning rate based on a monitored metric
class torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
The only scheduler that does not update the lr based on the epoch count is torch.optim.lr_scheduler.ReduceLROnPlateau: once the monitored metric has stopped improving for patience epochs, the learning rate is multiplied by factor:
new\_lr = \lambda \times old\_lr
where new_lr is the newly obtained learning rate, old_lr is the learning rate used in the previous optimization step, and \lambda is the decay factor passed in through the factor argument.
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

initial_lr = 0.1

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass

net_1 = model()
optimizer_1 = torch.optim.Adam(net_1.parameters(), lr=initial_lr)
scheduler_1 = ReduceLROnPlateau(optimizer_1, mode='min', factor=0.1, patience=2)

print("Initial learning rate:", optimizer_1.defaults['lr'])

for epoch in range(1, 15):
    # train
    test = 2  # a dummy constant metric; in practice this would be the validation loss
    optimizer_1.zero_grad()
    optimizer_1.step()

    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step(test)  # pass the monitored metric to step(), not the epoch
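Because the metric above never improves after the first epoch, the scheduler keeps cutting the learning rate once every patience + 1 epochs. A slightly more realistic sketch follows; the validation losses are invented purely for illustration and the variable names are hypothetical:

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

params = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer_2 = torch.optim.Adam(params, lr=0.1)
scheduler_2 = ReduceLROnPlateau(optimizer_2, mode='min', factor=0.1, patience=2)

# Invented validation losses that improve for a few epochs and then plateau.
fake_val_losses = [1.0, 0.8, 0.6, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

for epoch, val_loss in enumerate(fake_val_losses, start=1):
    optimizer_2.zero_grad()
    optimizer_2.step()
    scheduler_2.step(val_loss)  # lr is cut by 10x once the loss stops improving for more than `patience` epochs
    print("Learning rate at epoch %d: %f" % (epoch, optimizer_2.param_groups[0]['lr']))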
III. A custom learning rate function
Any hand-written function of the epoch can also be used as the schedule, for example (a usage sketch follows the function):
def lr_lambda(epoch):
    # Custom schedule: a factor of 0.2 at epochs 6-7, 0.1 from epoch 8 onwards,
    # and 2 / (epoch + 1) before that.
    if 6 <= epoch <= 7:
        return 0.2
    elif epoch >= 8:
        return 0.1
    else:
        return 2 / (epoch + 1)
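A minimal usage sketch, plugging the function above into LambdaLR (the optimizer setup mirrors the earlier examples and is only illustrative):

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import LambdaLR

initial_lr = 0.1
params = [nn.Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer_1 = torch.optim.Adam(params, lr=initial_lr)
scheduler_1 = LambdaLR(optimizer_1, lr_lambda=lr_lambda)  # lr = initial_lr * lr_lambda(epoch)

for epoch in range(1, 11):
    optimizer_1.zero_grad()
    optimizer_1.step()
    print("Learning rate at epoch %d: %f" % (epoch, optimizer_1.param_groups[0]['lr']))
    scheduler_1.step()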