一、什么是优化器
pytorch的优化器: 管理并更新模型中可学习参数的值,使得模型输出更接近真实标签
- 导数: 函数在指定坐标轴上的变化率
- 方向导数: 指定方向上的变化率
- 梯度:一个向量,方向为方向导数取得最大值的方向
二、optimizer的属性
class Optimizer(object):
    """Excerpt of ``torch.optim.Optimizer`` showing the attributes set up in ``__init__``.

    This is an abridged illustration rather than the full upstream
    implementation; ``...`` marks code that was left out.
    """

    def __init__(self, params, defaults):
        """Store the hyper-parameters and create the empty bookkeeping containers.

        Args:
            params: iterable of the parameters the optimizer should manage.
            defaults: optimizer hyper-parameters, e.g. the learning rate.
        """
        self.defaults = defaults
        # Per-parameter cache (e.g. momentum buffers); unseen keys start as {}.
        self.state = defaultdict(dict)
        self.param_groups = []
        ...
        # Materialise the iterable first so the name exists before it is wrapped.
        param_groups = list(params)
        # Bare parameters are wrapped into one group dict under the 'params' key.
        param_groups = [{'params': param_groups}]
基本属性:
- defaults: 优化器超参数,如学习率
- state: 参数的缓存,如momentum的缓存
- param_groups:管理的参数组
- _step_count:记录更新次数,学习率调整中使用
三、optimizer的方法
3.1 zero_grad()
class Optimizer(object):
    """Excerpt of ``torch.optim.Optimizer`` showing how gradients are cleared."""

    def zero_grad(self):
        """Zero, in place, the gradient of every parameter in ``self.param_groups``.

        Parameters whose ``grad`` is ``None`` are skipped.
        """
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    p.grad.detach_()
                    p.grad.zero_()
功能: 清空所管理参数的梯度
pytorch特性:张量梯度不自动清零
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))  # hand-assigned gradient of all ones

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- zero_grad -----------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    print("weight before step:{}".format(weight.data))
    optimizer.step()  # change lr to 1 or 0.1 and compare the results
    print("weight after step:{}".format(weight.data))

    # Equal ids mean the optimizer holds a reference to `weight`, not a copy.
    print("weight in optimizer:{}\nweight in weight:{}\n".format(id(optimizer.param_groups[0]['params'][0]), id(weight)))

    print("weight.grad is {}\n".format(weight.grad))
    optimizer.zero_grad()
    # NOTE: recent PyTorch releases default zero_grad() to set_to_none=True,
    # in which case this prints None instead of a tensor of zeros.
    print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))
说明:
- 在优化器中保存的是参数的地址,根据地址寻找参数,减少内存消耗
- 通过zero_grad()方法后,就实现了参数梯度的清零
3.2 step()
功能:执行一步更新
详细说明:当我们计算得到了loss,然后反向传播计算各个参数的梯度后,就需要使用step()方法执行一步更新,更新参数,而更新的策略有很多,如随机梯度下降法,momentum等等
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))  # hand-assigned gradient of all ones

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- step -----------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    print("weight before step:{}".format(weight.data))
    optimizer.step()  # change lr to 1 or 0.1 and compare the results
    print("weight after step:{}".format(weight.data))
说明:这里的梯度是1,所以执行一步更新参数,即0.6614-0.1*1=0.5614,其中0.1是学习率
3.3 add_param_group()
class Optimizer(object):
    """Excerpt of ``torch.optim.Optimizer`` showing how a parameter group is added.

    Abridged illustration; ``...`` marks code that was left out.
    """

    def add_param_group(self, param_group):
        """Append ``param_group`` to the groups managed by this optimizer.

        Args:
            param_group: dict describing the group; its parameters are stored
                under the 'params' key.
        """
        # Collect every parameter that is already managed by an existing group.
        param_set = set()
        for group in self.param_groups:
            param_set.update(set(group['params']))
        ...  # elided in this excerpt: checks on `param_group` that use `param_set`
        self.param_groups.append(param_group)
功能:添加一组参数到优化器当中
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))  # hand-assigned gradient of all ones

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- add_param_group -----------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

    # Register a second tensor as its own group with a group-specific lr.
    w2 = torch.randn((3, 3), requires_grad=True)
    optimizer.add_param_group({"params": w2, 'lr': 0.0001})

    print("optimizer.param_groups is\n{}".format(optimizer.param_groups))
3.4 state_dict()
class Optimizer(object):
    """Excerpt of ``torch.optim.Optimizer``: exporting and restoring optimizer state.

    Abridged illustration; ``...`` marks code that was left out.
    """

    def state_dict(self):
        """Return the optimizer state as a dict with keys 'state' and 'param_groups'."""
        ...  # elided in this excerpt: builds `packed_state` and `param_groups`
        return {'state': packed_state, 'param_groups': param_groups,}

    def load_state_dict(self, state_dict):
        """Load a state dict into this optimizer."""
        ...  # body elided in this excerpt
功能: 获取优化器当前状态信息字典
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))  # hand-assigned gradient of all ones

optimizer = optim.SGD([weight], lr=0.1)

# ----------------------------------- state_dict -----------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    # Re-create the optimizer with momentum enabled for this demo.
    optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
    opt_state_dict = optimizer.state_dict()

    print("state_dict before step:\n", opt_state_dict)

    # weight.grad is never cleared here, so each step reuses the same gradient.
    for _ in range(10):
        optimizer.step()

    print("state_dict after step:\n", optimizer.state_dict())

    # Persist the state next to this script.
    torch.save(optimizer.state_dict(), os.path.join(BASE_DIR, "optimizer_state_dict.pkl"))
说明:当训练到某个阶段或时刻,通过获取优化器当前状态信息字典 ,然后将其保存下来,后面再使用的时候,就可以通过加载状态信息字典,来继续之前的训练
3.5 load_state_dict()
class Optimizer(object):
    """Excerpt of ``torch.optim.Optimizer``: exporting and restoring optimizer state.

    Abridged illustration; ``...`` marks code that was left out.
    """

    def state_dict(self):
        """Return the optimizer state as a dict with keys 'state' and 'param_groups'."""
        ...  # elided in this excerpt: builds `packed_state` and `param_groups`
        return {'state': packed_state, 'param_groups': param_groups,}

    def load_state_dict(self, state_dict):
        """Load a state dict into this optimizer."""
        ...  # body elided in this excerpt
功能 : 加载状态信息字典
说明:用于模型的续训练
# -*- coding:utf-8 -*-
import os
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
import torch
import torch.optim as optim
from tools.common_tools import set_seed
set_seed(1)  # set the random seed

weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))  # hand-assigned gradient of all ones

optimizer = optim.SGD([weight], lr=0.1)

# -----------------------------------load state_dict -----------------------------------
# Disabled by default: it reads "optimizer_state_dict.pkl" from BASE_DIR, so that
# file must already exist. Set flag to 1 to run it.
flag = 0
if flag:
    optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
    state_dict = torch.load(os.path.join(BASE_DIR, "optimizer_state_dict.pkl"))

    print("state_dict before load state:\n", optimizer.state_dict())
    optimizer.load_state_dict(state_dict)
    print("state_dict after load state:\n", optimizer.state_dict())
四、learning rate学习率
4.1 学习率的概念
学习率(learning rate): 控制更新的步伐
梯度下降:
$$w_{i+1} = w_i - g(w_i)$$
应用学习率后:
$$w_{i+1} = w_i - LR \cdot g(w_i)$$
4.2 学习率的设置演示
# -*- coding:utf-8 -*-
import torch
import numpy as np
import matplotlib.pyplot as plt
torch.manual_seed(1)
def func(x_t):
    """Evaluate y = (2x)^2 = 4*x^2 element-wise (derivative: dy/dx = 8x).

    Args:
        x_t: tensor of input values.

    Returns:
        Tensor holding ``(2 * x_t) ** 2``.
    """
    return torch.pow(2 * x_t, 2)
# init: starting point used by the gradient-descent section
x = torch.tensor([2.], requires_grad=True)

# ------------------------------ plot data ------------------------------
# Set flag to 1 to draw func on [-3, 3].
flag = 0
if flag:
    x_t = torch.linspace(-3, 3, 100)
    y = func(x_t)

    plt.plot(x_t.numpy(), y.numpy(), label="y = 4*x^2")
    plt.grid()
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend()
    plt.show()
# ------------------------------ gradient descent ------------------------------
# Set flag to 1 to run hand-written gradient descent on func, starting from x.
flag = 0
if flag:
    iter_rec, loss_rec, x_rec = [], [], []

    lr = 0.1  # other values to try: 1.0, 0.5, 0.2, 0.1, 0.125
    # NOTE(review): the original hint read "/1. 4 /.5 4 /.2 20 200" — presumably
    # iteration counts to pair with lr = 1.0, 0.5 and 0.2; confirm with the author.
    max_iteration = 4

    for i in range(max_iteration):
        y = func(x)
        y.backward()

        print("Iter:{}, X:{:8}, X.grad:{:8}, loss:{:10}".format(
            i, x.detach().numpy()[0], x.grad.detach().numpy()[0], y.item()))

        x_rec.append(x.item())

        # Update rule: x = x - lr * x.grad (other lr values to try: 0.5 0.2 0.1 0.125).
        x.data.sub_(lr * x.grad)
        x.grad.zero_()  # gradients are not cleared automatically

        iter_rec.append(i)
        # Record a plain float (as the multi-learning-rate section does) instead
        # of a tensor that still tracks gradients, so it can be plotted directly.
        loss_rec.append(y.item())

    plt.subplot(121).plot(iter_rec, loss_rec, '-ro')
    plt.xlabel("Iteration")
    plt.ylabel("Loss value")

    x_t = torch.linspace(-3, 3, 100)
    y = func(x_t)
    plt.subplot(122).plot(x_t.numpy(), y.numpy(), label="y = 4*x^2")
    plt.grid()

    # Overlay the visited points on the curve.
    y_rec = [func(torch.tensor(x_val)).item() for x_val in x_rec]
    plt.subplot(122).plot(x_rec, y_rec, '-ro')
    plt.legend()
    plt.show()
# ------------------------------ multi learning rate ------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    iteration = 100
    num_lr = 10
    lr_min, lr_max = 0.01, 0.2  # other upper bounds to try: 0.5, 0.3, 0.2

    lr_list = np.linspace(lr_min, lr_max, num=num_lr).tolist()
    loss_rec = [[] for _ in lr_list]  # one loss curve per learning rate

    for i, lr in enumerate(lr_list):
        x = torch.tensor([2.], requires_grad=True)  # restart from the same point
        for _ in range(iteration):
            y = func(x)
            y.backward()
            x.data.sub_(lr * x.grad)  # update rule: x = x - lr * x.grad
            x.grad.zero_()

            loss_rec[i].append(y.item())

    for lr, loss_r in zip(lr_list, loss_rec):
        plt.plot(range(len(loss_r)), loss_r, label="LR: {}".format(lr))
    plt.legend()
    plt.xlabel('Iterations')
    plt.ylabel('Loss value')
    plt.show()
学习率=1,迭代次数=4
由图可知,学习率过大,从而出现loss激增
学习率=0.1,迭代次数=4
由上图可知,学习率适当的情况,loss下降至收敛
通过设置多学习率的演示可知,不同的学习率的收敛速度不同
五、momentum动量
5.1 momentum动量概念
Momentum(动量,冲量) : 结合当前梯度与上一次更新信息, 用于当前更新
使用学习率的梯度下降法,其中学习率是固定的,即更新的步长是固定的,就如同图上滑雪,每次更新,向下一个梯度方向更新步长固定,而Momentum动量会结合当前梯度方向和上次更新信息,更新的步长不固定,如图中,在上一次的基础上会滑更长的距离
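应用momentum后的更新公式:
$$v_i = m \cdot v_{i-1} + g(w_i)$$
$$w_{i+1} = w_i - LR \cdot v_i$$
其中 $v_i$ 为本次更新量,$m$ 为momentum系数,$g(w_i)$ 为 $w_i$ 的梯度,$LR$ 为学习率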
5.2 指数加权平均
基本思想: 当我们要求取当前时刻的平均值,距离当前时刻越近的那些参数值越具有参考性,所占的权重越大,这个权重会随着时间间隔的增大呈指数下降
具体例子:
- 图中横轴是天数,纵轴是温度
- $v_t$ 是第t天温度指数加权平均值
- $\theta_t$ 是第t天的温度值
- $\beta$ 为超参,值小于1,用以控制记忆周期
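计算公式(取 $v_0 = 0$):
$$v_t = \beta v_{t-1} + (1-\beta)\theta_t = \sum_{i=0}^{t-1} (1-\beta)\beta^{i}\theta_{t-i}$$
由最后一项可知,距离当前时刻越远的那些温度的权重是越小的,由于β是小于1的,所以距离当前时刻越远的那些温度的权重是呈指数下降的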
代码演示:
# -*- coding:utf-8 -*-
import torch
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
torch.manual_seed(1)
def exp_w_func(beta, time_list):
    """Return the exponentially-weighted-average weight for each time offset.

    Args:
        beta: decay factor of the exponentially weighted average.
        time_list: iterable of exponents (time offsets).

    Returns:
        List with ``(1 - beta) * beta ** t`` for every ``t`` in ``time_list``.
    """
    return [(1 - beta) * np.power(beta, exp) for exp in time_list]
beta = 0.9
num_point = 100
time_list = np.arange(num_point).tolist()  # time offsets 0 .. num_point - 1

# ------------------------------ exponential weight ------------------------------
# Set flag to 1 to plot the weight of each time offset for a single beta.
flag = 0
if flag:
    weights = exp_w_func(beta, time_list)

    plt.plot(time_list, weights, '-ro', label="Beta: {}\ny = B^t * (1-B)".format(beta))
    plt.xlabel("time")
    plt.ylabel("weight")
    plt.legend()
    plt.title("exponentially weighted average")
    plt.show()

    print(np.sum(weights))
# ------------------------------ multi weights ------------------------------
# Set flag to 1 to compare the weight curves of several beta values.
flag = 0
if flag:
    beta_list = [0.98, 0.95, 0.9, 0.8]
    w_list = [exp_w_func(b, time_list) for b in beta_list]
    for b, w in zip(beta_list, w_list):
        plt.plot(time_list, w, label="Beta: {}".format(b))
        plt.xlabel("time")
        plt.ylabel("weight")
    plt.legend()
    plt.show()
# ------------------------------ SGD momentum ------------------------------
# Set flag to 0 to skip this demo.
flag = 1
if flag:
    def func(x):
        """Evaluate y = (2x)^2 = 4*x^2 (derivative: dy/dx = 8x)."""
        return torch.pow(2*x, 2)

    iteration = 100
    m = 0.63  # other value to try: 0.9

    lr_list = [0.01, 0.03]
    # Paired with lr_list: lr=0.01 runs with momentum m, lr=0.03 runs without.
    momentum_list = [m, 0.]
    loss_rec = [[] for _ in lr_list]  # one loss curve per configuration

    for i, (lr, momentum) in enumerate(zip(lr_list, momentum_list)):
        x = torch.tensor([2.], requires_grad=True)  # restart from the same point
        optimizer = optim.SGD([x], lr=lr, momentum=momentum)

        for _ in range(iteration):
            y = func(x)
            y.backward()

            optimizer.step()
            optimizer.zero_grad()

            loss_rec[i].append(y.item())

    for lr, momentum, loss_r in zip(lr_list, momentum_list, loss_rec):
        plt.plot(range(len(loss_r)), loss_r, label="LR: {} M:{}".format(lr, momentum))
    plt.legend()
    plt.xlabel('Iterations')
    plt.ylabel('Loss value')
    plt.show()
由图可知,距离当前时刻越远,其权重成指数下降趋势,而权重越小说明该时刻的温度对当前时刻温度的加权指数平均的贡献越小
由图可知
- 设置不同的β值,权重下降的趋势不同
- β值可理解为记忆周期,越小记忆周期越短
- β通常设置为0.9,为了更加关注当前10天左右的数据
由图所示,在确定学习率的基础下,适当的momentum系数(β值),能加速收敛
六、torch.optim.SGD
optim.SGD(params,
lr=<required parameter>,
momentum=0,
dampening=0,
weight_decay=0,
nesterov=False)
功能:随机梯度下降法优化器
主要参数:
- params:管理的参数组
- lr:初始学习率
- momentum:动量系数,贝塔
- weight_decay: L2正则化系数
- nesterov:是否采用NAG,通常是不使用
NAG参考文献: 《On the importance of initialization and momentum in deep learning》
七、Pytorch的十种优化器
1 optim.SGD:随机梯度下降法
2 optim.Adagrad:自适应学习率梯度下降法
3 optim.RMSprop: Adagrad的改进
4 optim.Adadelta : Adagrad的改进
5 optim.Adam : RMSprop结合Momentum
6 optim.Adamax: Adam增加学习率上限
7 optim.SparseAdam:稀疏版的Adam
8 optim.ASGD:随机平均梯度下降
9 optim.Rprop :弹性反向传播
10 optim.LBFGS: BFGS的改进