NNDL 作业13

1. 函数3D可视化

分别画出x[0]^{2}+x[1]^{2}+x[1]^{3}+x[0]*x[1] 和 x^{2} /20+y^{2}的3D图

import torch
import numpy as np
import matplotlib.pyplot as plt
 
 
class Op(object):
    def __init__(self):
        pass
 
    def __call__(self, inputs):
        return self.forward(inputs)
 
    # 输入:张量inputs
    # 输出:张量outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError
 
    # 输入:最终输出对outputs的梯度outputs_grads
    # 输出:最终输出对inputs的梯度inputs_grads
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError
 
 
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}
 
    def forward(self, x):
        self.params['x'] = x
        return x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]
 
    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] + x[1]
        gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])
 
 
x1 = np.arange(-3, 3, 0.1)
x2 = np.arange(-3, 3, 0.1)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))
 
model = OptimizedFunction3D()
 
# 绘制 f_3d函数 的 三维图像
fig = plt.figure()
ax = plt.axes(projection='3d')
 
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # 改为 model(init_x).numpy() David 2022.12.4
ax.plot_surface(X, Y, Z, cmap='rainbow')
 
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')
plt.show()

import torch
import numpy as np
import matplotlib.pyplot as plt
 
 
class Op(object):
    def __init__(self):
        pass
 
    def __call__(self, inputs):
        return self.forward(inputs)
 
    # 输入:张量inputs
    # 输出:张量outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError
 
    # 输入:最终输出对outputs的梯度outputs_grads
    # 输出:最终输出对inputs的梯度inputs_grads
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError
 
 
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}
 
    def forward(self, x):
        self.params['x'] = x
        return x[0] ** 2 / 20 + x[1] ** 2
 
    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] + x[1]
        gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])
 
 
x1 = np.arange(-40, 40, 0.1)
x2 = np.arange(-40, 40, 0.1)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))
 
model = OptimizedFunction3D()
 
# 绘制 f_3d函数 的 三维图像
fig = plt.figure()
ax = plt.axes(projection='3d')
 
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # 改为 model(init_x).numpy() David 2022.12.4
ax.plot_surface(X, Y, Z, cmap='rainbow')
 
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')
plt.show()

2.加入优化算法,画出轨迹

分别画出x[0]^{2}+x[1]^{2}+x[1]^{3}+x[0]*x[1] 和 x^{2} /20+y^{2}的3D轨迹图

结合3D动画,用自己的语言,从轨迹、速度等多个角度讲解各个算法优缺点

import torch
import numpy as np
import copy
from matplotlib import pyplot as plt
from matplotlib import animation
from itertools import zip_longest
 
 
class Op(object):
    def __init__(self):
        pass
 
    def __call__(self, inputs):
        return self.forward(inputs)
 
    # 输入:张量inputs
    # 输出:张量outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError
 
    # 输入:最终输出对outputs的梯度outputs_grads
    # 输出:最终输出对inputs的梯度inputs_grads
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError
 
 
class Optimizer(object):  # 优化器基类
    def __init__(self, init_lr, model):
        """
        优化器类初始化
        """
        # 初始化学习率,用于参数更新的计算
        self.init_lr = init_lr
        # 指定优化器需要优化的模型
        self.model = model
 
    def step(self):
        """
        定义每次迭代如何更新参数
        """
        pass
 
 
class SimpleBatchGD(Optimizer):
    def __init__(self, init_lr, model):
        super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)
 
    def step(self):
        # 参数更新
        if isinstance(self.model.params, dict):
            for key in self.model.params.keys():
                self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]
 
 
class Adagrad(Optimizer):
    def __init__(self, init_lr, model, epsilon):
        """
        Adagrad 优化器初始化
        输入:
            - init_lr: 初始学习率 - model:模型,model.params存储模型参数值  - epsilon:保持数值稳定性而设置的非常小的常数
        """
        super(Adagrad, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.epsilon = epsilon
 
    def adagrad(self, x, gradient_x, G, init_lr):
        """
        adagrad算法更新参数,G为参数梯度平方的累计值。
        """
        G += gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G
 
    def step(self):
        """
        参数更新
        """
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)
 
 
class RMSprop(Optimizer):
    def __init__(self, init_lr, model, beta, epsilon):
        """
        RMSprop优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - beta:衰减率
            - epsilon:保持数值稳定性而设置的常数
        """
        super(RMSprop, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.beta = beta
        self.epsilon = epsilon
 
    def rmsprop(self, x, gradient_x, G, init_lr):
        """
        rmsprop算法更新参数,G为迭代梯度平方的加权移动平均
        """
        G = self.beta * G + (1 - self.beta) * gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)
 
 
class Momentum(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Momentum优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - rho:动量因子
        """
        super(Momentum, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho
 
    def momentum(self, x, gradient_x, delta_x, init_lr):
        """
        momentum算法更新参数,delta_x为梯度的加权移动平均
        """
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += delta_x
        return x, delta_x
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)
 
 
class Nesterov(Optimizer):
    def __init__(self, init_lr, model, rho):
        super(Nesterov, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho
 
    def nesterov(self, x, gradient_x, delta_x, init_lr):
        """
        Nesterov算法更新参数,delta_x为梯度的加权移动平均
        """
        delta_x_prev = delta_x
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += -self.rho * delta_x_prev + (1 + self.rho) * delta_x
        return x, delta_x
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.nesterov(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)
 
 
class Adam(Optimizer):
    def __init__(self, init_lr, model, beta1, beta2, epsilon):
        """
        Adam优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - beta1, beta2:移动平均的衰减率
            - epsilon:保持数值稳定性而设置的常数
        """
        super(Adam, self).__init__(init_lr=init_lr, model=model)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.M, self.G = {}, {}
        for key in self.model.params.keys():
            self.M[key] = 0
            self.G[key] = 0
        self.t = 1
 
    def adam(self, x, gradient_x, G, M, t, init_lr):
        """
        adam算法更新参数
        输入:
            - x:参数
            - G:梯度平方的加权移动平均
            - M:梯度的加权移动平均
            - t:迭代次数
            - init_lr:初始学习率
        """
        M = self.beta1 * M + (1 - self.beta1) * gradient_x
        G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2
        M_hat = M / (1 - self.beta1 ** t)
        G_hat = G / (1 - self.beta2 ** t)
        t += 1
        x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat
        return x, G, M, t
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],
                                                                                 self.model.grads[key],
                                                                                 self.G[key],
                                                                                 self.M[key],
                                                                                 self.t,
                                                                                 self.init_lr)
 
 
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}
 
    def forward(self, x):
        self.params['x'] = x
        return x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]
 
    def backward(self):
        x = self.params['x']
        gradient1 = 2 * x[0] + x[1]
        gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])
 
 
class Visualization3D(animation.FuncAnimation):
    """    绘制动态图像,可视化参数更新轨迹    """
 
    def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=600, blit=True, **kwargs):
        """
        初始化3d可视化类
        输入:
            xy_values:三维中x,y维度的值
            z_values:三维中z维度的值
            labels:每个参数更新轨迹的标签
            colors:每个轨迹的颜色
            interval:帧之间的延迟(以毫秒为单位)
            blit:是否优化绘图
        """
        self.fig = fig
        self.ax = ax
        self.xy_values = xy_values
        self.z_values = z_values
 
        frames = max(xy_value.shape[0] for xy_value in xy_values)
        self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]
                      for _, label, color in zip_longest(xy_values, labels, colors)]
        super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,
                                              interval=interval, blit=blit, **kwargs)
 
    def init_animation(self):
        # 数值初始化
        for line in self.lines:
            line.set_data([], [])
            line.set_3d_properties(np.asarray([]))  # 源程序中有这一行,加上会报错。 Edit by David 2022.12.4
        return self.lines
 
    def animate(self, i):
        # 将x,y,z三个数据传入,绘制三维图像
        for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):
            line.set_data(xy_value[:i, 0], xy_value[:i, 1])
            line.set_3d_properties(z_value[:i])
        return self.lines
 
 
def train_f(model, optimizer, x_init, epoch):
    x = x_init
    all_x = []
    losses = []
    for i in range(epoch):
        all_x.append(copy.deepcopy(x.numpy()))  # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.
        loss = model(x)
        losses.append(loss)
        model.backward()
        optimizer.step()
        x = model.params['x']
    return torch.Tensor(np.array(all_x)), losses
 
 
# 构建5个模型,分别配备不同的优化器
model1 = OptimizedFunction3D()
opt_gd = SimpleBatchGD(init_lr=0.01, model=model1)
 
model2 = OptimizedFunction3D()
opt_adagrad = Adagrad(init_lr=0.5, model=model2, epsilon=1e-7)
 
model3 = OptimizedFunction3D()
opt_rmsprop = RMSprop(init_lr=0.1, model=model3, beta=0.9, epsilon=1e-7)
 
model4 = OptimizedFunction3D()
opt_momentum = Momentum(init_lr=0.01, model=model4, rho=0.9)
 
model5 = OptimizedFunction3D()
opt_adam = Adam(init_lr=0.1, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)
 
model6 = OptimizedFunction3D()
opt_nesterov = Nesterov(init_lr=0.01, model=model6, rho=0.9)
 
models = [model1, model2, model3, model4, model5, model6]
opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam, opt_nesterov]
 
 
x_all_opts = []
z_all_opts = []
 
# 使用不同优化器训练
 
for model, opt in zip(models, opts):
    x_init = torch.FloatTensor([2, 3])
    x_one_opt, z_one_opt = train_f(model, opt, x_init, 150)  # epoch
    # 保存参数值
    x_all_opts.append(x_one_opt.numpy())
    z_all_opts.append(np.squeeze(z_one_opt))
 
# 使用numpy.meshgrid生成x1,x2矩阵,矩阵的每一行为[-3, 3],以0.1为间隔的数值
x1 = np.arange(-3, 3, 0.1)
x2 = np.arange(-3, 3, 0.1)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))
 
model = OptimizedFunction3D()
 
 
# 绘制 f_3d函数 的 三维图像
fig = plt.figure()
ax = plt.axes(projection='3d')
 
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # 改为 model(init_x).numpy() David 2022.12.4
ax.plot_surface(X, Y, Z, cmap='rainbow')
 
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')
labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam', 'Nesterov']
colors = ['#f6373c', '#f6f237', '#45f637', '#37f0f6', '#000000', '#FED295']
 
animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)
 
ax.legend(loc='upper left')
animator.save('animation.gif', writer='ffmpeg')
plt.show()
3.复现CS231经典动画
import torch
import numpy as np
import copy
from matplotlib import pyplot as plt
from matplotlib import animation
from itertools import zip_longest
from matplotlib import cm
 
 
class Op(object):
    def __init__(self):
        pass
 
    def __call__(self, inputs):
        return self.forward(inputs)
 
    # 输入:张量inputs
    # 输出:张量outputs
    def forward(self, inputs):
        # return outputs
        raise NotImplementedError
 
    # 输入:最终输出对outputs的梯度outputs_grads
    # 输出:最终输出对inputs的梯度inputs_grads
    def backward(self, outputs_grads):
        # return inputs_grads
        raise NotImplementedError
 
 
class Optimizer(object):  # 优化器基类
    def __init__(self, init_lr, model):
        """
        优化器类初始化
        """
        # 初始化学习率,用于参数更新的计算
        self.init_lr = init_lr
        # 指定优化器需要优化的模型
        self.model = model
 
    def step(self):
        """
        定义每次迭代如何更新参数
        """
        pass
 
 
class SimpleBatchGD(Optimizer):
    def __init__(self, init_lr, model):
        super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)
 
    def step(self):
        # 参数更新
        if isinstance(self.model.params, dict):
            for key in self.model.params.keys():
                self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]
 
 
class Adagrad(Optimizer):
    def __init__(self, init_lr, model, epsilon):
        """
        Adagrad 优化器初始化
        输入:
            - init_lr: 初始学习率 - model:模型,model.params存储模型参数值  - epsilon:保持数值稳定性而设置的非常小的常数
        """
        super(Adagrad, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.epsilon = epsilon
 
    def adagrad(self, x, gradient_x, G, init_lr):
        """
        adagrad算法更新参数,G为参数梯度平方的累计值。
        """
        G += gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G
 
    def step(self):
        """
        参数更新
        """
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)
 
 
class RMSprop(Optimizer):
    def __init__(self, init_lr, model, beta, epsilon):
        """
        RMSprop优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - beta:衰减率
            - epsilon:保持数值稳定性而设置的常数
        """
        super(RMSprop, self).__init__(init_lr=init_lr, model=model)
        self.G = {}
        for key in self.model.params.keys():
            self.G[key] = 0
        self.beta = beta
        self.epsilon = epsilon
 
    def rmsprop(self, x, gradient_x, G, init_lr):
        """
        rmsprop算法更新参数,G为迭代梯度平方的加权移动平均
        """
        G = self.beta * G + (1 - self.beta) * gradient_x ** 2
        x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x
        return x, G
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],
                                                               self.model.grads[key],
                                                               self.G[key],
                                                               self.init_lr)
 
 
class Momentum(Optimizer):
    def __init__(self, init_lr, model, rho):
        """
        Momentum优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - rho:动量因子
        """
        super(Momentum, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho
 
    def momentum(self, x, gradient_x, delta_x, init_lr):
        """
        momentum算法更新参数,delta_x为梯度的加权移动平均
        """
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += delta_x
        return x, delta_x
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
 
                                                                      self.init_lr)
class Nesterov(Optimizer):
    def __init__(self, init_lr, model, rho):
        super(Nesterov, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {}
        for key in self.model.params.keys():
            self.delta_x[key] = 0
        self.rho = rho
 
    def nesterov(self, x, gradient_x, delta_x, init_lr):
        """
        Nesterov算法更新参数,delta_x为梯度的加权移动平均
        """
        delta_x_prev = delta_x
        delta_x = self.rho * delta_x - init_lr * gradient_x
        x += -self.rho * delta_x_prev + (1 + self.rho) * delta_x
        return x, delta_x
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.delta_x[key] = self.nesterov(self.model.params[key],
                                                                      self.model.grads[key],
                                                                      self.delta_x[key],
                                                                      self.init_lr)
 
 
class Adam(Optimizer):
    def __init__(self, init_lr, model, beta1, beta2, epsilon):
        """
        Adam优化器初始化
        输入:
            - init_lr:初始学习率
            - model:模型,model.params存储模型参数值
            - beta1, beta2:移动平均的衰减率
            - epsilon:保持数值稳定性而设置的常数
        """
        super(Adam, self).__init__(init_lr=init_lr, model=model)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.M, self.G = {}, {}
        for key in self.model.params.keys():
            self.M[key] = 0
            self.G[key] = 0
        self.t = 1
 
    def adam(self, x, gradient_x, G, M, t, init_lr):
        """
        adam算法更新参数
        输入:
            - x:参数
            - G:梯度平方的加权移动平均
            - M:梯度的加权移动平均
            - t:迭代次数
            - init_lr:初始学习率
        """
        M = self.beta1 * M + (1 - self.beta1) * gradient_x
        G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2
        M_hat = M / (1 - self.beta1 ** t)
        G_hat = G / (1 - self.beta2 ** t)
        t += 1
        x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat
        return x, G, M, t
 
    def step(self):
        """参数更新"""
        for key in self.model.params.keys():
            self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],
                                                                                 self.model.grads[key],
                                                                                 self.G[key],
                                                                                 self.M[key],
                                                                                 self.t,
                                                                                 self.init_lr)
 
 
class OptimizedFunction3D(Op):
    def __init__(self):
        super(OptimizedFunction3D, self).__init__()
        self.params = {'x': 0}
        self.grads = {'x': 0}
 
    def forward(self, x):
        self.params['x'] = x
        return - x[0] * x[0] / 2 + x[1] * x[1] / 1  # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]
 
    def backward(self):
        x = self.params['x']
        gradient1 = - 2 * x[0] / 2
        gradient2 = 2 * x[1] / 1
        grad1 = torch.Tensor([gradient1])
        grad2 = torch.Tensor([gradient2])
        self.grads['x'] = torch.cat([grad1, grad2])
 
 
class Visualization3D(animation.FuncAnimation):
    """    绘制动态图像,可视化参数更新轨迹    """
 
    def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):
        """
        初始化3d可视化类
        输入:
            xy_values:三维中x,y维度的值
            z_values:三维中z维度的值
            labels:每个参数更新轨迹的标签
            colors:每个轨迹的颜色
            interval:帧之间的延迟(以毫秒为单位)
            blit:是否优化绘图
        """
        self.fig = fig
        self.ax = ax
        self.xy_values = xy_values
        self.z_values = z_values
 
        frames = max(xy_value.shape[0] for xy_value in xy_values)
        # , marker = 'o'
        self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]
                      for _, label, color in zip_longest(xy_values, labels, colors)]
        print(self.lines)
        super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,
                                              interval=interval, blit=blit, **kwargs)
 
    def init_animation(self):
        # 数值初始化
        for line in self.lines:
            line.set_data([], [])
            # line.set_3d_properties(np.asarray([]))  # 源程序中有这一行,加上会报错。 Edit by David 2022.12.4
        return self.lines
 
    def animate(self, i):
        # 将x,y,z三个数据传入,绘制三维图像
        for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):
            line.set_data(xy_value[:i, 0], xy_value[:i, 1])
            line.set_3d_properties(z_value[:i])
        return self.lines
 
 
def train_f(model, optimizer, x_init, epoch):
    x = x_init
    all_x = []
    losses = []
    for i in range(epoch):
        all_x.append(copy.deepcopy(x.numpy()))  # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.
        loss = model(x)
        losses.append(loss)
        model.backward()
        optimizer.step()
        x = model.params['x']
    return torch.Tensor(np.array(all_x)), losses
 
 
# 构建5个模型,分别配备不同的优化器
model1 = OptimizedFunction3D()
opt_gd = SimpleBatchGD(init_lr=0.05, model=model1)
 
model2 = OptimizedFunction3D()
opt_adagrad = Adagrad(init_lr=0.05, model=model2, epsilon=1e-7)
 
model3 = OptimizedFunction3D()
opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)
 
model4 = OptimizedFunction3D()
opt_momentum = Momentum(init_lr=0.05, model=model4, rho=0.9)
 
model5 = OptimizedFunction3D()
opt_adam = Adam(init_lr=0.05, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)
 
model6 = OptimizedFunction3D()
opt_nesterov = Nesterov(init_lr=0.01, model=model6, rho=0.9)
 
models = [model5, model2, model3, model4, model1]
opts = [opt_adam, opt_adagrad, opt_rmsprop, opt_momentum, opt_gd]
 
x_all_opts = []
z_all_opts = []
 
# 使用不同优化器训练
 
for model, opt in zip(models, opts):
    x_init = torch.FloatTensor([0.00001, 0.5])
    x_one_opt, z_one_opt = train_f(model, opt, x_init, 500)  # epoch
    # 保存参数值
    x_all_opts.append(x_one_opt.numpy())
    z_all_opts.append(np.squeeze(z_one_opt))
 
# 使用numpy.meshgrid生成x1,x2矩阵,矩阵的每一行为[-3, 3],以0.1为间隔的数值
x1 = np.arange(-1, 2, 0.01)
x2 = np.arange(-1, 1, 0.05)
x1, x2 = np.meshgrid(x1, x2)
init_x = torch.Tensor(np.array([x1, x2]))
 
model = OptimizedFunction3D()
 
# 绘制 f_3d函数 的 三维图像
fig = plt.figure()
ax = plt.axes(projection='3d')
X = init_x[0].numpy()
Y = init_x[1].numpy()
Z = model(init_x).numpy()  # 改为 model(init_x).numpy() David 2022.12.4
surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)
# fig.colorbar(surf, shrink=0.5, aspect=1)
ax.set_zlim(-3, 2)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('f(x1,x2)')
 
labels = ['Adam', 'AdaGrad', 'RMSprop', 'Momentum', 'SGD']
colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']
 
animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)
ax.legend(loc='upper right')
 
plt.show()
# animator.save('animation.gif') # 效果不好,估计被挡住了…… 有待进一步提高 Edit by David 2022.12.4

结合3D动画,用自己的语言,从轨迹、速度等多个角度讲解各个算法优缺点

SGD
轨迹:SGD沿着负梯度方向更新参数,因此在二维空间中表现为z字抖动的直线状的下降轨迹。
速度:由于每次更新只考虑当前样本的梯度信息,SGD可能会出现参数更新的抖动现象,导致收敛速度较慢。根据3D图发现,SGD很容易在鞍点的位置更新缓慢,不利于逃离鞍点。
AdaGrad
轨迹:Adagrad根据参数的历史梯度信息来自适应地调整学习率。所以在二维空间中表现为曲线轨迹且越来越密的下降轨迹。
速度:在优化过程中,学习率可能会逐渐减小,导致参数更新幅度逐渐减小。适应性学习率的设计可以加快收敛速度,但在长时间训练中,可能会导致学习率过小,反而降低收敛速度。
RMSprop
轨迹:RMSprop通过对梯度的平方进行指数加权移动平均来调整学习率。所以在二位空间中表现为曲线轨迹且线上的点分布较为平均的下降轨迹。
速度:指数加权移动平均的设计使得它能够在优化过程中动态地调整学习率,并可能产生适应性学习率的调整。
Momentum
​​​​​​​轨迹:动量算法引入了动量项,它考虑了参数更新时的历史梯度信息。在二维空间中表现为,靠近局部最优点,点的轨迹分布较为集中,原理分布较为松散
​​​​​​​速度:通过对轨迹的描述,我们明确发现在局部最优点附近的,点的更新较小,速度慢,原理局部最优点的地方速度较快
Adam
轨迹:这个轨迹不太好描述,因为Adam 约等于 动量法 + RMSprop,所以同时具有两个算法的有点,路径较为平稳,并且能很好的躲避局部最优点,不像动量法靠惯性原则一点点躲避局部最优点,而在开始就是逃离。
速度:cs231的环境下,Adam具有今次RMSprop的速度,一位内Adam是对RMSprop+动量法的优化,但是,在逃离局部最优点后,动量法的惯性原则,会让Adam成曲线运动,而非RMSprop的类直线运动,降低速度。
Nesterov
​​​​​​​轨迹:类似动量法,Nesterov是动量法的优化,在cs231的环境下,轨迹与动量法大致相同
速度:动量法是Nesterov的基础,但是无论是逃离局部最优点,还是逃离后的收敛速度,都不如动量法。

各种算法的不同参考了同学的博客,对我还是帮助很大
 

  • 9
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值