Understanding and Implementing Adagrad (Adaptive Gradient) and RMSprop

1 Adaptive Gradient (Adagrad)

  Recall the update formula for SGD with momentum:

$$
\begin{aligned}
v_t &= \gamma v_{t-1} + lr \cdot grad \\
\theta &= \theta - v_t = \theta - \gamma v_{t-1} - lr \cdot grad
\end{aligned}
$$

  Here $lr$ is a fixed value: the closer we get to the target, the smaller the gradient becomes and the slower the parameters move. Is there a way to fix that?
  Now a new contender strolls in, carrying an extra state variable $s_t$:

$$
\begin{aligned}
s_t &= s_{t-1} + grad_t^2 \\
w_t &= w_{t-1} - grad_t \, \frac{lr}{\sqrt{s_t + \epsilon}}
\end{aligned}
$$

  Let's plug it into the SGD update and give it a try 😋
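
  Before plugging it into the full example, here is a minimal sketch of the update above for a single scalar parameter, using a made-up constant gradient of 1.0 so that the shrinking effective step size $lr/\sqrt{s_t + \epsilon}$ is easy to see (the values are purely illustrative):

import numpy as np

# Adagrad update for one scalar parameter with a pretend constant gradient.
lr, epsilon = 0.5, 1e-6
w, s = 0.0, 0.0
for t in range(1, 6):
    grad = 1.0                        # made-up gradient, held constant
    s += grad ** 2                    # s_t = s_{t-1} + grad_t ** 2
    step = lr / np.sqrt(s + epsilon)  # effective learning rate keeps shrinking
    w -= step * grad                  # w_t = w_{t-1} - grad_t * step
    print(f"t={t}  effective lr={step:.3f}  w={w:.3f}")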

  As before, we use the following objective as the running example:

$$
\begin{aligned}
f(x) &= ax + b \\
L &= (y - f(x))^2 = (y - (ax + b))^2 \\
\frac{\partial L}{\partial a} &= -2x\,(y - (ax + b)) \\
\frac{\partial L}{\partial b} &= -2\,(y - (ax + b))
\end{aligned}
$$
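
  These derivatives are exactly what the grad() helper below computes symbolically; a quick sanity check with sympy:

from sympy import symbols, diff, simplify

x, y, a, b = symbols("x y a b")
loss = (y - (a * x + b)) ** 2
da, db = diff(loss, a), diff(loss, b)
# both differences simplify to 0, matching the hand-derived forms above
print(simplify(da - (-2 * x * (y - (a * x + b)))))   # 0
print(simplify(db - (-2 * (y - (a * x + b)))))       # 0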

Test 1: SGD with an adaptive gradient (Adagrad)

import numpy as np
import matplotlib.pyplot as plt
from sympy import symbols, diff


def get_data():
    # 100 evenly spaced points on the line y = 2x + 3
    ret_x = np.linspace(-1, 1, 100)
    return ret_x, [(lambda x: 2 * x + 3)(x) for x in ret_x]


def grad():
    # symbolic gradients of the squared error w.r.t. w_a and w_b
    x, y, w_a, w_b = symbols(["x", "y", "w_a", "w_b"])
    loss = (y - (w_a * x + w_b))**2
    return diff(loss, w_a), diff(loss, w_b)


def test1(n_iter=10, lr=0.5, batch_size=20, epsilon=1e-6, shuffle=True):
    x, y = get_data()
    ga, gb = grad()
    n = len(x)
    idx = np.random.permutation(n)
    s_a, s_b, w_a, w_b = 0, 0, 0, 0    # squared-gradient accumulators and weights
    move_a, move_b = [w_a], [w_b]      # parameter history (top plot)
    move_lr_a, move_lr_b = [lr], [lr]  # effective learning-rate history (bottom plot)
    for i in range(n_iter):
        if shuffle:
            np.random.shuffle(idx)
        batch_idxes = [idx[k: k + batch_size] for k in range(0, n, batch_size)]
        for idxes in batch_idxes:
            # average gradient over the mini-batch
            sum_ga, sum_gb = 0, 0
            for j in idxes:
                sum_ga += ga.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
                sum_gb += gb.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
            sum_ga /= len(idxes)
            sum_gb /= len(idxes)
            # Adagrad: keep accumulating squared gradients, so the effective
            # learning rate lr / sqrt(s + eps) can only shrink over time
            s_a += sum_ga**2
            s_b += sum_gb**2
            move_lr_a.append(lr / np.sqrt(float(s_a) + epsilon))
            move_lr_b.append(lr / np.sqrt(float(s_b) + epsilon))
            w_a -= sum_ga * move_lr_a[-1]
            w_b -= sum_gb * move_lr_b[-1]
            move_a.append(w_a)
            move_b.append(w_b)
    # top: parameter trajectories; bottom: effective learning rates
    plt.subplot(211)
    plt.plot(move_a)
    plt.plot(move_b)
    plt.legend(["a", "b"])
    plt.subplot(212)
    plt.plot(move_lr_a)
    plt.plot(move_lr_b)
    plt.legend(["a", "b"])
    plt.show()


if __name__ == '__main__':
    test1()
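
  The sympy-based loop above is easy to read but slow. Under the same settings (lr=0.5, batch_size=20, epsilon=1e-6), a rough vectorized NumPy equivalent of the Adagrad loop, with the mini-batch gradient computed directly from the closed-form derivatives, might look like this:

import numpy as np


def adagrad_numpy(n_iter=10, lr=0.5, batch_size=20, epsilon=1e-6):
    x = np.linspace(-1, 1, 100)
    y = 2 * x + 3
    w_a = w_b = 0.0
    s_a = s_b = 0.0
    idx = np.random.permutation(len(x))
    for _ in range(n_iter):
        np.random.shuffle(idx)
        for k in range(0, len(x), batch_size):
            bi = idx[k: k + batch_size]
            err = y[bi] - (w_a * x[bi] + w_b)
            g_a = np.mean(-2 * x[bi] * err)   # dL/da averaged over the batch
            g_b = np.mean(-2 * err)           # dL/db averaged over the batch
            s_a += g_a ** 2                   # Adagrad accumulators
            s_b += g_b ** 2
            w_a -= g_a * lr / np.sqrt(s_a + epsilon)
            w_b -= g_b * lr / np.sqrt(s_b + epsilon)
    return w_a, w_b


# moves toward (2, 3); the ever-shrinking step size means it may not fully
# get there within 10 epochs, which is exactly the issue RMSprop targets
print(adagrad_numpy())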

2 RMSprop

  The problem with the previous approach is that the learning rate keeps shrinking until the updates effectively stall, which is frustrating, right? So this time a gamma parameter is added to decay the accumulated history. Watch for the difference in the code 😊
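
  Written out, the only change is that the squared-gradient accumulator is decayed by $\gamma$ instead of being summed forever. The standard RMSprop update is

$$
\begin{aligned}
s_t &= \gamma s_{t-1} + (1 - \gamma)\, grad_t^2 \\
w_t &= w_{t-1} - grad_t \, \frac{lr}{\sqrt{s_t + \epsilon}}
\end{aligned}
$$

  The code below drops the $(1 - \gamma)$ factor, which just rescales $s_t$ by a constant $1/(1 - \gamma)$.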

Test 2: adaptive-learning-rate SGD with gamma (RMSprop)

import numpy as np
import matplotlib.pyplot as plt
from sympy import symbols, diff


def get_data():
    # 100 evenly spaced points on the line y = 2x + 3
    ret_x = np.linspace(-1, 1, 100)
    return ret_x, [(lambda x: 2 * x + 3)(x) for x in ret_x]


def grad():
    # symbolic gradients of the squared error w.r.t. w_a and w_b
    x, y, w_a, w_b = symbols(["x", "y", "w_a", "w_b"])
    loss = (y - (w_a * x + w_b))**2
    return diff(loss, w_a), diff(loss, w_b)


def test2(n_iter=10, lr=0.5, batch_size=20, epsilon=1e-6, shuffle=True, gamma=0.9):
    x, y = get_data()
    ga, gb = grad()
    n = len(x)
    idx = np.random.permutation(n)
    s_a, s_b, w_a, w_b = 0, 0, 0, 0    # squared-gradient accumulators and weights
    move_a, move_b = [w_a], [w_b]      # parameter history (top plot)
    move_lr_a, move_lr_b = [lr], [lr]  # effective learning-rate history (bottom plot)
    for i in range(n_iter):
        if shuffle:
            np.random.shuffle(idx)
        batch_idxes = [idx[k: k + batch_size] for k in range(0, n, batch_size)]
        for idxes in batch_idxes:
            # average gradient over the mini-batch
            sum_ga, sum_gb = 0, 0
            for j in idxes:
                sum_ga += ga.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
                sum_gb += gb.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
            sum_ga /= len(idxes)
            sum_gb /= len(idxes)
            # Note the difference from Adagrad: old squared gradients are decayed
            # by gamma, so the effective learning rate can level off instead of
            # shrinking forever (standard RMSprop also scales the new term by 1 - gamma)
            s_a = gamma * s_a + sum_ga**2
            s_b = gamma * s_b + sum_gb**2
            move_lr_a.append(lr / np.sqrt(float(s_a) + epsilon))
            move_lr_b.append(lr / np.sqrt(float(s_b) + epsilon))
            w_a -= sum_ga * move_lr_a[-1]
            w_b -= sum_gb * move_lr_b[-1]
            move_a.append(w_a)
            move_b.append(w_b)
    # top: parameter trajectories; bottom: effective learning rates
    plt.subplot(211)
    plt.plot(move_a)
    plt.plot(move_b)
    plt.legend(["a", "b"])
    plt.subplot(212)
    plt.plot(move_lr_a)
    plt.plot(move_lr_b)
    plt.legend(["a", "b"])
    plt.show()


if __name__ == '__main__':
    test2()
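
  To see the difference between the two accumulators directly, here is a small sketch comparing the effective learning rate $lr/\sqrt{s+\epsilon}$ under a made-up constant gradient: the Adagrad version keeps shrinking toward zero, while the gamma-decayed version levels off.

import numpy as np

lr, eps, gamma, g = 0.5, 1e-6, 0.9, 1.0   # made-up constant gradient
s_ada = s_rms = 0.0
for t in range(1, 51):
    s_ada += g ** 2                  # Adagrad: sum of all squared gradients
    s_rms = gamma * s_rms + g ** 2   # decayed sum, as in test2 above
    if t in (1, 5, 10, 20, 50):
        print(t,
              round(lr / np.sqrt(s_ada + eps), 3),   # keeps shrinking
              round(lr / np.sqrt(s_rms + eps), 3))   # levels off near 0.158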


