1 Adaptive Gradient (Adagrad)
Recall the update rule for momentum SGD:
$$
\begin{aligned}
v_t &= \gamma v_{t - 1} + lr * grad \\
\theta &= \theta - v_t = \theta - \gamma v_{t - 1} - lr * grad
\end{aligned}
$$

Here $lr$ is a fixed value: the closer the parameters get to the target, the smaller the gradient becomes, and the slower they update. Is there a way to fix this?
At this point a handsome fellow carrying a parameter $s_t$ strolls in:
$$
\begin{aligned}
s_t &= s_{t - 1} + grad_t^2 \\
w_t &= w_{t - 1} - grad_t \frac{lr}{\sqrt{s_t + \epsilon}}
\end{aligned}
$$

Let's plug it into the momentum version of SGD and see 😋
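Before wiring it into the full training loop below, here is a minimal sketch of what a single Adagrad step does to one scalar parameter (the helper name `adagrad_step` is purely illustrative, not part of the code that follows): even with a constant gradient, the step keeps shrinking because $s_t$ only ever grows.

```python
import numpy as np

def adagrad_step(w, grad, s, lr=0.5, epsilon=1e-6):
    """One Adagrad update on a scalar parameter (illustrative only)."""
    s = s + grad ** 2                         # accumulate squared gradients
    w = w - grad * lr / np.sqrt(s + epsilon)  # effective step shrinks as s grows
    return w, s

# Apply the same gradient over and over: the step size keeps decaying.
w, s = 0.0, 0.0
for t in range(5):
    w_new, s = adagrad_step(w, grad=1.0, s=s)
    print(t, round(w - w_new, 4))  # size of this step: 0.5, 0.3536, 0.2887, ...
    w = w_new
```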
As before, we use the following objective as the example:
$$
\begin{aligned}
f(x) &= ax + b \\
L &= (y - f(x))^{2} = (y - (ax + b))^{2} \\
\frac{\partial L}{\partial a} &= -2x(y - (ax + b)) \\
\frac{\partial L}{\partial b} &= -2(y - (ax + b))
\end{aligned}
$$
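The `diff` calls in the test code below simply automate this derivation with sympy; a hand-written version of the same gradients (the helper names `grad_a` and `grad_b` are purely illustrative) would look like this:

```python
def grad_a(x, y, a, b):
    # dL/da = -2x(y - (ax + b))
    return -2 * x * (y - (a * x + b))

def grad_b(x, y, a, b):
    # dL/db = -2(y - (ax + b))
    return -2 * (y - (a * x + b))

# Sanity check at one point of the true line y = 2x + 3, starting from a = b = 0.
print(grad_a(0.5, 4.0, 0.0, 0.0), grad_b(0.5, 4.0, 0.0, 0.0))  # -4.0 -8.0
```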
Test 1: SGD with adaptive gradients (Adagrad)
```python
import numpy as np
import matplotlib.pyplot as plt
from sympy import symbols, diff


def get_data():
    # 100 points on the line y = 2x + 3, x in [-1, 1]
    ret_x = np.linspace(-1, 1, 100)
    return ret_x, [(lambda x: 2 * x + 3)(x) for x in ret_x]


def grad():
    # Symbolic gradients of the squared error w.r.t. w_a and w_b
    x, y, w_a, w_b = symbols(["x", "y", "w_a", "w_b"])
    loss = (y - (w_a * x + w_b)) ** 2
    return diff(loss, w_a), diff(loss, w_b)


def test1(n_iter=10, lr=0.5, batch_size=20, epsilon=1e-6, shuffle=True):
    x, y = get_data()
    ga, gb = grad()
    n = len(x)
    idx = np.random.permutation(n)
    s_a, s_b, w_a, w_b = 0, 0, 0, 0
    move_a, move_b = [w_a], [w_b]
    move_lr_a, move_lr_b = [lr], [lr]
    for i in range(n_iter):
        if shuffle:
            np.random.shuffle(idx)
        batch_idxes = [idx[k: k + batch_size] for k in range(0, n, batch_size)]
        for idxes in batch_idxes:
            # Average gradient over the mini-batch
            sum_ga, sum_gb = 0, 0
            for j in idxes:
                sum_ga += ga.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
                sum_gb += gb.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
            sum_ga /= batch_size
            sum_gb /= batch_size
            # Adagrad: accumulate squared gradients and scale lr by 1 / sqrt(s + eps)
            s_a += sum_ga ** 2
            s_b += sum_gb ** 2
            move_lr_a.append(lr / np.sqrt(float(s_a) + epsilon))
            move_lr_b.append(lr / np.sqrt(float(s_b) + epsilon))
            w_a -= sum_ga * move_lr_a[-1]
            w_b -= sum_gb * move_lr_b[-1]
            move_a.append(w_a)
            move_b.append(w_b)
    # Top: parameter trajectories; bottom: effective learning rates
    plt.subplot(211)
    plt.plot(move_a)
    plt.plot(move_b)
    plt.legend(["a", "b"])
    plt.subplot(212)
    plt.plot(move_lr_a)
    plt.plot(move_lr_b)
    plt.legend(["a", "b"])
    plt.show()


if __name__ == '__main__':
    test1()
```
2 RMSprop
The problem with the guy above is that the learning rate keeps shrinking until the parameters barely move anymore: the accumulated $s_t$ only ever grows, so $lr / \sqrt{s_t + \epsilon}$ decays toward zero. Annoying, right? So this next guy adds a gamma parameter that turns the running sum into a moving average; watch for the difference in the code 😊
😂
Test 2: adaptive-learning-rate SGD with gamma (RMSprop)
```python
import numpy as np
import matplotlib.pyplot as plt
from sympy import symbols, diff


def get_data():
    # 100 points on the line y = 2x + 3, x in [-1, 1]
    ret_x = np.linspace(-1, 1, 100)
    return ret_x, [(lambda x: 2 * x + 3)(x) for x in ret_x]


def grad():
    # Symbolic gradients of the squared error w.r.t. w_a and w_b
    x, y, w_a, w_b = symbols(["x", "y", "w_a", "w_b"])
    loss = (y - (w_a * x + w_b)) ** 2
    return diff(loss, w_a), diff(loss, w_b)


def test2(n_iter=10, lr=0.5, batch_size=20, epsilon=1e-6, shuffle=True, gamma=0.9):
    x, y = get_data()
    ga, gb = grad()
    n = len(x)
    idx = np.random.permutation(n)
    s_a, s_b, w_a, w_b = 0, 0, 0, 0
    move_a, move_b = [w_a], [w_b]
    move_lr_a, move_lr_b = [lr], [lr]
    for i in range(n_iter):
        if shuffle:
            np.random.shuffle(idx)
        batch_idxes = [idx[k: k + batch_size] for k in range(0, n, batch_size)]
        for idxes in batch_idxes:
            # Average gradient over the mini-batch
            sum_ga, sum_gb = 0, 0
            for j in idxes:
                sum_ga += ga.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
                sum_gb += gb.subs({"x": x[j], "y": y[j], "w_a": w_a, "w_b": w_b})
            sum_ga /= batch_size
            sum_gb /= batch_size
            # Note the difference: RMSprop keeps an exponentially weighted moving
            # average of the squared gradients instead of an ever-growing sum.
            s_a = gamma * s_a + (1 - gamma) * sum_ga ** 2
            s_b = gamma * s_b + (1 - gamma) * sum_gb ** 2
            move_lr_a.append(lr / np.sqrt(float(s_a) + epsilon))
            move_lr_b.append(lr / np.sqrt(float(s_b) + epsilon))
            w_a -= sum_ga * move_lr_a[-1]
            w_b -= sum_gb * move_lr_b[-1]
            move_a.append(w_a)
            move_b.append(w_b)
    # Top: parameter trajectories; bottom: effective learning rates
    plt.subplot(211)
    plt.plot(move_a)
    plt.plot(move_b)
    plt.legend(["a", "b"])
    plt.subplot(212)
    plt.plot(move_lr_a)
    plt.plot(move_lr_b)
    plt.legend(["a", "b"])
    plt.show()


if __name__ == '__main__':
    test2()
```
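To see why the gamma matters, here is a toy comparison, assuming a constant gradient of 1.0 (purely illustrative): Adagrad's accumulated $s_t$ grows without bound, so its effective learning rate decays toward zero, while RMSprop's moving average levels off and its effective learning rate stabilizes.

```python
import numpy as np

lr, gamma, epsilon = 0.5, 0.9, 1e-6
s_ada, s_rms = 0.0, 0.0
for t in range(1, 51):
    s_ada = s_ada + 1.0 ** 2                        # Adagrad: sum keeps growing
    s_rms = gamma * s_rms + (1 - gamma) * 1.0 ** 2  # RMSprop: moving average levels off
    if t in (1, 10, 50):
        print(t, lr / np.sqrt(s_ada + epsilon), lr / np.sqrt(s_rms + epsilon))
# By t=50 Adagrad's effective lr has dropped to ~0.07, while RMSprop's sits near 0.5.
```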