import matplotlib.pyplot as plt
import numpy as np
import latexify
from collections import OrderedDict
from common.optimizer import *
一维函数的优化
定义函数
def J(x):
    """Return the value of the 1-D objective at ``x``.

    J(x) = x^6/6 - 5.5*x^5/5 + 6.5*x^4/4 + 5.5*x^3/3 - 7.5*x^2/2

    Only arithmetic operators are used, so ``x`` may be a Python number
    or a NumPy array (evaluated element-wise).
    """
    return x**6/6 - 5.5*x**5/5 + 6.5*x**4/4 + 5.5*x**3/3 - 7.5*x**2/2
def dJ(x):
    """Return the first derivative of the objective ``J`` at ``x``.

    dJ(x) = x^5 - 5.5*x^4 + 6.5*x^3 + 5.5*x^2 - 7.5*x
          = x * (x + 1) * (x - 1) * (x - 2.5) * (x - 3)

    so the stationary points are x = -1, 0, 1, 2.5 and 3.
    Works element-wise on NumPy arrays as well as on plain numbers.
    """
    return x**5 - 5.5*x**4 + 6.5*x**3 + 5.5*x**2 - 7.5*x
def ddJ(x):
    """Return the second derivative of the objective ``J`` at ``x``.

    ddJ(x) = 5*x^4 - 22*x^3 + 19.5*x^2 + 11*x - 7.5

    Works element-wise on NumPy arrays as well as on plain numbers.
    """
    return 5*x**4 - 22*x**3 + 19.5*x**2 + 11*x - 7.5
# Sample [-2, 5] and draw the objective together with its first two
# derivatives on a single figure.
x = np.linspace(-2, 5, 100)
plt.figure(figsize=(8, 6))
for _fn, _label in ((J, 'J(x)'), (dJ, 'dJ(x)'), (ddJ, 'ddJ(x)')):
    plt.plot(x, _fn(x), label=_label)
plt.title('J(x) and dJ(x) and ddJ(x)')
plt.xlabel('x')
plt.ylabel('y')
# Clip the y-range so the large values near the interval ends do not
# flatten the interesting region around the stationary points.
plt.ylim(-5, 20)
plt.legend()
plt.show()
可视化迭代过程
# 创建一个动画,将梯度下降的过程可视化,这里使用的是matplotlib的animation模块
from matplotlib import animation
from IPython.display import HTML
def show_animation(x_history):
    """Render an optimization trajectory as an embedded HTML5 video.

    Each frame shows one point ``(x, J(x))`` for successive entries of
    ``x_history`` on fixed axes (x in [-2, 5], y in [-5, 30]).

    Args:
        x_history: Sequence of scalar positions, one per iteration.

    Returns:
        An ``IPython.display.HTML`` object wrapping the video produced by
        ``FuncAnimation.to_html5_video()``.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = plt.axes(xlim=(-2, 5), ylim=(-5, 30))
    line, = ax.plot([], [], 'bo', lw=2, label='point')
    plt.legend()

    def init():
        line.set_data([], [])
        return line,

    def animate(frame):
        pos = x_history[frame]
        # Line2D.set_data expects sequences; recent Matplotlib releases
        # reject bare scalars, so wrap the single point in lists.
        line.set_data([pos], [J(pos)])
        plt.title('iteration = {}'.format(frame))
        return line,

    anim = animation.FuncAnimation(
        fig, animate, init_func=init,
        frames=len(x_history),  # one frame per recorded position
        interval=40,            # delay between frames, in milliseconds
        blit=True,              # only redraw the artists that changed
    )
    video = anim.to_html5_video()
    # The video is fully rendered at this point; close the figure so that
    # repeated calls do not accumulate open figures.
    plt.close(fig)
    return HTML(video)
# Quick check: len() of a dict equals len() of its key view.
x_history = {
    name: [1, 2, 3]
    for name in ('AdaGrad', 'SGD', 'Momentum', 'Nesterov', 'Adam')
}
print(len(x_history.keys()))
print(len(x_history))
5
5
SGD
简单实现
# Plain gradient descent on J, written out by hand.
alpha = 0.05  # learning rate
x = 4         # starting position
x_history = [x]
for i in range(200):
    # Step against the gradient.
    x = x - alpha * dJ(x)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
-1.0
Your browser does not support the video tag.
优化器实现
# Same descent as the hand-written SGD loop, driven by the SGD optimizer
# class imported from common.optimizer.
init_pos = 4
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = SGD(lr=0.05)
for i in range(200):
    # Record the position before the step, then refresh the gradient.
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    # NOTE(review): update() is presumably expected to modify params in
    # place -- the loop relies on params['x'] changing; confirm against
    # common.optimizer.
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
-1.0
Your browser does not support the video tag.
Momentum
简单实现
# Gradient descent with momentum on J, written out by hand.
v = 0         # velocity
alpha = 0.02  # learning rate
beta = 0.9    # momentum coefficient (fraction of velocity kept per step)
x = 4         # starting position
x_history = [x]
for i in range(200):
    # Decay the previous velocity and add the new gradient step.
    v = beta * v - alpha * dJ(x)
    x = x + v
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
-0.9999435339008631
Your browser does not support the video tag.
优化器实现
# Same descent as the hand-written momentum loop, driven by the Momentum
# optimizer class imported from common.optimizer.
init_pos = 4
params = {}
params['x'] = init_pos
grads = {}
x_history = []
# Only lr is passed; the momentum coefficient is left at the class default
# (presumably 0.9, as in the hand-written version -- verify in
# common.optimizer).
optimizer = Momentum(lr=0.02)
for i in range(200):
    # Record the position before the step, then refresh the gradient.
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
-0.9999435339008631
Your browser does not support the video tag.
AdaGrad
简单实现
# AdaGrad on J, written out by hand.
alpha = 0.5  # base learning rate
x = 4        # starting position
h = 0        # running sum of squared gradients
x_history = [x]
for i in range(200):
    # Evaluate the gradient once; it is used for both the accumulator
    # and the step.
    g = dJ(x)
    h = h + g**2
    # Scale the step by 1/sqrt(h); 1e-7 guards against division by zero.
    x = x - alpha * g / (np.sqrt(h)+1e-7)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
3.0000003436659832
Your browser does not support the video tag.
优化器实现
# Same descent as the hand-written AdaGrad loop, driven by the AdaGrad
# optimizer class imported from common.optimizer.
# NOTE(review): the start value is a NumPy float64 here rather than a plain
# int as in the SGD/Momentum cells -- presumably required by the AdaGrad
# implementation; confirm against common.optimizer.
init_pos = np.float64(4)
params = {}
params['x'] = init_pos
grads = {}
x_history = []
optimizer = AdaGrad(lr=0.5)
for i in range(200):
    # Record the position before the step, then refresh the gradient.
    x_history.append(params['x'])
    grads['x'] = dJ(params['x'])
    optimizer.update(params, grads)
print(params['x'])
show_animation(x_history)
3.0000003436659832
Your browser does not support the video tag.
小结
从上面的结果可以看到,AdaGrad的有效学习率在迭代过程中不断减小:它会累加每个参数历史梯度的平方,并用该累加值的平方根去缩放步长,累加值越大,步长就越小。正因如此,我们可以在一开始使用一个较大的学习率,从而加快前期的学习速度。但是它也有个缺点:我们可以从视频中看到它飞快地收敛到了局部最优(x≈3)。这是值得我们思考的地方:我们能不能利用AdaGrad的优点,同时避免其缺点呢?即用较大的学习率加快前期学习,同时引入动量的概念,帮助其越过较浅的局部最优、继续朝着更优的解前进呢?这就是Adam的思想。
Adam
简单实现
# Adam on J, written out by hand.
alpha = 0.5    # base learning rate
beta1 = 0.9    # decay rate of the first-moment (mean) estimate
beta2 = 0.999  # decay rate of the second-moment (squared-gradient) estimate
m = 0          # first-moment estimate
v = 0          # second-moment estimate
x = 4          # starting position
x_history = [x]
for i in range(200):
    # Evaluate the gradient once; both moment updates use it.
    g = dJ(x)
    # Exponential moving averages of the gradient and its square.
    m += (1-beta1) * (g - m)
    v += (1-beta2) * (g**2 - v)
    # Fold the bias correction of both moments into the step size
    # (i is 0-based, hence the exponent i+1).
    lr_t = alpha * np.sqrt(1-beta2**(i+1)) / (1-beta1**(i+1))
    x = x - lr_t * m / (np.sqrt(v)+1e-7)
    x_history.append(x)
print(x_history[-1])
show_animation(x_history)
0.9999331921434963