下面是各种优化器的对比效果,包含 gd, momentum, nesterov
Python 代码示例
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Objective function to be minimised
def f(x, y):
    """Return x^2 + 50 * y^2 evaluated at (x, y).

    Only multiplication and addition are used, so scalars and
    element-wise array operands are both accepted.
    """
    x_term = x * x
    y_term = 50 * y * y
    return x_term + y_term
# Gradient of f(x, y)
def g(x):
    """Return the gradient of f at the point x as a NumPy array.

    x is indexed as x[0] (the x coordinate) and x[1] (the y coordinate);
    the result is [2 * x[0], 100 * x[1]].
    """
    df_dx = 2 * x[0]
    df_dy = 100 * x[1]
    return np.array([df_dx, df_dy])
# Plain gradient descent
def gd(x_start, step, g, epochs=50, tol=1e-6):
    """Run plain gradient descent and record every point visited.

    Args:
        x_start: Starting point (array-like). It is copied, never modified.
        step: Learning rate multiplied with the gradient at each update.
        g: Callable that takes the current point and returns its gradient.
        epochs: Maximum number of iterations.
        tol: Iteration stops once the Euclidean norm of the gradient used
            for the latest update falls below this value.

    Returns:
        A list of NumPy arrays: the start point followed by the point
        reached after each iteration (independent copies).
    """
    x = np.array(x_start, dtype='float64')
    passing_dot = [x.copy()]  # trajectory, beginning with the start point
    for i in range(epochs):
        grad = g(x)
        x -= grad * step
        passing_dot.append(x.copy())
        print('[ Epoch {0} ] grad = {1}, x = {2}'.format(i, grad, x))
        # Test the gradient *norm*: summing the components would let
        # opposite-signed entries cancel and stop far from the minimum.
        if np.linalg.norm(grad) < tol:
            break
    return passing_dot
# Gradient descent with momentum
def momentum(x_start, step, g, discount=0.7, epochs=50, tol=1e-6):
    """Run gradient descent with momentum and record every point visited.

    Args:
        x_start: Starting point (array-like). It is copied, never modified.
        step: Learning rate multiplied with the accumulated gradient.
        g: Callable that takes the current point and returns its gradient.
        discount: Momentum factor applied to the previously accumulated
            gradient before the new gradient is added.
        epochs: Maximum number of iterations.
        tol: Iteration stops once the Euclidean norm of the latest gradient
            falls below this value.

    Returns:
        A list of NumPy arrays: the start point followed by the point
        reached after each iteration (independent copies).
    """
    x = np.array(x_start, dtype='float64')
    passing_dot = [x.copy()]  # trajectory, beginning with the start point
    pre_grad = np.zeros_like(x)  # accumulated (discounted) gradient, starts at zero
    for i in range(epochs):
        grad = g(x)
        pre_grad = pre_grad * discount + grad
        x -= pre_grad * step
        passing_dot.append(x.copy())
        print('[ Epoch {0} ] grad = {1}, x = {2}'.format(i, grad, x))
        # Test the gradient *norm*: summing the components would let
        # opposite-signed entries cancel and stop far from the minimum.
        if np.linalg.norm(grad) < tol:
            break
    return passing_dot
# Nesterov accelerated gradient (momentum with look-ahead)
def nesterov(x_start, step, g, discount=0.7, epochs=50, tol=1e-6):
    """Run Nesterov momentum and record every point visited.

    The gradient is evaluated at a look-ahead point (the current point
    moved along the discounted accumulated gradient) rather than at the
    current point itself.

    Args:
        x_start: Starting point (array-like). It is copied, never modified.
        step: Learning rate multiplied with the accumulated gradient.
        g: Callable that takes a point and returns the gradient there.
        discount: Momentum factor, used both for the look-ahead point and
            for discounting the accumulated gradient.
        epochs: Maximum number of iterations.
        tol: Iteration stops once the Euclidean norm of the latest gradient
            falls below this value.

    Returns:
        A list of NumPy arrays: the start point followed by the point
        reached after each iteration (independent copies).
    """
    x = np.array(x_start, dtype='float64')
    passing_dot = [x.copy()]  # trajectory, beginning with the start point
    pre_grad = np.zeros_like(x)  # accumulated (discounted) gradient, starts at zero
    for i in range(epochs):
        # Peek at where the accumulated momentum alone would carry x.
        x_future = x - step * discount * pre_grad
        grad = g(x_future)
        # Use the caller's discount here too (it was previously fixed at 0.7,
        # which disagreed with the look-ahead for any other discount value).
        pre_grad = pre_grad * discount + grad
        x -= pre_grad * step
        passing_dot.append(x.copy())
        print('[ Epoch {0} ] grad = {1}, x = {2}'.format(i, grad, x))
        # Test the gradient *norm*: summing the components would let
        # opposite-signed entries cancel and stop far from the minimum.
        if np.linalg.norm(grad) < tol:
            break
    return passing_dot
# Sample the plotting domain with 1000 evenly spaced points per axis:
# xi covers [-200, 200] and yi covers [-100, 100].
xi = np.linspace(start=-200, stop=200, num=1000)
yi = np.linspace(start=-100, stop=100, num=1000)
# Expand the two axes into 2-D coordinate grids for contour plotting.
X, Y = np.meshgrid(xi, yi)
# Value of f at every grid point; this is the height the contours are drawn from.
Z = f(X, Y)
def contour(X, Y, Z, arr=None, title=''):
    """Draw black contour lines of Z over the (X, Y) grid, plus an optional path.

    Args:
        X, Y: Coordinate grids passed straight to ``plt.contour``.
        Z: Values on the grid passed straight to ``plt.contour``.
        arr: Optional sequence of 2-D points (e.g. the list returned by an
            optimizer). Consecutive points are joined by line segments.
        title: Title of the plot.
    """
    plt.contour(X, Y, Z, colors='black')
    plt.plot(0, 0, marker='*')  # star marker at the origin
    plt.title(title)
    if arr is None:
        return
    points = np.array(arr)
    # Each step of the path is drawn with its own plot call,
    # i.e. as a separate line per segment.
    for k in range(len(points) - 1):
        segment = points[k:k + 2]
        plt.plot(segment[:, 0], segment[:, 1])
# Every run starts from the point (150, 70).
start_point = [150, 70]
# Which experiment to run; must be one of the keys of the table below.
select = 0
# select -> (optimizer, learning rate, plot title template)
_experiments = {
    0: (gd, 0.016, 'gd, step = {}'),
    1: (gd, 0.019, 'gd, step = {}'),
    2: (momentum, 0.016, 'momentum, step = {}'),
    3: (nesterov, 0.012, 'nesterov, step = {}'),
}
if select in _experiments:
    _optimizer, step, _title = _experiments[select]
    # Run the chosen optimizer and overlay its path on the contour plot.
    x_arr = _optimizer(start_point, step, g)
    contour(X, Y, Z, x_arr, _title.format(step))
首先是 gd, step = 0.016, epoch = 50, 起始点为 (150, 70)
把步长调大一点, step = 0.019,收敛速度略有提升
下面是 momentum,收敛速度有了明显提升,直接满足训练要求,但是波动有点大
接下来是 nesterov,在 momentum 的基础上进一步提升稳定性