This note mainly walks through code implementations of the mainstream neural network optimization algorithms: SGD, SGD with momentum, AdaGrad, RMSProp, and Adam.
SGD
while True:
    dx = compute_gradient(x)
    x -= learning_rate * dx
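To see the loop run end to end, here is a minimal self-contained sketch; the toy quadratic loss, the compute_gradient implementation, the starting point, and the learning rate are assumptions made up for illustration, not part of the original notes:

import numpy as np

learning_rate = 0.1

def compute_gradient(x):
    # gradient of the toy loss f(x) = 0.5 * ||x||^2 (illustrative assumption)
    return x

x = np.array([5.0, -3.0])
for _ in range(100):          # bounded loop instead of while True
    dx = compute_gradient(x)
    x -= learning_rate * dx
print(x)                      # approaches the minimum at [0, 0]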
SGD + Momentum
v = 0
rv = 0.9            # momentum coefficient (often written rho)
while True:
    dx = compute_gradient(x)
    v = rv * v + dx                  # velocity accumulates along consistent gradient directions
    x -= learning_rate * v
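One way to see what the velocity buys you: if the gradient points the same way every step, v converges to dx / (1 - rv), i.e. a 10x larger effective step at rv = 0.9, while gradients that flip sign largely cancel out. A tiny sketch with a made-up constant gradient:

rv, dx = 0.9, 1.0        # illustrative values: constant gradient of 1.0
v = 0.0
for _ in range(100):
    v = rv * v + dx
print(v)                 # ~10.0, i.e. dx / (1 - rv)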
AdaGrad
grad_square = 0
while True:
    dx = compute_gradient(x)
    grad_square += dx * dx           # running sum of squared gradients
    x -= learning_rate * dx / (sqrt(grad_square) + 1e-7)
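Because grad_square only ever grows, the effective step size learning_rate / (sqrt(grad_square) + 1e-7) shrinks monotonically and can stall long training runs; this is what RMSProp below addresses. A short numeric sketch with made-up values:

from math import sqrt

learning_rate, grad_square = 1e-2, 0.0
for t in range(1, 6):
    dx = 1.0                         # pretend the gradient stays constant
    grad_square += dx * dx
    print(t, learning_rate / (sqrt(grad_square) + 1e-7))   # decays roughly like 1/sqrt(t)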
RMSProp
grad_square = 0
decay_rate = 0.9
while True:
    dx = compute_gradient(x)
    # exponential moving average of squared gradients (not a running sum as in AdaGrad)
    grad_square = decay_rate * grad_square + (1 - decay_rate) * dx * dx
    x -= learning_rate * dx / (sqrt(grad_square) + 1e-7)
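With the exponential moving average, grad_square stays bounded instead of growing without limit; under a constant gradient it settles near dx ** 2, so the effective step levels off rather than decaying to zero. A sketch with illustrative values:

from math import sqrt

decay_rate, grad_square = 0.9, 0.0
for _ in range(200):
    dx = 2.0                                      # constant gradient, purely for the demo
    grad_square = decay_rate * grad_square + (1 - decay_rate) * dx * dx
print(grad_square)                                # ~4.0, i.e. dx ** 2
print(1e-2 / (sqrt(grad_square) + 1e-7))          # effective step stabilizes near 5e-3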
Adam
first_momentum, second_momentum = 0, 0
beta1, beta2 = 0.9, 0.999
epochs = 1000
for e in range(1, epochs + 1):       # start at 1 so the bias-correction denominators are nonzero
    dx = compute_gradient(x)
    # Momentum
    first_momentum = beta1 * first_momentum + (1 - beta1) * dx
    second_momentum = beta2 * second_momentum + (1 - beta2) * dx * dx
    # Bias correction
    first_unbias = first_momentum / (1 - beta1 ** e)
    second_unbias = second_momentum / (1 - beta2 ** e)
    # AdaGrad/RMSProp-style scaling
    x -= learning_rate * first_unbias / (sqrt(second_unbias) + 1e-7)
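The bias correction is the easiest part to gloss over: because both moment estimates start at zero, the raw averages underestimate the true gradient statistics in the first steps, and dividing by (1 - beta ** e) undoes that. A small check with a made-up constant gradient:

beta1 = 0.9
first_momentum = 0.0
dx = 1.0                             # assume the true gradient is constantly 1.0
for e in range(1, 4):
    first_momentum = beta1 * first_momentum + (1 - beta1) * dx
    first_unbias = first_momentum / (1 - beta1 ** e)
    print(e, round(first_momentum, 3), round(first_unbias, 3))
# at e = 1 the raw average is 0.1 (biased toward the zero init),
# while the corrected value is already 1.0, the true gradient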