AdaGrad
Unlike the two previous posts, with AdaGrad the learning rate changes explicitly over the course of training, and the learning rate is not the same in every dimension.
The concrete algorithm is shown in the figure above. The key idea is that it accumulates the squared gradients and uses that sum to scale the learning rate. In flat regions, where the gradient is small, the learning rate decreases only slightly; in steep regions, where the gradient is large, the learning rate drops relatively sharply, which smooths out the updates and speeds up training.
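As a reference, here is a minimal sketch of the core AdaGrad step for one parameter vector. The variable names (theta, gradient, r, alpha) mirror the implementation below; the function name adagrad_step and the argument eps are just illustrative choices, not part of the original code.
import numpy as np
def adagrad_step(theta, gradient, r, alpha=0.1, eps=1e-6):
    r = r + gradient * gradient                              # running sum of squared gradients, per dimension
    theta = theta - alpha / (np.sqrt(r) + eps) * gradient    # dimensions with large past gradients take smaller steps
    return theta, r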
Drawback: AdaGrad keeps the sum of all past squared gradients, so this accumulator only ever grows, the effective learning rate gets arbitrarily close to 0, and eventually the weights can no longer be updated. To address this, the RMSProp method is introduced below.
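To see why the growing accumulator is a problem, the following sketch (purely illustrative, assuming a constant gradient of 1) shows the effective step shrinking roughly like alpha/sqrt(t):
import numpy as np
alpha, eps = 0.1, 1e-6
r = 0.0
for t in range(1, 10001):
    g = 1.0                                           # constant gradient, for illustration only
    r += g * g                                        # after t steps, r == t
    if t in (1, 100, 10000):
        print(t, alpha / (np.sqrt(r) + eps) * g)      # ~0.1, ~0.01, ~0.001: the step keeps shrinking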
Python implementation
import numpy as np
import random
# 1. AdaGrad
def AdaGrad(x, y, alpha, theta, maxiterations, n, sigma=1e-6):
    r = 0
    for i in range(maxiterations):
        index_list = random.sample(range(y.size), n)  # randomly pick n sample indices
        x_select = x[index_list]                      # the sampled rows, shape (n, 3)
        h = x.dot(theta)                              # predictions, shape (10,)
        loss = h - y
        loss = loss[index_list]                       # loss on the n sampled points stands in for the full loss, shape (n,)
        gradient = np.dot(x_select.transpose(), loss) / n   # shape (3,)
        r += gradient * gradient                      # accumulated squared gradients, per dimension
        theta -= (alpha / (r ** 0.5 + sigma)) * gradient
        if loss.sum() == 0:
            print("Converged after {} iterations".format(i + 1))
            break
    return theta

def predict(x, theta):
    h = x.dot(theta)
    return h
#(10, 3)
trainData = np.array([[1.1,1.5,1],[1.3,1.9,1],[1.5,2.3,1],[1.7,2.7,1],[1.9,3.1,1],[2.1,3.5,1],[2.3,3.9,1],[2.5,4.3,1],[2.7,4.7,1],[2.9,5.1,1]])
trainLabel = np.array([2.5,3.2,3.9,4.6,5.3,6,6.7,7.4,8.1,8.8])
alpha = 0.4
maxIteration = 10000
_,n = trainData.shape
theta = np.ones(n)
theta = AdaGrad(trainData, trainLabel ,alpha,theta,maxIteration,5)
# Converged after 1888 iterations
#array([ 0.95221273, 1.27389363, -0.45827446])
predict(trainData,theta)
#array([2.5, 3.2, 3.9, 4.6, 5.3, 6. , 6.7, 7.4, 8.1, 8.8])
The original RMSProp
Since AdaGrad accumulates the entire gradient history, the learning rate eventually becomes very small and the weights can no longer be updated effectively. The original RMSProp therefore adds a decay coefficient that controls how much of the history is kept.
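A minimal sketch of the corresponding RMSProp step, with the same caveats as before (variable names match the implementation below; rmsprop_step and eps are illustrative names): the raw sum is replaced by an exponentially decayed average controlled by mu.
import numpy as np
def rmsprop_step(theta, gradient, r, alpha=0.01, mu=0.9, eps=1e-6):
    r = mu * r + (1 - mu) * gradient * gradient       # decayed average instead of a raw sum, so r stays bounded
    theta = theta - alpha / np.sqrt(r + eps) * gradient
    return theta, r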
Python implementation
import numpy as np
import random
# 2. RMSProp
def RMSProp(x, y, alpha, theta, maxiterations, n, sigma=1e-6, mu=0.9):
    r = 0
    for i in range(maxiterations):
        index_list = random.sample(range(y.size), n)  # randomly pick n sample indices
        x_select = x[index_list]                      # the sampled rows, shape (n, 3)
        h = x.dot(theta)                              # predictions, shape (10,)
        loss = h - y
        loss = loss[index_list]                       # loss on the n sampled points stands in for the full loss, shape (n,)
        gradient = np.dot(x_select.transpose(), loss) / n   # shape (3,)
        r = mu * r + (1 - mu) * gradient * gradient   # exponentially decayed average of squared gradients
        theta -= (alpha / ((r + sigma) ** 0.5)) * gradient
        if loss.sum() == 0:
            print("Converged after {} iterations".format(i + 1))
            break
    return theta

def predict(x, theta):
    h = x.dot(theta)
    return h
#(10, 3)
trainData = np.array([[1.1,1.5,1],[1.3,1.9,1],[1.5,2.3,1],[1.7,2.7,1],[1.9,3.1,1],[2.1,3.5,1],[2.3,3.9,1],[2.5,4.3,1],[2.7,4.7,1],[2.9,5.1,1]])
trainLabel = np.array([2.5,3.2,3.9,4.6,5.3,6,6.7,7.4,8.1,8.8])
alpha = 0.01
maxIteration = 5000
_,n = trainData.shape
theta = np.ones(n)
theta = RMSProp(trainData, trainLabel ,alpha,theta,maxIteration,5)
#array([ 0.9305065 , 1.28002522, -0.45210931])
predict(trainData,theta)
#array([2.49148567, 3.18959706, 3.88770845, 4.58581983, 5.28393122,
# 5.98204261, 6.680154 , 7.37826538, 8.07637677, 8.77448816])