深度学习-*-梯度优化算法及代码实例

8 篇文章 0 订阅
14 篇文章 0 订阅

1 梯度下降简单说明

1.3 Momentum梯度下降

$a_{100}=\frac{d_{1}+d_{2}+\cdots+d_{100}}{100} \qquad (1.1)$

$v_{100}=\beta*v_{99}+(1-\beta)*d_{100}$
$v_{99}=\beta*v_{98}+(1-\beta)*d_{99}$
$\cdots$
$v_{1}=\beta*v_{0}+(1-\beta)*d_{1}$

$V_{dW}=\beta*V_{dW}+(1-\beta)*dW$
$V_{db}=\beta*V_{db}+(1-\beta)*db$
$W:=W-\alpha*V_{dW}$
$b:=b-\alpha*V_{db}$

1.4 RMSProp

$S_{dW}=\beta*S_{dW}+(1-\beta)*(dW)^2$
$S_{db}=\beta*S_{db}+(1-\beta)*(db)^2$
$W:=W-\alpha*\frac{dW}{\sqrt{S_{dW}}+\epsilon}$
$b:=b-\alpha*\frac{db}{\sqrt{S_{db}}+\epsilon}$

$V_{dW}=\beta_{1}*V_{dW}+(1-\beta_{1})*dW \quad V_{db}=\beta_{1}*V_{db}+(1-\beta_{1})*db$
$S_{dW}=\beta_{2}*S_{dW}+(1-\beta_{2})*(dW)^2 \quad S_{db}=\beta_{2}*S_{db}+(1-\beta_{2})*(db)^2$
$V^{correct}_{dW}=\frac{V_{dW}}{1-\beta_{1}^t} \quad V^{correct}_{db}=\frac{V_{db}}{1-\beta_{1}^t}$
$S^{correct}_{dW}=\frac{S_{dW}}{1-\beta_{2}^t} \quad S^{correct}_{db}=\frac{S_{db}}{1-\beta_{2}^t}$
$W:=W-\alpha*\frac{V^{correct}_{dW}}{\sqrt{S^{correct}_{dW}}+\epsilon}$
$b:=b-\alpha*\frac{V^{correct}_{db}}{\sqrt{S^{correct}_{db}}+\epsilon}$

2 Python代码及结果

#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
Python 3.6
Author LiHao
Time 2018/10/12 10:28
"""
import numpy as np
import random
from math import sqrt
"""

"""
def random_num():
    """Generate 50 noisy samples of y = x**2 + 2*x + 1 over [-5, 5).

    Each sample is a ``[x, y]`` pair where ``y`` carries uniform noise
    drawn from [-0.3, 0.3].
    """
    noisy = lambda x: x ** 2 + 2 * x + 1 + random.uniform(-0.3, 0.3)
    return [[x, noisy(x)] for x in np.arange(-5, 5, 0.2)]

"""
利用数据求解 求解一元二次方程 x**2 + 2*x + 1的系数
分别利用
梯度下降法/Mini-Batch梯度下降法
动量梯度下降法
RMSProp
"""
def __init__(self,alpha = 0.001, error = 1e-03,A=0.1,B=0.1,C=0.1,data_num=100,max_iter=30000):
    """Set up the solver state.

    :param alpha: step size (learning rate)
    :param error: convergence threshold on the mean coefficient error
    :param A: initial estimate of the quadratic coefficient
    :param B: initial estimate of the linear coefficient
    :param C: initial estimate of the constant coefficient
    :param data_num: number of 50-sample batches to generate
    :param max_iter: iteration cap for the solver loops
    """
    self._alpha = alpha
    self._error = error
    self._max_iter = max_iter
    # Load data_num batches of noisy samples of the target polynomial.
    self.data_num = data_num
    self._data = self.inputs()
    # Current coefficient estimates (updated by the solvers) ...
    self.A, self.B, self.C = A, B, C
    # ... and the fixed ground-truth coefficients of x**2 + 2*x + 1,
    # used only to measure convergence.
    self._A, self._B, self._C = 1, 2, 1

def inputs(self):
    """Concatenate self.data_num batches of noisy samples into one flat list."""
    return [point for _ in range(self.data_num) for point in random_num()]

def get_y_hat(self,mini_data):
    """Predict y-hat = A*x**2 + B*x + C for every x in the minibatch.

    :param mini_data: 2-D array whose first column holds the x values
    :return: 1-D array of predictions
    """
    xs = mini_data[:, 0]
    return self.A * xs ** 2 + self.B * xs + self.C

# Tail of the plain minibatch gradient-descent update step — the enclosing
# method's "def" line was lost in extraction, so the code is kept verbatim.
# Each coefficient moves against the mean-squared-error gradient with step
# size self._alpha; m is the minibatch size, mini_data[:,1] the targets.
self.A = self.A - self._alpha * np.sum((y_hat-mini_data[:,1])*np.power(mini_data[:,0],2))/m
self.B = self.B - self._alpha * np.sum((y_hat-mini_data[:,1])*mini_data[:,0])/m
self.C = self.C - self._alpha * np.sum(y_hat-mini_data[:,1])/m

"""
minibatch-梯度下降
:param m:
:param error:
:return:
"""
self._error = error
all_lens = len(self._data)
if all_lens % m ==0:
epoch = int(all_lens/m)
else:
epoch = int(all_lens/m) + 1
Error = 1.0
count = 1
while(Error>self._error and count < self._max_iter):
#分批次求解 随机在数据集中选取一个epoch，里面含有m个数据，不满m的取剩余的数据
ie = random.randint(0,epoch-1)
mini_data = np.array(self._data[ie*m:(ie+1)*m],dtype=np.float32)
current_m = mini_data.shape[0]
y_hat = self.get_y_hat(mini_data)
Error = (abs(self.A - self._A) + abs(self.B - self._B) + abs(self.C - self._C)) / 3
#print("LH -*- epoch: ",ie,"\tloss : ",Error," A,B,C:",self.A,self.B,self.C)
count += 1
print("LH -*- Minibatch -*-Final A,B,C,iter:",self.A,self.B,self.C,count," error:",Error)

"""
Momentum gradient-update step.
(The docstring quotes were mangled into fullwidth characters and the
enclosing "def" line was lost in extraction.)
pre_va/pre_vb/pre_vc are the previous velocity terms; beta is the
exponential-decay factor in V = beta*V_prev + (1-beta)*grad.
"""
# Plain gradients of the MSE w.r.t. A, B, C over the minibatch.
da =  np.sum((y_hat - mini_data[:, 1]) * np.power(mini_data[:, 0], 2)) / m
db =  np.sum((y_hat - mini_data[:, 1]) * mini_data[:, 0]) / m
dc =  np.sum(y_hat - mini_data[:, 1]) / m
# Exponentially weighted moving average of the gradients (the "velocity").
va = da * (1 - beta) + beta * pre_va
vb = db * (1 - beta) + beta * pre_vb
vc = dc * (1 - beta) + beta * pre_vc
self.A = self.A - self._alpha * va
self.B = self.B - self._alpha * vb
self.C = self.C - self._alpha * vc
return va,vb,vc

"""
动量梯度下降
较之梯度下降更加快速
当前步的梯度与历史的梯度方向有关
:param m:
:param beta:
:param error:
:return:
"""
self._error = error
all_lens = len(self._data)
if all_lens % m == 0:
epoch = int(all_lens / m)
else:
epoch = int(all_lens / m) + 1
Error = 1.0
count = 1
pre_va = 0.0
pre_vb = 0.0
pre_vc = 0.0
while (Error >= self._error and count < self._max_iter):
ie = random.randint(0,epoch-1)
mini_data = np.array(self._data[ie * m:(ie + 1) * m], dtype=np.float32)
current_m = mini_data.shape[0]
y_hat = self.get_y_hat(mini_data)
Error =(abs(self.A-self._A)+abs(self.B-self._B)+abs(self.C-self._C))/3
#print("LH -*- epoch: ", ie, "\terror : ", Error, " A,B,C:", self.A, self.B, self.C)
count += 1
print("LH -*- Momentum -*-Final A,B,C,iter:", self.A, self.B, self.C, count, " error:", Error)

# RMSProp update step (the enclosing "def" line was lost in extraction;
# per the call at the driver below it is self.rmsprop_gradient(...)).
# S = beta*S_prev + (1-beta)*grad**2, and each coefficient steps by
# alpha * grad / (sqrt(S) + eps).
da = np.sum((y_hat - mini_data[:, 1]) * np.power(mini_data[:, 0], 2)) / m
db = np.sum((y_hat - mini_data[:, 1]) * mini_data[:, 0]) / m
dc = np.sum(y_hat - mini_data[:, 1]) / m
# Exponentially weighted moving average of the squared gradients.
sa = da**2 * (1 - beta) + beta * pre_sa
sb = db**2 * (1 - beta) + beta * pre_sb
sc = dc**2 * (1 - beta) + beta * pre_sc
self.A = self.A - self._alpha * da / (sqrt(sa) + eps)
self.B = self.B - self._alpha * db / (sqrt(sb) + eps)
self.C = self.C - self._alpha * dc / (sqrt(sc) + eps)
return sa, sb, sc

"""
RMSProp梯度下降
自适应更新学习速率
:param m:minibatch
:param beta:超参数
:param error:误差
:return:
"""
self._error = error
all_lens = len(self._data)
if all_lens % m == 0:
epoch = int(all_lens / m)
else:
epoch = int(all_lens / m) + 1
# 进行分批次求解
Error = 1.0
count = 1
pre_sa = 0.0
pre_sb = 0.0
pre_sc = 0.0
while (Error >= self._error and count < self._max_iter):
ie = random.randint(0,epoch-1)
mini_data = np.array(self._data[ie * m:(ie + 1) * m], dtype=np.float32)
current_m = mini_data.shape[0]
y_hat = self.get_y_hat(mini_data)
Error = (abs(self.A - self._A) + abs(self.B - self._B) + abs(self.C - self._C)) / 3
# print("LH -*- epoch: ", ie, "\terror : ", Error, " A,B,C:", self.A, self.B, self.C)
pre_sa, pre_sb, pre_sc = self.rmsprop_gradient(mini_data, y_hat, current_m, pre_sa, pre_sb, pre_sc,
beta)
count += 1
print("LH -*- RMSProp -*- Final A,B,C,iter:", self.A, self.B, self.C, count, " error:", Error)

# Adam update step (the enclosing "def" line was lost in extraction;
# presumably it receives mini_data, y_hat, m, the pre_* accumulators,
# beta_1, beta_2 and the iteration count — TODO confirm).
# Bias-correction denominators (1 - beta^t), with count as the step t.
norm_value_1 = 1-pow(beta_1,count)
norm_value_2 = 1-pow(beta_2,count)
# Gradients da, db, dc of the MSE over the minibatch.
da = np.sum((y_hat - mini_data[:, 1]) * np.power(mini_data[:, 0], 2)) / m
db = np.sum((y_hat - mini_data[:, 1]) * mini_data[:, 0]) / m
dc = np.sum(y_hat - mini_data[:, 1]) / m
# First moment (momentum terms) va, vb, vc.
va = da * (1 - beta_1) + beta_1 * pre_va
vb = db * (1 - beta_1) + beta_1 * pre_vb
vc = dc * (1 - beta_1) + beta_1 * pre_vc
va_correct = va / norm_value_1
vb_correct = vb / norm_value_1
vc_correct = vc / norm_value_1
# Second moment (RMSProp terms) sa, sb, sc.
sa = da ** 2 * (1 - beta_2) + beta_2 * pre_sa
sb = db ** 2 * (1 - beta_2) + beta_2 * pre_sb
sc = dc ** 2 * (1 - beta_2) + beta_2 * pre_sc
sa_correct = sa / norm_value_2
sb_correct = sb / norm_value_2
sc_correct = sc / norm_value_2
# Bias-corrected Adam step per coefficient.
self.A = self.A - self._alpha * va_correct / (sqrt(sa_correct) + eps)
self.B = self.B - self._alpha * vb_correct / (sqrt(sb_correct) + eps)
self.C = self.C - self._alpha * vc_correct / (sqrt(sc_correct) + eps)
return va,vb,vc,sa,sb,sc

"""
结合动量及RMSProp
:param m:minibatch
:param error:误差
:return:
"""
self._error = error
all_lens = len(self._data)
if all_lens % m == 0:
epoch = int(all_lens / m)
else:
epoch = int(all_lens / m) + 1
Error = 1.0
count = 1
pre_sa = 0.0
pre_sb = 0.0
pre_sc = 0.0
pre_va = 0.0
pre_vb = 0.0
pre_vc = 0.0
while (Error >= self._error and count < self._max_iter):
ie = random.randint(0, epoch-1)
mini_data = np.array(self._data[ie * m:(ie + 1) * m], dtype=np.float32)
current_m = mini_data.shape[0]
y_hat = self.get_y_hat(mini_data)
Error = (abs(self.A - self._A) + abs(self.B - self._B) + abs(self.C - self._C)) / 3
# print("LH -*- epoch: ", ie, "\terror : ", Error, " A,B,C:", self.A, self.B, self.C)
count += 1
print("LH -*- Adam -*- Final A,B,C,iter:", self.A, self.B, self.C, count, " error:", Error)

if __name__ == '__main__':
# Run each optimizer in turn and compare the results.
# NOTE(review): the driver statements were lost in extraction; the log
# lines below are the sample output of the four methods.

LH -*- Minibatch -*-Final A,B,C,iter: 0.9998391374256507 2.000062286798814 0.994375273081942 15357  error: 0.0019482761020684913
LH -*- Momentum -*-Final A,B,C,iter: 1.000219900542389 2.000187069910188 0.9944682440600149 10864  error: 0.0019906410380585346
LH -*- RMSProp -*- Final A,B,C,iter: 1.000128957762776 2.000499604490694 1.0035405980946732 2109  error: 0.0017750440141715007
LH -*- Adam -*- Final A,B,C,iter: 1.000215617975376 2.0007904489544006 1.004995565781187 5911  error: 0.0019990631734925213


参考文献：

1.吴恩达.深度学习课程笔记v5.54.pdf(黄海广整理)
2.https://blog.csdn.net/u014595019/article/details/52989301
3.http://cs231n.github.io/neural-networks-3/

• 0
点赞
• 0
评论
• 1
收藏
• 一键三连
• 扫一扫，分享海报

07-17 1880
08-16 1237
05-31 1447
10-08 1万+
10-30 1万+
08-18 7979
02-12 2445
06-08 4593
05-14 8062
03-13 2160
03-18 6078
03-24 1万+