有限内存SRI
对于有限内存SRI算法,利用的修正公式为:
H
k
=
H
0
+
(
S
k
−
H
0
Y
k
)
(
D
k
+
L
k
+
L
k
T
−
Y
k
T
H
0
Y
k
)
−
1
(
S
k
−
H
0
Y
k
)
T
H_{k} = H_{0} + (S_{k} - H_{0}Y_{k})(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}(S_{k} - H_{0}Y_{k})^T
Hk=H0+(Sk−H0Yk)(Dk+Lk+LkT−YkTH0Yk)−1(Sk−H0Yk)T
这里的
H
0
H_{0}
H0其实和上面的BFGS算法一样,也是可以任意取的,这里不同的是:
S
k
=
[
s
k
−
m
+
1
,
…
,
s
k
−
1
]
,
Y
k
=
[
y
k
−
m
+
1
,
…
,
y
k
−
1
]
S_{k} = [s_{k-m+1},\ldots,s_{k-1}],Y_{k} = [y_{k-m+1},\ldots,y_{k-1}]
Sk=[sk−m+1,…,sk−1],Yk=[yk−m+1,…,yk−1]
D
k
=
d
i
a
g
(
s
k
−
m
+
1
T
y
k
−
m
+
1
,
…
,
s
k
−
1
T
y
k
−
1
)
D_{k} = diag(s_{k-m+1}^{T}y_{k - m + 1},\ldots,s_{k-1}^{T}y_{k - 1})
Dk=diag(sk−m+1Tyk−m+1,…,sk−1Tyk−1)
( L k ) i , j = { ( s k − m − 1 + i ) T ( y k − m − 1 + j ) if i > j 0 otherwise \left(L_{k}\right)_{i, j}=\left\{\begin{array}{ll} \left(s_{k-m-1+i}\right)^{T}\left(y_{k-m-1+j}\right) & \text { if } i>j \\ 0 & \text { otherwise } \end{array}\right. (Lk)i,j={(sk−m−1+i)T(yk−m−1+j)0 if i>j otherwise
事实上,通过上面的公式,我们很容易发现,其实 D k + L k + L k T = S k Y k T D_{k} + L_{k} + L_{k}^{T}=S_{k}Y_{k}^{T} Dk+Lk+LkT=SkYkT,计算下降方向的流程如下:
q
=
g
k
q = g_k
q=gk,\
tmp =
H
0
H_{0}
H0,\
alpha =
(
S
k
−
H
0
Y
k
)
T
q
(S_{k} - H_{0}Y_{k})^{T}q
(Sk−H0Yk)Tq, \
D =
(
S
k
−
H
0
Y
k
)
(
D
k
+
L
k
+
L
k
T
−
Y
k
T
H
0
Y
k
)
−
1
(S_{k} - H_{0}Y_{k})(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}
(Sk−H0Yk)(Dk+Lk+LkT−YkTH0Yk)−1,\
d =
t
m
p
∗
g
k
+
D
∗
a
l
p
h
a
tmp*g_k + D*alpha
tmp∗gk+D∗alpha.\
这个计算过程中,如果
H
0
H_{0}
H0我们取对角矩阵,那么整个计算方向的过程就只需要存储一个
m
×
m
m\times m
m×m阶的矩阵,以及计算这个
m
×
m
m\times m
m×m阶矩阵的逆,这个运算量不大。下面开始展示数值结果。
关于这个Extended Freudenstein and Roth function函数和下面的Extended Rosenbrock function函数,在代码运行过程中,本人发现无法直接将上面两个函数的代码照搬过去,事实上,我通过修改goldstein函数里面超参数的选取,也无法得到正确的结果,在通过debug以后,本人发现针对这两个函数,goldstein求解步长失效,无法得到满意的步长,导致溢出。
Penalty函数
import torch
import numpy as np
import time
def FF(X):
n = X.shape[1]
al = 1e-5
return al*((X - 1)**2).sum() + ((X**2).sum() - 0.25)**2
def GG(X):
n = X.shape[1]
al = 1e-5
return 2*al*(X - 1) + 4*X*((X**2).sum() - 0.25)
def HH(X):
n = X.shape[1]
al = 1e-5
return 8*X.T@X + 4*((X**2).sum() - 0.25)*np.eye(n,n)
def gold(FF,GG,d,X):
rho = 0.4
am = 0;aM = 1;al = 0.5*(am + aM)
for i in range(100):
rig1 = FF(X) + rho*al*GG(X)@d.T
rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T
lef = FF(X + al*d)
if lef <= rig1:
if lef >= rig2:
break
else:
am = al
al = am + 0.6*(aM - am)
else:
aM = al
al = am + 0.6*(aM - am)
al_k = al
#print(i)
return al_k,2*(i + 1)
def SRI(FF,GG,HH,X,m,N):
tic = time.time()
eps = 1e-5
n = X.shape[1]
fun_iter = 0
X_new = np.zeros([1,n])
S = np.zeros([m,n])
Y = np.zeros([m,n])
alpha = np.zeros(m)
for i in range(N):
if i <= m:
if i == 0:
al_k,ite = gold(FF,GG,-GG(X),X)
fun_iter += ite
X_new = X - GG(X)*al_k
else:
s = X_new - X
y = GG(X_new) - GG(X)
S[i - 1:i,:] = s
Y[i - 1:i,:] = y
#d = np.linalg.solve(HH(X_new),-GG(X_new).T)
d = -GG(X).T
al_k,ite = gold(FF,GG,d.T,X_new)
#print(np.linalg.norm(X))
X = X_new.copy()
X_new = X_new - al_k*GG(X_new)
fun_iter += ite
else:
s = X_new - X
y = GG(X_new) - GG(X)
S[0:m - 1,:] = S[1:m,:]
Y[0:m - 1,:] = Y[1:m,:]
S[m - 1:m,:] = s
Y[m - 1:m,:] = y
tmp = (s*y).sum()/(y*y).sum()
q = GG(X_new)
#H_{k} = H_{0} + (S_{k} - H_{0}Y_{k})@(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}(S_{k} - H_{0}Y_{k})^T
alpha = np.dot(q,(S - tmp*Y).T)#这里计算g_{k}(S_{k} - H_{0}Y_{k})^T
D = np.dot(np.linalg.inv(np.dot(S,Y.T) - tmp*np.dot(Y,Y.T)),(S - tmp*Y))
#这里计算D = (S_{k} - H_{0}Y_{k})@(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}
r = tmp*q + np.dot(alpha,D)
al_k,ite = gold(FF,GG,-r,X_new)
fun_iter += ite
X = X_new.copy()
X_new = X_new + al_k*(-r)
left = np.linalg.norm(GG(X))
rig = max(1,np.linalg.norm(X))*eps
if left < rig:
break
ela = time.time() - tic
print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
return X_new
n = 1000
m = 15
X = np.zeros([1,n])
for i in range(n):
X[:,i] = i + 1
X_new = SRI(FF,GG,HH,X,m,400)
#print(X_new)
print(FF(X_new))
Trigonometric function
import torch
import numpy as np
import time
def FF(X):
n = X.shape[1]
return ((n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X))**2).sum()
def GG(X):
n = X.shape[1]
#vec = np.zeros_like(X)
tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
return 2*np.sin(X)*tmp.sum() + 2*tmp*(np.arange(1,n + 1)*np.sin(X) - np.cos(X))
def HH(X):
n = X.shape[1]
mat = np.zeros([n,n])
tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
dia = 2*np.cos(X)*tmp.sum() + 2*np.sin(X)*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) +\
2*((np.arange(1,n + 1) + 1)*np.sin(X) - np.cos(X))*(np.arange(1,n + 1)*np.sin(X) - np.cos(X)) + \
+ 2*tmp*(np.sin(X) + np.arange(1,n + 1)*np.cos(X))
mat = 2*np.sin(X).T*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) + \
(np.arange(1,n + 1)*np.sin(X) - np.cos(X)).T*2*np.sin(X)
mat.flat[::(n + 1)] = dia
return mat
def gold(FF,GG,d,X):
rho = 0.4
am = 0;aM = 1;al = 0.5*(am + aM)
for i in range(100):
rig1 = FF(X) + rho*al*GG(X)@d.T
rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T
lef = FF(X + al*d)
if lef <= rig1:
if lef >= rig2:
break
else:
am = al
al = am + 0.6*(aM - am)
else:
aM = al
al = am + 0.6*(aM - am)
al_k = al
#print(i)
return al_k,2*(i + 1)
def SRI(FF,GG,HH,X,m,N):
tic = time.time()
eps = 1e-5
n = X.shape[1]
fun_iter = 0
X_new = np.zeros([1,n])
S = np.zeros([m,n])
Y = np.zeros([m,n])
alpha = np.zeros(m)
for i in range(N):
if i <= m:
if i == 0:
al_k,ite = gold(FF,GG,-GG(X),X)
fun_iter += ite
X_new = X - GG(X)*al_k
else:
s = X_new - X
y = GG(X_new) - GG(X)
S[i - 1:i,:] = s
Y[i - 1:i,:] = y
#d = np.linalg.solve(HH(X_new),-GG(X_new).T)
d = -GG(X).T
al_k,ite = gold(FF,GG,d.T,X_new)
#print(np.linalg.norm(X))
X = X_new.copy()
X_new = X_new - al_k*GG(X_new)
fun_iter += ite
else:
s = X_new - X
y = GG(X_new) - GG(X)
S[0:m - 1,:] = S[1:m,:]
Y[0:m - 1,:] = Y[1:m,:]
S[m - 1:m,:] = s
Y[m - 1:m,:] = y
tmp = (s*y).sum()/(y*y).sum()
q = GG(X_new)
#H_{k} = H_{0} + (S_{k} - H_{0}Y_{k})@(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}(S_{k} - H_{0}Y_{k})^T
alpha = np.dot(q,(S - tmp*Y).T)#这里计算g_{k}(S_{k} - H_{0}Y_{k})^T
D = np.dot(np.linalg.inv(np.dot(S,Y.T) - tmp*np.dot(Y,Y.T)),(S - tmp*Y))
#这里计算D = (S_{k} - H_{0}Y_{k})@(D_{k} + L_{k} + L_{k}^{T} - Y_{k}^{T}H_{0}Y_{k})^{-1}
r = tmp*q + np.dot(alpha,D)
al_k,ite = gold(FF,GG,-r,X_new)
fun_iter += ite
X = X_new.copy()
X_new = X_new + al_k*(-r)
left = np.linalg.norm(GG(X))
rig = max(1,np.linalg.norm(X))*eps
if left < rig:
break
ela = time.time() - tic
print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
return X_new
n = 1000
m = 5
X = np.zeros([1,n])
for i in range(n):
X[:,i] = i + 1
X_new = SRI(FF,GG,HH,X,m,400)
#print(X_new)
print(FF(X_new))