Limited-memory BFGS
Each iteration of the BFGS algorithm takes the form:
$$x_{k+1} = x_{k} - \alpha_{k} H_{k} g_{k}$$
$$H_{k+1} = V_{k}^{T} H_{k} V_{k} + \rho_{k} s_{k} s_{k}^{T}, \qquad \rho_{k} = \frac{1}{y_{k}^{T} s_{k}}, \qquad V_{k} = I - \rho_{k} y_{k} s_{k}^{T},$$
$$s_{k} = x_{k+1} - x_{k}, \qquad y_{k} = g_{k+1} - g_{k}.$$
For a detailed introduction, see the linked reference.
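The update above is what the limited-memory variant applies implicitly: instead of storing $H_k$, it keeps only the most recent $m$ pairs $(s_j, y_j)$ and rebuilds the product $H_k g_k$ with the two-loop recursion. A minimal standalone sketch of that recursion (the name lbfgs_direction is illustrative; S and Y are lists of 1 x n arrays with at least one stored pair, oldest first, matching the row-vector convention used in the code later in this post):

import numpy as np

def lbfgs_direction(g, S, Y):
    # Backward pass: start from q = g and subtract the projections onto the stored y_j.
    q = g.copy()
    alpha = [0.0]*len(S)
    for j in range(len(S) - 1, -1, -1):
        alpha[j] = (S[j]*q).sum()/(Y[j]*S[j]).sum()
        q = q - alpha[j]*Y[j]
    # Initial Hessian approximation H0 = gamma*I with the usual scaling.
    s, y = S[-1], Y[-1]
    r = (s*y).sum()/(y*y).sum()*q
    # Forward pass: rebuild r = H*g from the stored pairs.
    for j in range(len(S)):
        beta = (Y[j]*r).sum()/(Y[j]*S[j]).sum()
        r = r + S[j]*(alpha[j] - beta)
    return -r    # search direction -H*g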
Inexact Newton method
For the inexact Newton method we consider the choice of the descent direction directly. The classical Newton method solves the following system exactly:
$$G_{k} d = -g_{k}$$
The idea instead is to compute, by some iterative method, an approximate solution $d_k$ satisfying
$$\| G_{k} d_{k} + g_{k} \| \leq \eta_{k} \| g_{k} \|.$$
Here we obtain $d_k$ with Gauss-Seidel (G-S) iteration. The choice of the forcing term $\eta$ differs somewhat between test functions; we consider the following two rules.
The first rule, $\eta_1$, takes
$$\eta_k = \min\{0.5, \ \| g_k \| \},$$
and the second rule, $\eta_2$, takes
$$\eta_k = \max\{\gamma \| g_k \| / \| g_{k-1} \|, \ \gamma \eta_{k-1}\},$$
with $\gamma = 0.2$. The G-S iteration is run for at most 100 sweeps.
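As a concrete reading of the two rules, here is a small sketch of them as Python helpers (the names eta_rule1 and eta_rule2 are illustrative, and ordinary 2-norms are used):

import numpy as np

def eta_rule1(g):
    # eta_k = min(0.5, ||g_k||)
    return min(0.5, np.linalg.norm(g))

def eta_rule2(g, g_prev, eta_prev, gamma=0.2):
    # eta_k = max(gamma*||g_k||/||g_{k-1}||, gamma*eta_{k-1})
    return max(gamma*np.linalg.norm(g)/np.linalg.norm(g_prev), gamma*eta_prev)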
Test functions
Note that here we write $X = (x_0, \ldots, x_{n-1})$, where $X \in R^{n}$. Each test problem minimizes
$$f = \sum_{i=1}^{m} f_{i}^{2}(x_0, x_1, \ldots, x_{n-1}).$$
∙ Penalty I function: $\alpha = 10^{-5}$,
$$f_{i}(X) = \alpha^{\frac{1}{2}}(x_i - 1), \quad 0 \leq i \leq n-1, \qquad f_{n}(X) = \Big(\sum_{i=0}^{n-1} x_{i}^{2}\Big) - 0.25,$$
$$x_{init} = (1, 2, \ldots, n).$$
∙ Trigonometric function:
$$f_{i}(X) = n - \sum_{j=0}^{n-1} \cos(x_j) + (i+1)(1 - \cos(x_i)) - \sin(x_i), \quad 0 \leq i \leq n-1,$$
$$x_{init} = \Big(\frac{1}{n}, \ldots, \frac{1}{n}\Big).$$
∙ Extended Freudenstein and Roth function:
$$f_{i}(X) = \big(x_i + x_{i+1}(5x_{i+1} - x_{i+1}^{2} - 2) - 13\big)^{2} + \big(x_i + x_{i+1}(x_{i+1}^{2} + x_{i+1} - 14) - 29\big)^{2}, \quad 0 \leq i \leq n-2,$$
$$x_{init} = (-2, -2, \ldots, -2).$$
∙ Extended Rosenbrock function:
$$f_{2i} = 10\,(x_{2i+1} - x_{2i}^{2}), \qquad f_{2i+1} = 1 - x_{2i},$$
$$x_{init} = (-1.2, 1, \ldots, -1.2, 1).$$
Limited-memory BFGS code
First, pay close attention to how the functions are defined: avoid explicit for loops entirely and use vectorized array (tensor) operations, which makes the code run much faster.
We start with the Penalty I function as an example:
import numpy as np
import time

# Objective, gradient and Hessian of the Penalty I function, all written with
# vectorized array operations.  X is a 1 x n row vector.
def FF(X):
    al = 1e-5
    return al*((X - 1)**2).sum() + ((X**2).sum() - 0.25)**2
def GG(X):
    al = 1e-5
    return 2*al*(X - 1) + 4*X*((X**2).sum() - 0.25)
def HH(X):
    n = X.shape[1]
    al = 1e-5
    return 2*al*np.eye(n,n) + 8*X.T@X + 4*((X**2).sum() - 0.25)*np.eye(n,n)

# Goldstein line search along the direction d (1 x n) starting from X.
# Returns the accepted step size and an evaluation count for the fun_iter statistics.
def gold(FF,GG,d,X):
    rho = 0.3
    am = 0;aM = 1;al = 0.5*(am + aM)
    for i in range(100):
        rig1 = FF(X) + rho*al*GG(X)@d.T        # upper Goldstein bound
        rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T  # lower Goldstein bound
        lef = FF(X + al*d)
        if lef <= rig1:
            if lef >= rig2:
                break                          # both conditions satisfied
            else:
                am = al                        # step too small: enlarge
                al = am + 0.6*(aM - am)
        else:
            aM = al                            # insufficient decrease: shrink
            al = am + 0.6*(aM - am)
    al_k = al
    return al_k,2*(i + 1)

# Limited-memory BFGS with memory m and at most N iterations.  The first
# m + 1 iterations are steepest-descent steps that fill the (s, y) memory;
# afterwards the two-loop recursion builds the search direction.
def BFGS(FF,GG,HH,X,m,N):
    tic = time.time()
    eps = 1e-5
    n = X.shape[1]
    fun_iter = 0
    X_new = np.zeros([1,n])
    S = np.zeros([m,n])
    Y = np.zeros([m,n])
    alpha = np.zeros(m)
    for i in range(N):
        if i <= m:
            if i == 0:
                al_k,ite = gold(FF,GG,-GG(X),X)
                fun_iter += ite
                X_new = X - GG(X)*al_k
            else:
                s = X_new - X
                y = GG(X_new) - GG(X)
                S[i - 1:i,:] = s                   # store the newest pair
                Y[i - 1:i,:] = y
                d = -GG(X_new)                     # steepest-descent warm-up direction
                al_k,ite = gold(FF,GG,d,X_new)
                fun_iter += ite
                X = X_new.copy()
                X_new = X_new + al_k*d
        else:
            s = X_new - X
            y = GG(X_new) - GG(X)
            S[0:m - 1,:] = S[1:m,:]                # drop the oldest pair
            Y[0:m - 1,:] = Y[1:m,:]
            S[m - 1:m,:] = s
            Y[m - 1:m,:] = y
            tmp = (s*y).sum()/(y*y).sum()          # scaling of the initial H0
            q = GG(X_new)
            for j in range(m - 1,-1,-1):           # two-loop recursion: backward pass
                alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                q = q - alpha[j]*Y[j:j + 1,:]
            r = tmp*q
            for j in range(0,m,1):                 # forward pass
                beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                r = r + S[j:j + 1,:]*(alpha[j] - beta)
            al_k,ite = gold(FF,GG,-r,X_new)
            fun_iter += ite
            X = X_new.copy()
            X_new = X_new + al_k*(-r)
        left = np.linalg.norm(GG(X))
        rig = max(1,np.linalg.norm(X))*eps
        if left < rig:                             # relative gradient tolerance
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    return X_new

n = 1000
m = 15
X = np.zeros([1,n])
for i in range(n):
    X[:,i] = i + 1                                 # x_init = (1, 2, ..., n)
X_new = BFGS(FF,GG,HH,X,m,100)
print(FF(X_new))
In the code above, FF, GG, and HH are the objective function, the gradient, and the Hessian, all defined with vectorized array operations. The step size is chosen with a Goldstein line search (I found a Wolfe-type search hard to tune with my implementation), and BFGS is the algorithm itself.
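For reference, the Goldstein conditions that gold implements (with $\rho = 0.3$ in the code) accept a step size $\alpha$ along a descent direction $d$ when
$$f(x_k) + (1-\rho)\,\alpha\, g_k^{T} d \ \leq\ f(x_k + \alpha d) \ \leq\ f(x_k) + \rho\,\alpha\, g_k^{T} d, \qquad 0 < \rho < \tfrac{1}{2};$$
if the right inequality fails, the search interval is shrunk towards smaller steps, and if the left one fails, the step is enlarged.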
Extended Freudenstein and Roth function
import numpy as np
import time

# Objective, gradient and Hessian of the extended Freudenstein and Roth
# function.  X is a 1 x n row vector; the slices x = X[:,0:n-1] and
# y = X[:,1:n] play the roles of x_i and x_{i+1}.
def FF(X):
    n = X.shape[1]
    x = X[:,0:n - 1];y = X[:,1:n]
    return (((x + y*((5 - y)*y - 2) - 13)**2 + (x + y*((y + 1)*y - 14) - 29)**2)**2).sum()
def GG(X):
    n = X.shape[1]
    x = X[:,0:n - 1];y = X[:,1:n]
    vec = np.zeros([1,n])
    lef = (x + y*((5 - y)*y - 2) - 13);rig = (x + y*((y + 1)*y - 14) - 29)
    tmp = (x + y*((5 - y)*y - 2) - 13)**2 + (x + y*((y + 1)*y - 14) - 29)**2
    vec[:,0:n - 1] = 4*tmp*(lef + rig)
    vec[:,1:n] += 4*tmp*(lef*(-3*y**2 + 10*y - 2) + rig*(3*y**2 + 2*y - 14))  # interior coordinates accumulate both contributions
    return vec
def HH(X):
    n = X.shape[1]
    x = X[:,0:n - 1];y = X[:,1:n]
    mat = np.zeros([n,n])
    lef = (x + y*((5 - y)*y - 2) - 13);rig = (x + y*((y + 1)*y - 14) - 29)
    tmp = (x + y*((5 - y)*y - 2) - 13)**2 + (x + y*((y + 1)*y - 14) - 29)**2
    tmp_x = 2*(lef + rig);tmp_y = 2*(lef*(-3*y**2 + 10*y - 2) + rig*(3*y**2 + 2*y - 14))
    m_xx = 2*(tmp_x)**2 + 4*tmp
    m_yy = 2*(tmp_y)**2 + 4*tmp*((-3*y**2 + 10*y - 2)**2 + (3*y**2 + 2*y - 14)**2 + lef*(-6*y - 10) + rig*(6*y + 2))
    m_xy = 2*tmp_x*tmp_y + 2*tmp*(-3*y**2 + 10*y - 2 + 3*y**2 + 2*y - 14)
    mat[0:n - 1,0:n - 1] = np.diag(m_xx[0,:])
    mat[1:n,1:n] += np.diag(m_yy[0,:])
    mat += np.diag(m_xy[0,:],1) + np.diag(m_xy[0,:],-1)
    return mat

# Goldstein line search (same as in the Penalty I script).
def gold(FF,GG,d,X):
    rho = 0.3
    am = 0;aM = 1;al = 0.5*(am + aM)
    for i in range(100):
        rig1 = FF(X) + rho*al*GG(X)@d.T
        rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T
        lef = FF(X + al*d)
        if lef <= rig1:
            if lef >= rig2:
                break
            else:
                am = al
                al = am + 0.6*(aM - am)
        else:
            aM = al
            al = am + 0.6*(aM - am)
    al_k = al
    return al_k,2*(i + 1)

# Limited-memory BFGS; after the first steepest-descent step, the two-loop
# recursion is used with however many pairs have been stored so far.
def BFGS(FF,GG,HH,X,m,N):
    tic = time.time()
    eps = 1e-5
    n = X.shape[1]
    fun_iter = 0
    X_new = np.zeros([1,n])
    S = np.zeros([m,n])
    Y = np.zeros([m,n])
    alpha = np.zeros(m)
    for i in range(N):
        if i <= m:
            if i == 0:
                al_k,ite = gold(FF,GG,-GG(X),X)
                fun_iter += ite
                X_new = X - GG(X)*al_k
            else:
                s = X_new - X
                y = GG(X_new) - GG(X)
                S[i - 1:i,:] = s
                Y[i - 1:i,:] = y
                tmp = (s*y).sum()/(y*y).sum()
                q = GG(X_new)
                for j in range(i - 1,-1,-1):
                    alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    q = q - alpha[j]*Y[j:j + 1,:]
                r = tmp*q
                for j in range(0,i,1):
                    beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    r = r + (alpha[j] - beta)*S[j:j + 1,:]
                al_k,ite = gold(FF,GG,-r,X_new)
                fun_iter += ite
                X = X_new.copy()
                X_new = X_new + al_k*(-r)
        else:
            s = X_new - X
            y = GG(X_new) - GG(X)
            S[0:m - 1,:] = S[1:m,:]
            Y[0:m - 1,:] = Y[1:m,:]
            S[m - 1:m,:] = s
            Y[m - 1:m,:] = y
            tmp = (s*y).sum()/(y*y).sum()
            q = GG(X_new)
            for j in range(m - 1,-1,-1):
                alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                q = q - alpha[j]*Y[j:j + 1,:]
            r = tmp*q
            for j in range(0,m,1):
                beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                r = r + S[j:j + 1,:]*(alpha[j] - beta)
            al_k,ite = gold(FF,GG,-r,X_new)
            fun_iter += ite
            X = X_new.copy()
            X_new = X_new + al_k*(-r)
        left = np.linalg.norm(GG(X))
        rig = max(1,np.linalg.norm(X))*eps
        if left < rig:
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    return X_new

n = 1000
m = 5
X = -2*np.ones([1,n])                              # x_init = (-2, ..., -2)
X_new = BFGS(FF,GG,HH,X,m,200)
print(FF(X_new))
Trigonometric function
import numpy as np
import time

# Objective, gradient and Hessian of the trigonometric function.
def FF(X):
    n = X.shape[1]
    return ((n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X))**2).sum()
def GG(X):
    n = X.shape[1]
    tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
    return 2*np.sin(X)*tmp.sum() + 2*tmp*(np.arange(1,n + 1)*np.sin(X) - np.cos(X))
def HH(X):
    n = X.shape[1]
    tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
    dia = 2*np.cos(X)*tmp.sum() + 2*np.sin(X)*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) + \
          2*((np.arange(1,n + 1) + 1)*np.sin(X) - np.cos(X))*(np.arange(1,n + 1)*np.sin(X) - np.cos(X)) + \
          2*tmp*(np.sin(X) + np.arange(1,n + 1)*np.cos(X))
    mat = 2*np.sin(X).T*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) + \
          (np.arange(1,n + 1)*np.sin(X) - np.cos(X)).T*2*np.sin(X)
    mat.flat[::(n + 1)] = dia[0,:]                 # overwrite the diagonal entries
    return mat

# Goldstein line search (same as in the Penalty I script).
def gold(FF,GG,d,X):
    rho = 0.3
    am = 0;aM = 1;al = 0.5*(am + aM)
    for i in range(100):
        rig1 = FF(X) + rho*al*GG(X)@d.T
        rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T
        lef = FF(X + al*d)
        if lef <= rig1:
            if lef >= rig2:
                break
            else:
                am = al
                al = am + 0.6*(aM - am)
        else:
            aM = al
            al = am + 0.6*(aM - am)
    al_k = al
    return al_k,2*(i + 1)

# Limited-memory BFGS (same structure as in the previous script).
def BFGS(FF,GG,HH,X,m,N):
    tic = time.time()
    eps = 1e-5
    n = X.shape[1]
    fun_iter = 0
    X_new = np.zeros([1,n])
    S = np.zeros([m,n])
    Y = np.zeros([m,n])
    alpha = np.zeros(m)
    for i in range(N):
        if i <= m:
            if i == 0:
                al_k,ite = gold(FF,GG,-GG(X),X)
                fun_iter += ite
                X_new = X - GG(X)*al_k
            else:
                s = X_new - X
                y = GG(X_new) - GG(X)
                S[i - 1:i,:] = s
                Y[i - 1:i,:] = y
                tmp = (s*y).sum()/(y*y).sum()
                q = GG(X_new)
                for j in range(i - 1,-1,-1):
                    alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    q = q - alpha[j]*Y[j:j + 1,:]
                r = tmp*q
                for j in range(0,i,1):
                    beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    r = r + (alpha[j] - beta)*S[j:j + 1,:]
                al_k,ite = gold(FF,GG,-r,X_new)
                fun_iter += ite
                X = X_new.copy()
                X_new = X_new + al_k*(-r)
        else:
            s = X_new - X
            y = GG(X_new) - GG(X)
            S[0:m - 1,:] = S[1:m,:]
            Y[0:m - 1,:] = Y[1:m,:]
            S[m - 1:m,:] = s
            Y[m - 1:m,:] = y
            tmp = (s*y).sum()/(y*y).sum()
            q = GG(X_new)
            for j in range(m - 1,-1,-1):
                alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                q = q - alpha[j]*Y[j:j + 1,:]
            r = tmp*q
            for j in range(0,m,1):
                beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                r = r + S[j:j + 1,:]*(alpha[j] - beta)
            al_k,ite = gold(FF,GG,-r,X_new)
            fun_iter += ite
            X = X_new.copy()
            X_new = X_new + al_k*(-r)
        left = np.linalg.norm(GG(X))
        rig = max(1,np.linalg.norm(X))*eps
        if left < rig:
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    return X_new

n = 1000
m = 15
X = np.ones([1,n])/n                               # x_init = (1/n, ..., 1/n)
X_new = BFGS(FF,GG,HH,X,m,200)
print(FF(X_new))
Extended Rosenbrock function
import numpy as np
import time

# Objective, gradient and Hessian of the extended Rosenbrock function.
# Even-indexed and odd-indexed coordinates are handled with strided slices.
def FF(X):
    n = X.shape[1]
    X_even = X[:,0:n:2];X_odd = X[:,1:n:2]
    return 100*((X_odd - X_even**2)**2).sum() + ((1 - X_even)**2).sum()
def GG(X):
    n = X.shape[1]
    vec = np.zeros_like(X)
    X_even = X[:,0:n:2];X_odd = X[:,1:n:2]
    vec[:,0:n:2] = -400*X_even*(X_odd - X_even**2) - 2*(1 - X_even)
    vec[:,1:n:2] = 200*(X_odd - X_even**2)
    return vec
def HH(X):
    n = X.shape[1];rank = int(n/2)
    X_even = X[:,0:n:2];X_odd = X[:,1:n:2]
    diag = np.zeros(n)
    diag[0:n:2] = (-400*(X_odd - X_even**2) + 800*X_even**2 + 2)[0,:]
    diag[1:n:2] = 200*np.ones(rank)
    xie = np.zeros(n - 1)                          # first off-diagonal
    xie[0:n - 1:2] = (-400*X_even)[0,:]
    mat = np.diag(diag) + np.diag(xie,1) + np.diag(xie,-1)
    return mat

# Goldstein line search (same as in the Penalty I script).
def gold(FF,GG,d,X):
    rho = 0.3
    am = 0;aM = 1;al = 0.5*(am + aM)
    for i in range(100):
        rig1 = FF(X) + rho*al*GG(X)@d.T
        rig2 = FF(X) + (1 - rho)*al*GG(X)@d.T
        lef = FF(X + al*d)
        if lef <= rig1:
            if lef >= rig2:
                break
            else:
                am = al
                al = am + 0.6*(aM - am)
        else:
            aM = al
            al = am + 0.6*(aM - am)
    al_k = al
    return al_k,2*(i + 1)

# Limited-memory BFGS (same structure as in the previous scripts).
def BFGS(FF,GG,HH,X,m,N):
    tic = time.time()
    eps = 1e-5
    n = X.shape[1]
    fun_iter = 0
    X_new = np.zeros([1,n])
    S = np.zeros([m,n])
    Y = np.zeros([m,n])
    alpha = np.zeros(m)
    for i in range(N):
        if i <= m:
            if i == 0:
                al_k,ite = gold(FF,GG,-GG(X),X)
                fun_iter += ite
                X_new = X - GG(X)*al_k
            else:
                s = X_new - X
                y = GG(X_new) - GG(X)
                S[i - 1:i,:] = s
                Y[i - 1:i,:] = y
                tmp = (s*y).sum()/(y*y).sum()
                q = GG(X_new)
                for j in range(i - 1,-1,-1):
                    alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    q = q - alpha[j]*Y[j:j + 1,:]
                r = tmp*q
                for j in range(0,i,1):
                    beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                    r = r + (alpha[j] - beta)*S[j:j + 1,:]
                al_k,ite = gold(FF,GG,-r,X_new)
                fun_iter += ite
                X = X_new.copy()
                X_new = X_new + al_k*(-r)
        else:
            s = X_new - X
            y = GG(X_new) - GG(X)
            S[0:m - 1,:] = S[1:m,:]
            Y[0:m - 1,:] = Y[1:m,:]
            S[m - 1:m,:] = s
            Y[m - 1:m,:] = y
            tmp = (s*y).sum()/(y*y).sum()
            q = GG(X_new)
            for j in range(m - 1,-1,-1):
                alpha[j] = (S[j:j + 1,:]*q).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                q = q - alpha[j]*Y[j:j + 1,:]
            r = tmp*q
            for j in range(0,m,1):
                beta = (Y[j:j + 1,:]*r).sum()/(S[j:j + 1,:]*Y[j:j + 1,:]).sum()
                r = r + S[j:j + 1,:]*(alpha[j] - beta)
            al_k,ite = gold(FF,GG,-r,X_new)
            fun_iter += ite
            X = X_new.copy()
            X_new = X_new + al_k*(-r)
        left = np.linalg.norm(GG(X))
        rig = max(1,np.linalg.norm(X))*eps
        if left < rig:
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    return X_new

n = 1000
rank = int(n/2)
m = 5
X = np.zeros([1,n])
for i in range(rank):
    X[:,2*i] = -1.2                                # x_init = (-1.2, 1, ..., -1.2, 1)
    X[:,2*i + 1] = 1
X_new = BFGS(FF,GG,HH,X,m,200)
print(FF(X_new))
For the code above, the key point is again how the functions are defined.
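Since the gradients and Hessians above are derived and coded by hand, it can be worth checking each GG against a central finite difference of the corresponding FF before trusting an optimization run. A small sketch (check_grad is an illustrative helper, not part of the scripts above):

import numpy as np

def check_grad(FF, GG, X, h=1e-6, k=10):
    # Compare the analytic gradient with central differences of FF at k
    # randomly chosen coordinates of the 1 x n point X.
    g = GG(X)
    n = X.shape[1]
    for j in np.random.choice(n, size=min(k, n), replace=False):
        E = np.zeros_like(X); E[0, j] = h
        fd = (FF(X + E) - FF(X - E))/(2*h)
        print(j, g[0, j], fd, abs(g[0, j] - fd))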
Inexact Newton method code
The Hessian of the trigonometric function above is not positive definite and the G-S iteration does not converge on it, so a diagonal shift is added to the matrix there:
import numpy as np
import time

# The trigonometric test function again: objective, gradient and Hessian.
def FF(X):
    n = X.shape[1]
    return ((n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X))**2).sum()
def GG(X):
    n = X.shape[1]
    tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
    return 2*np.sin(X)*tmp.sum() + 2*tmp*(np.arange(1,n + 1)*np.sin(X) - np.cos(X))
def HH(X):
    n = X.shape[1]
    tmp = n - np.cos(X).sum() + np.arange(1,n + 1)*(1 - np.cos(X)) - np.sin(X)
    dia = 2*np.cos(X)*tmp.sum() + 2*np.sin(X)*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) + \
          2*((np.arange(1,n + 1) + 1)*np.sin(X) - np.cos(X))*(np.arange(1,n + 1)*np.sin(X) - np.cos(X)) + \
          2*tmp*(np.sin(X) + np.arange(1,n + 1)*np.cos(X))
    mat = 2*np.sin(X).T*((np.arange(1,n + 1) + n)*np.sin(X) - np.cos(X)) + \
          (np.arange(1,n + 1)*np.sin(X) - np.cos(X)).T*2*np.sin(X)
    mat.flat[::(n + 1)] = dia[0,:]                 # overwrite the diagonal entries
    return mat

# Gauss-Seidel solve of the Newton system H(X) d = -g(X).  Since the Hessian
# is not positive definite, a diagonal shift nu*I is added before sweeping;
# the residual test is still evaluated on the original (unshifted) matrix and
# compares squared norms against eta.
def GS(HH,GG,X,eta,N):
    R = HH(X);b = -GG(X).T
    n = X.shape[1]
    nu = max(abs(np.diag(R))) + 1
    A = R + nu*np.eye(n,n)
    b_norm = (b**2).mean()
    x0 = b                                         # initial guess
    x = x0.copy()
    for k in range(N):
        x[0,0] = (b[0,0] - (A[0:1,1:n]@x0[1:n,0]).item())/A[0,0]
        for i in range(1,n):
            x[i,0] = (b[i,0] - (A[i:i + 1,0:i]@x[0:i]).item() - (A[i: i + 1,i + 1:n]@x0[i + 1:n]).item())/A[i,i]
        x0 = x
        r_norm = ((R@x - b)**2).mean()
        if r_norm < eta*b_norm:
            break
    return x

# Inexact Newton iteration; eta is updated from the ratio of successive
# iterate norms with gama = 0.2 (the eta_2-style rule).
def INewton(FF,GG,HH,X,N):
    tic = time.time()
    eps = 1e-5
    fun_iter = 0
    gama = 0.2
    eta = np.zeros(2)
    X_old = X.copy()
    for i in range(N):
        if i == 0:
            eta[0] = 1e-2
            d = GS(HH,GG,X_old,eta[0],100)
            fun_iter += 1
            X = X_old + d.T
            eta[1] = gama*np.linalg.norm(X)/np.linalg.norm(X_old)
            X_old = X
        else:
            if gama*eta[0] > 0.1:
                eta[1] = max(eta[1],gama*eta[0])   # safeguard on the forcing term
                d = GS(HH,GG,X_old,eta[1],100)
                fun_iter += 1
                X = X_old + d.T
                eta[0] = eta[1]
                eta[1] = gama*np.linalg.norm(X)/np.linalg.norm(X_old)
                X_old = X
            else:
                d = GS(HH,GG,X_old,eta[1],100)
                fun_iter += 1
                X = X_old + d.T
                eta[0] = eta[1]
                eta[1] = gama*np.linalg.norm(X)/np.linalg.norm(X_old)
                X_old = X
        if (i + 1)%50 == 0:
            print('the iteration:%d'%(i + 1))
        tmp_lf = (GG(X)**2).sum()
        tmp_ri = max(1,(X**2).sum())*eps
        if tmp_lf < tmp_ri:
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    print('the end:%d'%(i))
    return X

# Inexact Newton iteration with eta = min(1e-4, ||g||^2) (the eta_1-style rule).
def Inewton(FF,GG,HH,X,N):
    tic = time.time()
    eps = 1e-5
    fun_iter = 0
    X_old = X.copy()
    for i in range(N):
        eta = min(1e-4,(GG(X_old)**2).sum())
        d = GS(HH,GG,X_old,eta,1000)
        fun_iter += 1
        X = X_old + d.T
        X_old = X
        if (i + 1)%50 == 0:
            print('the iteration:%d'%(i + 1))
        tmp_lf = (GG(X)**2).sum()
        tmp_ri = max(1,(X**2).sum())*eps
        if tmp_lf < tmp_ri:
            break
    ela = time.time() - tic
    print('iteration:%d,grad_2:%.3e,fun_iter = %d,time = %.2f'%(i + 1,(GG(X)**2).sum(),fun_iter,ela))
    print('the end:%d'%(i))
    return X

n = 1000
X = np.ones([1,n])/n
X_new = Inewton(FF,GG,HH,X,400)   # replace with X_new = INewton(FF,GG,HH,X,500) to get the other eta rule
print(FF(X_new))
Key point
For different test functions, the exit condition of the G-S iteration and its maximum number of sweeps need to be adjusted; otherwise the vector it returns may not be a descent direction.
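One simple safeguard, not in the scripts above, is to test the vector returned by GS against the current gradient and fall back to steepest descent whenever it fails to be a descent direction. A minimal sketch using the same 1 x n array convention (safe_direction is an illustrative name):

import numpy as np

def safe_direction(d, g):
    # d is the candidate step (1 x n), g the current gradient (1 x n).
    # Accept d only if the directional derivative g.d is negative.
    if (g*d).sum() < 0:
        return d
    return -g    # steepest-descent fallback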