Newton's Method and Its Applications
Consider the unconstrained optimization problem
$$\min\limits_{\boldsymbol{x}}f(\boldsymbol{x})$$
Assume that $f(\boldsymbol{x})$ is twice differentiable and let $\boldsymbol{x}^k$ be the iterate at step $k$. We can expand $f(\boldsymbol{x})$ in a second-order Taylor series around $\boldsymbol{x}^k$:
$$f(\boldsymbol{x})\approx f(\boldsymbol{x}^k)+(\boldsymbol{x}-\boldsymbol{x}^k)^T\nabla f(\boldsymbol{x}^k)+\frac{1}{2}(\boldsymbol{x}-\boldsymbol{x}^k)^TH(\boldsymbol{x}^k)(\boldsymbol{x}-\boldsymbol{x}^k)$$
where $H(\boldsymbol{x}^k)=\nabla^2 f(\boldsymbol{x}^k)$ is the Hessian matrix.
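As a quick numerical illustration of this approximation (the function `f` below is an arbitrary smooth example, not taken from the derivation), the quadratic model agrees closely with $f$ near $\boldsymbol{x}^k$:

```python
import numpy as np

# f(x) = x0^2 + exp(x1): gradient [2*x0, exp(x1)], Hessian diag(2, exp(x1))
f = lambda x: x[0] ** 2 + np.exp(x[1])
grad = lambda x: np.array([2 * x[0], np.exp(x[1])])
hess = lambda x: np.diag([2.0, np.exp(x[1])])

xk = np.array([1.0, 0.5])
d = np.array([0.01, -0.02])  # small displacement from xk
quad_model = f(xk) + d.dot(grad(xk)) + 0.5 * d.dot(hess(xk)).dot(d)
print(f(xk + d), quad_model)  # nearly equal for small d
```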
To make the next iterate decrease the function value, we minimize this quadratic model: differentiating the expansion with respect to $\boldsymbol{x}$ gives
$$\nabla f(\boldsymbol{x})\approx\nabla f(\boldsymbol{x}^k)+H(\boldsymbol{x}^k)(\boldsymbol{x}-\boldsymbol{x}^k)$$
Setting $\nabla f(\boldsymbol{x})=0$ yields
$$\boldsymbol{x}=\boldsymbol{x}^k-H(\boldsymbol{x}^k)^{-1}\nabla f(\boldsymbol{x}^k)$$
so the update rule at iteration $k+1$ is
$$\boldsymbol{x}^{k+1}=\boldsymbol{x}^k-H(\boldsymbol{x}^k)^{-1}\nabla f(\boldsymbol{x}^k)$$
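To make the update concrete, here is a minimal sketch of the iteration on a toy objective; the `newton` routine and the `grad`/`hess` helpers below are illustrative choices, not part of the derivation above.

```python
import numpy as np

def newton(grad, hess, x0, n_iters=100, eps=1e-8):
    """Plain Newton iteration: x <- x - H(x)^{-1} grad(x)."""
    x = np.asarray(x0, dtype=float)
    for _ in range(int(n_iters)):
        g = grad(x)
        if np.linalg.norm(g) < eps:
            break
        # Solve H d = g rather than forming the inverse explicitly
        x = x - np.linalg.solve(hess(x), g)
    return x

# Toy convex objective f(x) = (x0 - 1)^2 + exp(x1) - x1, minimized at (1, 0)
grad = lambda x: np.array([2 * (x[0] - 1), np.exp(x[1]) - 1])
hess = lambda x: np.diag([2.0, np.exp(x[1])])
print(newton(grad, hess, x0=[5.0, 3.0]))  # ~ [1. 0.]
```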
Next, consider using Newton's method to solve logistic regression, which can be written as the following optimization problem:
$$\min\limits_{\boldsymbol{\beta}}\sum_{i=1}^{m}\left(-y_i\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i+\ln(1+e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i})\right)$$
Let
$$l(\boldsymbol{\beta})=\sum_{i=1}^{m}\left(-y_i\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i+\ln(1+e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i})\right)$$
First, derive the expression for the gradient:
$$\frac{\partial l(\boldsymbol{\beta})}{\partial\boldsymbol{\beta}}=\sum_{i=1}^{m}\left(-y_i\hat{\boldsymbol{x}}_i+\frac{e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i}}{1+e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i}}\hat{\boldsymbol{x}}_i\right)$$
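One way to gain confidence in this formula is a finite-difference check; the helpers `l` and `grad_l` below are a minimal sketch on random data, assuming each $\hat{\boldsymbol{x}}_i$ carries an appended constant 1:

```python
import numpy as np

def l(beta, X_hat, y):
    # l(beta) = sum_i (-y_i * beta^T x_i + ln(1 + exp(beta^T x_i)))
    z = X_hat.dot(beta)
    return np.sum(-y * z + np.log(1 + np.exp(z)))

def grad_l(beta, X_hat, y):
    # Gradient from the formula above: sum_i (-y_i + sigmoid(z_i)) * x_i
    z = X_hat.dot(beta)
    return X_hat.T.dot(-y + np.exp(z) / (1 + np.exp(z)))

rng = np.random.default_rng(0)
X_hat = np.hstack([rng.normal(size=(5, 2)), np.ones((5, 1))])  # x_i with appended 1
y = rng.integers(0, 2, size=5)
beta = rng.normal(size=3)

# Compare against central finite differences, component by component
num = np.array([(l(beta + 1e-6 * e, X_hat, y) - l(beta - 1e-6 * e, X_hat, y)) / 2e-6
                for e in np.eye(3)])
print(np.allclose(num, grad_l(beta, X_hat, y)))  # True
```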
Next, compute the expression for the Hessian matrix:
$$\frac{\partial^2 l(\boldsymbol{\beta})}{\partial\boldsymbol{\beta}\partial\boldsymbol{\beta}^T}=\sum_{i=1}^{m}\frac{e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i}}{(1+e^{\boldsymbol{\beta}^T\hat{\boldsymbol{x}}_i})^2}\hat{\boldsymbol{x}}_i\hat{\boldsymbol{x}}_i^T$$
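Since $\frac{e^z}{(1+e^z)^2}=p(1-p)$ with $p=\frac{1}{1+e^{-z}}$, the Hessian can also be computed without an explicit loop over samples; `hessian_l` below is a vectorized sketch of this identity:

```python
import numpy as np

def hessian_l(beta, X_hat):
    # sum_i p_i * (1 - p_i) * x_i x_i^T, with p_i = sigmoid(beta^T x_i)
    p = 1. / (1. + np.exp(-X_hat.dot(beta)))
    w = p * (1 - p)
    return (X_hat * w[:, np.newaxis]).T.dot(X_hat)

# Quick check against the explicit per-sample loop
rng = np.random.default_rng(1)
X_hat = rng.normal(size=(6, 3))
beta = rng.normal(size=3)
H_loop = sum(np.exp(beta @ x) / (1 + np.exp(beta @ x)) ** 2 * np.outer(x, x)
             for x in X_hat)
print(np.allclose(hessian_l(beta, X_hat), H_loop))  # True
```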
Finally, here is a Python implementation that solves logistic regression with Newton's method (the step is damped by a learning rate `eta`):
```python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt


class LogitRegression:
    def __init__(self):
        self._beta = None
        self.coef_ = None
        self.interception_ = None

    def _sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    def fit(self, X, y, eta=0.1, n_iters=1e3):
        assert X.shape[0] == y.shape[0]

        def dL(beta, X_b, y):
            # Gradient of l(beta), averaged over the m samples:
            # (1/m) * sum_i (-y_i + sigmoid(beta^T x_i)) * x_i
            coef = self._sigmoid(X_b.dot(beta))
            return (-y.dot(X_b) + coef.dot(X_b)) / len(X_b)

        def Hessian(beta, X_b):
            # Hessian of l(beta), with the same 1/m scaling as the gradient:
            # (1/m) * sum_i e^z / (1 + e^z)^2 * x_i x_i^T
            H = np.zeros(shape=(X_b.shape[1], X_b.shape[1]))
            for x in X_b:
                e = np.exp(beta.dot(x))
                H += e / ((1 + e) ** 2) * x[:, np.newaxis].dot(x[np.newaxis])
            return H / len(X_b)

        def newton_method(X_b, y, n_iters, initial_beta, eta, eps):
            beta = initial_beta
            i_iter = 0
            while i_iter < n_iters:
                gradient = dL(beta, X_b, y)
                # Stop once the l1 norm of the gradient is small enough
                if np.sum(np.abs(gradient)) < eps:
                    break
                H = Hessian(beta, X_b)
                # Damped Newton step: beta <- beta - eta * H^{-1} * gradient
                beta = beta - eta * np.linalg.inv(H).dot(gradient)
                i_iter += 1
            return beta

        # Prepend a column of ones: beta[0] is then the intercept
        X_b = np.hstack([np.ones((len(X), 1)), X])
        initial_beta = np.zeros(X_b.shape[1])
        # Loose heuristic tolerance; it also stops the iteration before the
        # Hessian becomes near-singular on linearly separable data
        eps = X.shape[1] / 10
        self._beta = newton_method(X_b, y, n_iters, initial_beta, eta, eps)
        self.coef_ = self._beta[1:]
        self.interception_ = self._beta[0]
        return self

    def predict_probability(self, X_predict):
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._beta))

    def predict(self, X_predict):
        assert self.coef_ is not None and self.interception_ is not None
        assert X_predict.shape[1] == len(self.coef_)
        probability = self.predict_probability(X_predict)
        return np.array(probability >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        y_predict = self.predict(X_test)
        return np.sum(y_predict == y_test) / len(y_test)

    def __repr__(self):
        return "LogitRegression()"


if __name__ == "__main__":
    # Binary classification: the first two iris features, classes 0 and 1 only
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target
    X = X[y < 2]
    y = y[y < 2]

    logit = LogitRegression().fit(X, y)
    y_predict = logit.predict(X)
    print(np.sum(y_predict == y))

    # Decision boundary: w1*x1 + w2*x2 + b = 0  =>  x2 = -(w1*x1 + b) / w2
    w, b = logit.coef_, logit.interception_

    def x2(w, b, x1):
        return -(w[0] * x1 + b) / w[1]

    plt.figure()
    plt.scatter(X[y == 0, 0], X[y == 0, 1], c='r')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], c='b')
    xmin, xmax = np.min(X[:, 0]), np.max(X[:, 0])
    plt.plot([xmin, xmax], [x2(w, b, xmin), x2(w, b, xmax)], label='boundary')
    plt.title('Boundary of LogitRegression')
    plt.legend()
    plt.show()
```
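As a rough sanity check (run after the `__main__` block above, reusing its `X` and `y`), one could compare against scikit-learn's `LogisticRegression`; the coefficients will not match exactly, since scikit-learn applies L2 regularization by default:

```python
from sklearn.linear_model import LogisticRegression

# Compare training accuracy of the two implementations on the same data
clf = LogisticRegression().fit(X, y)
print(clf.score(X, y), LogitRegression().fit(X, y).score(X, y))
```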
Extension: Quasi-Newton Methods
Each Newton iteration computes the inverse of the Hessian matrix, which is computationally expensive; moreover, the Hessian may not even be invertible. The basic idea of quasi-Newton methods is to replace $H(\boldsymbol{x}^k)^{-1}$ with an approximating matrix $\boldsymbol{G}_k=G(\boldsymbol{x}^k)$.
The Hessian $H(\boldsymbol{x}^k)$ used by Newton's method satisfies, approximately (and exactly when $f$ is quadratic),
$$\nabla f(\boldsymbol{x}^{k+1})-\nabla f(\boldsymbol{x}^k)\approx H(\boldsymbol{x}^k)(\boldsymbol{x}^{k+1}-\boldsymbol{x}^k)$$
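For a quadratic $f(\boldsymbol{x})=\frac{1}{2}\boldsymbol{x}^TA\boldsymbol{x}+\boldsymbol{b}^T\boldsymbol{x}$ the relation is exact, since $\nabla f(\boldsymbol{x})=A\boldsymbol{x}+\boldsymbol{b}$; a short check with arbitrarily chosen $A$ and $\boldsymbol{b}$:

```python
import numpy as np

A = np.array([[2.0, 0.5], [0.5, 1.0]])  # symmetric positive-definite Hessian
b = np.array([1.0, -1.0])
grad = lambda x: A.dot(x) + b  # gradient of f(x) = 0.5 x^T A x + b^T x

xk, xk1 = np.array([0.0, 0.0]), np.array([0.3, -0.7])
print(np.allclose(grad(xk1) - grad(xk), A.dot(xk1 - xk)))  # True
```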
Assuming $H(\boldsymbol{x}^k)$ is invertible, the relation can equivalently be written as
$$H(\boldsymbol{x}^k)^{-1}\left(\nabla f(\boldsymbol{x}^{k+1})-\nabla f(\boldsymbol{x}^k)\right)\approx\boldsymbol{x}^{k+1}-\boldsymbol{x}^k$$
This is known as the quasi-Newton condition. A quasi-Newton method takes $\boldsymbol{G}_k$ as an approximation to $H(\boldsymbol{x}^k)^{-1}$, requiring $\boldsymbol{G}_k$ to be positive definite and to satisfy
$$\boldsymbol{G}_{k+1}\left(\nabla f(\boldsymbol{x}^{k+1})-\nabla f(\boldsymbol{x}^k)\right)=\boldsymbol{x}^{k+1}-\boldsymbol{x}^k$$
Different quasi-Newton methods differ precisely in how they construct and update $\boldsymbol{G}$.
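For example, the DFP (Davidon-Fletcher-Powell) algorithm updates $\boldsymbol{G}$ with two rank-one corrections; a minimal sketch (the helper name `dfp_update` is mine):

```python
import numpy as np

def dfp_update(G, delta, gamma):
    """DFP update of the inverse-Hessian approximation G.

    delta = x_{k+1} - x_k, gamma = grad f(x_{k+1}) - grad f(x_k).
    The returned matrix satisfies the quasi-Newton condition G_new @ gamma = delta.
    """
    Gg = G.dot(gamma)
    return G + np.outer(delta, delta) / delta.dot(gamma) - np.outer(Gg, Gg) / gamma.dot(Gg)

# The quasi-Newton condition holds by construction
# (delta.dot(gamma) > 0 is needed to keep G positive definite; assumed here)
G = np.eye(3)
delta, gamma = np.array([1.0, 2.0, 0.5]), np.array([0.5, 1.0, 2.0])
print(np.allclose(dfp_update(G, delta, gamma).dot(gamma), delta))  # True
```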