最优化问题
最优化问题的一般表述是
$$\min\limits_x f(x)$$
其中f(x)称为损失函数。如前面所提到的,对于最优化问题,我们往往需要通过迭代的方法来进行求解。
迭代求解最优化问题一般包括5个步骤:
- 确定损失函数f(x)
- 确定初值 $x_0$
- 确定每次迭代的增量 $d$
- 进行一次迭代 $x_{k+1} = x_k + \alpha d$,其中 $\alpha$ 控制迭代速率
- 判断是否收敛或达到最大迭代次数,若没有,返回第三步。
常用的迭代方法有梯度下降法,牛顿法,和它们的变种高斯牛顿法、L-M算法。
梯度下降法
梯度下降法直观的表述如图。图中心代表最低点。梯度下降法的主要思想就是每次朝着与当前梯度相反的方向前进一段距离。
它的迭代过程可以表示为 $x_{k+1} = x_k - \alpha\nabla f(x_k)$
牛顿法
牛顿法在梯度下降法的基础上考虑了函数的二阶导数,从而更容易收敛。
牛顿法一开始是用来求函数零点的,对函数f(x)做泰勒展开,有
$$f(x) = f(x_0) + f'(x_0)(x-x_0)$$
令f(x) = 0,有 $x = x_0 - \frac{f(x_0)}{f'(x_0)}$
于是通过 $x_{k+1} = x_k - \frac{f(x_k)}{f'(x_k)}$ 我们可以向函数零点迭代。
事实上使用牛顿法迭代求解极值也是一个求函数零点的过程,只不过是求函数导数的零点,即对
$$f(x) = f(x_0) + f'(x_0)(x-x_0) + \frac{1}{2}f''(x_0)(x-x_0)^2$$
求导得到
$$f'(x) = f'(x_0) + f''(x_0)(x-x_0)$$
令它等于0有
$$x = x_0 - \frac{f'(x_0)}{f''(x_0)}$$
令 $d = -\frac{f'(x_k)}{f''(x_k)}$,我们可以迭代求解函数最小值
对于多元函数使用同样的过程我们可以得到
$d = -H_f^{-1}\nabla f$,其中 $H_f$ 为f(x)的Hessian矩阵
附录:迭代求解的一个例子
#include <iostream>
#include <math.h>
#include <functional>
using namespace std;
// Numerically approximate f'(x) with a symmetric (central) difference
// quotient: (f(x+h) - f(x-h)) / (2h), using a fixed step h = 1e-5.
double NumbericDerivate(std::function<double(double)> f,double x)
{
    const double h = 1e-5;
    const double rise = f(x + h) - f(x - h);
    return rise / (2 * h);
}
// Numerically approximate f''(x) with the standard three-point central
// stencil: (f(x+h) - 2 f(x) + f(x-h)) / h^2, using a fixed step h = 1e-5.
double Numberic2ndDerivate(std::function<double(double)> f,double x)
{
    const double h = 1e-5;
    const double stencil = f(x + h) - 2 * f(x) + f(x - h);
    return stencil / (h * h);
}
// Find a root of f (a solution of f(x) = 0) by Newton's method,
// starting from x = 0 and using a numeric first derivative.
//   therold: convergence threshold on the step size |delta|.
// Returns the last iterate (best root estimate found within maxiter steps).
double NewtonMethod(function<double(double)> f,double therold)
{
    double x = 0;
    const int maxiter = 50;
    for(int i=0;i<maxiter;i++)
    {
        const double fx = f(x);
        const double fx1 = NumbericDerivate(f,x);
        // Guard: a (near-)zero derivative makes the step -f/f' blow up
        // to inf/NaN; stop at the current best estimate instead.
        if(fabs(fx1) < 1e-12)
            break;
        const double delta = -fx/fx1;
        x = x + delta;
        if(fabs(delta)<therold)
            break;
    }
    return x;
}
// Minimize f by Newton iteration on the derivative: x <- x - f'(x)/f''(x),
// starting from x = 0 and using numeric first/second derivatives.
//   therold: convergence threshold on the step size |delta|.
// Returns the last iterate (stationary-point estimate within maxiter steps).
double NewtonIter(function<double(double)> f,double therold)
{
    double x = 0;
    const int maxiter = 40;
    for(int i=0;i<maxiter;i++)
    {
        // Note: f(x) itself is not needed for the Newton step; the
        // original code evaluated it each iteration and discarded it.
        const double fx1 = NumbericDerivate(f,x);
        const double fx2 = Numberic2ndDerivate(f,x);
        // Guard: (near-)zero curvature makes -f'/f'' blow up to inf/NaN.
        if(fabs(fx2) < 1e-12)
            break;
        const double delta = -fx1/fx2;
        x = x + delta;
        if(fabs(delta)<therold)
            break;
    }
    return x;
}
// Minimize f by gradient descent: x <- x - lr * f'(x), starting from x = 0
// and using a numeric derivative.
//   therold: convergence threshold on the (unscaled) gradient magnitude.
//   lr:      learning rate / step-size multiplier (default 0.1).
// Returns the last iterate (minimum estimate within maxiter steps).
double GradDecent(function<double(double)> f,double therold,double lr = 0.1)
{
    double x = 0;
    const int maxiter = 100;
    for(int i=0;i<maxiter;i++)
    {
        // Note: f(x) itself is not needed for the update; the original
        // code evaluated it each iteration and discarded it.
        const double fx1 = NumbericDerivate(f,x);
        const double delta = -fx1;
        x = x + lr * delta;
        if(fabs(delta)<therold)
            break;
    }
    return x;
}
// Demo driver: exercises the numeric-derivative helpers and the three
// iterative solvers on sample functions, printing each result.
int main()
{
// Differentiate f(x) = x^2*sin(x) + sin(x) at x = 3.14.
// (Original comment said "x^2+sinx", which did not match the lambda.)
auto result = NumbericDerivate([](double x){return x*x*sin(x)+sin(x);},3.14);
cout<<result<<endl;
// Root of x^3 + sin(x) + 0.1 via Newton's method, threshold 0.01.
result = NewtonMethod([](double x){return x*x*x+sin(x)+0.1;},0.01);
cout<<result<<endl;
// Second derivative of x^3 at x = 1 (analytic value: 6).
result = Numberic2ndDerivate([](double x){return x*x*x;},1);
cout<<result<<endl;
// Minimize x^2 + 5x + 1 + e^x by Newton iteration, threshold 0.001.
result = NewtonIter([](double x){return x*(x+2)+3*x+1+exp(x);},0.001);
cout<<result<<endl;
// Minimize the same function by gradient descent (default lr = 0.1).
result = GradDecent([](double x){return x*(x+2)+3*x+1+exp(x);},0.001);
cout<<result<<endl;
return 0;
}