线性收敛的随机L-BFGS算法

1 介绍

$\begin{array}{}\text{(28)}& \underset{w}{min}\frac{1}{N}\sum _{i=1}^{N}{f}_{i}\left(w\right),\end{array}$

2 算法

$\begin{array}{}\text{(29)}& f\left(w\right)=\frac{1}{N}\sum _{i=1}^{N}{f}_{i}\left(w\right)\end{array}$

$\begin{array}{}\text{(30)}& {f}_{\mathcal{S}}\left(w\right)=\frac{1}{|\mathcal{S}|}\sum _{i\in \mathcal{S}}{f}_{i}\left(w\right).\end{array}$

${w}_{k+1}={w}_{k}-{\eta }_{k}{H}_{k}{v}_{k}$

${\mathcal{F}}_{k,t}=\sigma \left(\begin{array}{c}\left\{{\mathcal{S}}_{{k}^{\prime },{t}^{\prime }}\,:\,{k}^{\prime }<k\ \text{or}\ {k}^{\prime }=k\ \text{and}\ {t}^{\prime }<t\right\}\\ \cup \left\{{\mathcal{T}}_{r}\,:\,rL\le mk+t\right\}\end{array}\right)$

2.1 构造Hessian逆的近似H_r

$H_r^{(j)} = (I - \rho_j s_j y_j^{\top})^{\top} H_r^{(j-1)} (I - \rho_j s_j y_j^{\top}) + \rho_j s_j s_j^{\top},$

3 前言

假设2

$\begin{array}{}\text{(40)}& \lambda I⪯{\mathrm{\nabla }}^{2}{f}_{\mathcal{T}}\left(w\right)⪯\mathrm{\Lambda }I\end{array}$

引理 3

$\mathrm{tr}\left({B}_{r}\right)\le \left(d+M\right)\mathrm{\Lambda },\quad \det\left({B}_{r}\right)\ge \frac{{\lambda }^{d+M}}{\left(\left(d+M\right)\mathrm{\Lambda }\right)^{M}}$

引理 4

$\begin{array}{}\text{(41)}& \gamma I⪯{H}_{r}⪯\mathrm{\Gamma }I\end{array}$

$\gamma =\frac{1}{\left(d+M\right)\mathrm{\Lambda }}\phantom{\rule{1em}{0ex}}\text{and}\phantom{\rule{1em}{0ex}}\mathrm{\Gamma }=\frac{\left(\left(d+M\right)\mathrm{\Lambda }{\right)}^{d+M-1}}{{\lambda }^{d+M}}.$

$‖\mathrm{\nabla }f\left(x\right){‖}^{2}\ge 2\lambda \left(f\left(x\right)-f\left({w}_{\ast }\right)\right).$

$\begin{array}{rl}f\left({w}_{\ast }\right)& \ge f\left(x\right)+\mathrm{\nabla }f\left(x{\right)}^{\mathrm{\top }}\left({w}_{\ast }-x\right)+\frac{\lambda }{2}‖{w}_{\ast }-x{‖}^{2}\\ & \ge f\left(x\right)+\underset{v}{min}\left(\mathrm{\nabla }f\left(x{\right)}^{\mathrm{\top }}v+\frac{\lambda }{2}‖v{‖}^{2}\right)\\ & =f\left(x\right)-\frac{1}{2\lambda }‖\mathrm{\nabla }f\left(x\right){‖}^{2}.\end{array}$

$\begin{array}{}\text{(42)}& {\mathbb{E}}_{k,t}\left[‖{v}_{t}{‖}^{2}\right]\le 4\mathrm{\Lambda }\left(f\left({x}_{t}\right)-f\left({w}_{\ast }\right)+f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right).\end{array}$

4 收敛分析

定理7

$\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\le {\alpha }^{k}\mathbb{E}\left[f\left({w}_{0}\right)-f\left({w}_{\ast }\right)\right]$

$\alpha =\frac{1/\left(2m\eta \right)+\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}}{\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}}<1$

$\begin{array}{lrl}\text{(33)}& f\left({x}_{t+1}\right)\le & f\left({x}_{t}\right)+\nabla f\left({x}_{t}\right)^{\top }\left({x}_{t+1}-{x}_{t}\right)+\frac{\mathrm{\Lambda }}{2}\|{x}_{t+1}-{x}_{t}\|^{2}\\ & =& f\left({x}_{t}\right)-\eta \nabla f\left({x}_{t}\right)^{\top }{H}_{r}{v}_{t}+\frac{{\eta }^{2}\mathrm{\Lambda }}{2}\|{H}_{r}{v}_{t}\|^{2}.\end{array}$

${\mathbb{E}}_{k,t}\left[f\left({x}_{t+1}\right)\right]\le f\left({x}_{t}\right)-\eta \gamma ‖\mathrm{\nabla }f\left({x}_{t}\right){‖}^{2}+\frac{{\eta }^{2}{\mathrm{\Gamma }}^{2}\mathrm{\Lambda }}{2}{\mathbb{E}}_{k,t}‖{v}_{t}{‖}^{2}.$

$\begin{array}{rl}& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}{\mathbb{E}}_{k,t}\left[f\left({x}_{t+1}\right)\right]\\ \le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}f\left({x}_{t}\right)-2\eta \gamma \lambda \left(f\left({x}_{t}\right)-f\left({w}_{\ast }\right)\right)\\ & \phantom{\rule{1em}{0ex}}+2{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\left(f\left({x}_{t}\right)-f\left({w}_{\ast }\right)+f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right)\\ =& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}f\left({x}_{t}\right)-2\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\left(f\left({x}_{t}\right)-f\left({w}_{\ast }\right)\right)\\ & \phantom{\rule{1em}{0ex}}+2{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\left(f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right).\end{array}$

$\begin{array}{rl}& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({x}_{m}\right)\right]\\ \le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({x}_{0}\right)\right]+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\\ & \phantom{\rule{1em}{0ex}}-2\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\left(\sum _{t=0}^{m-1}\mathbb{E}\left[f\left({x}_{t}\right)\right]-mf\left({w}_{\ast }\right)\right)\\ =& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({w}_{k}\right)\right]+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\\ & \phantom{\rule{1em}{0ex}}-2m\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\mathbb{E}\left[f\left({w}_{k+1}\right)-f\left({w}_{\ast }\right)\right].\end{array}$

$\begin{array}{rl}0\le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({x}_{m}\right)\right]+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\\ & \phantom{\rule{1em}{0ex}}-2m\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\mathbb{E}\left[f\left({w}_{k+1}\right)-f\left({w}_{\ast }\right)\right]\\ \le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\\ & \phantom{\rule{1em}{0ex}}-2m\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\mathbb{E}\left[f\left({w}_{k+1}\right)-f\left({w}_{\ast }\right)\right]\\ =& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\left(1+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right]\\ & \phantom{\rule{1em}{0ex}}-2m\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)\mathbb{E}\left[f\left({w}_{k+1}\right)-f\left({w}_{\ast }\right)\right].\end{array}$

$\begin{array}{rl}& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\mathbb{E}\left[f\left({w}_{k+1}\right)-f\left({w}_{\ast }\right)\right]\\ \le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}\frac{1+2m{\eta }^{2}{\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}}{2m\eta \left(\gamma \lambda -\eta {\mathrm{\Gamma }}^{2}{\mathrm{\Lambda }}^{2}\right)}\mathbb{E}\left[f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right].\end{array}$

5 相关工作

Lucchi 等人 (2015) 独立地提出了利用方差缩减来加速随机拟牛顿法，并达到了线性收敛率。他们更新 Hessian 逆近似矩阵的方式与 L-BFGS 很相似，而我们的方法利用 Hessian-向量乘积来稳定该近似。

7 前言的证明

7.1 引理3的证明

由于 ${s}_{j}^{\top }{y}_{j}={s}_{j}^{\top }{\nabla }^{2}{f}_{{\mathcal{T}}_{j}}\left({u}_{j}\right){s}_{j}$，根据假设2有

$\begin{array}{}\text{(34)}& \lambda ‖{s}_{j}{‖}^{2}\le {s}_{j}^{\mathrm{\top }}{y}_{j}\le \mathrm{\Lambda }‖{s}_{j}{‖}^{2}.\end{array}$

$\frac{‖{y}_{j}{‖}^{2}}{{s}_{j}^{\mathrm{\top }}{y}_{j}}=\frac{{z}_{j}^{\mathrm{\top }}{\mathrm{\nabla }}^{2}{f}_{{\mathcal{T}}_{j}}\left({u}_{j}\right){z}_{j}}{{z}_{j}^{\mathrm{\top }}{z}_{j}},$

$\begin{array}{}\text{(35)}& \lambda \le \frac{‖{y}_{j}{‖}^{2}}{{s}_{j}^{\mathrm{\top }}{y}_{j}}\le \mathrm{\Lambda }.\end{array}$

$\begin{array}{}\text{(36)}& {B}_{r}^{\left(j\right)}={B}_{r}^{\left(j-1\right)}-\frac{{B}_{r}^{\left(j-1\right)}{s}_{j}{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}+\frac{{y}_{j}{y}_{j}^{\mathrm{\top }}}{{y}_{j}^{\mathrm{\top }}{s}_{j}}.\end{array}$

$\begin{array}{rl}tr\left({B}_{r}^{\left(j\right)}\right)& =tr\left({B}_{r}^{\left(j-1\right)}\right)-\frac{tr\left({B}_{r}^{\left(j-1\right)}{s}_{j}{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}\right)}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}+\frac{tr\left({y}_{j}{y}_{j}^{\mathrm{\top }}\right)}{{y}_{j}^{\mathrm{\top }}{s}_{j}}\\ & =tr\left({B}_{r}^{\left(j-1\right)}\right)-\frac{‖{B}_{r}^{\left(j-1\right)}{s}_{j}{‖}^{2}}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}+\frac{‖{y}_{j}{‖}^{2}}{{y}_{j}^{\mathrm{\top }}{s}_{j}}\\ & \le tr\left({B}_{r}^{\left(j-1\right)}\right)+\frac{‖{y}_{j}{‖}^{2}}{{y}_{j}^{\mathrm{\top }}{s}_{j}}\\ & \le tr\left({B}_{r}^{\left(j-1\right)}\right)+\mathrm{\Lambda }.\end{array}$

$tr\left({B}_{r}^{\left(0\right)}\right)=d\frac{‖{y}_{r}{‖}^{2}}{{s}_{r}^{\mathrm{\top }}{y}_{r}}\le d\mathrm{\Lambda },$

$tr\left({B}_{r}\right)\le \left(d+M\right)\mathrm{\Lambda }.$

$\begin{array}{rl}det\left({B}_{r}^{\left(j\right)}\right)& =det\left({B}_{r}^{\left(j-1\right)}\right)\\ & \phantom{\rule{1em}{0ex}}\phantom{\rule{thinmathspace}{0ex}}det\left(I-\frac{{s}_{j}{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}+\frac{\left({B}_{r}^{\left(j-1\right)}{\right)}^{-1}{y}_{j}{y}_{j}^{\mathrm{\top }}}{{y}_{j}^{\mathrm{\top }}{s}_{j}}\right)\\ & =det\left({B}_{r}^{\left(j-1\right)}\right)\frac{{y}_{j}^{\mathrm{\top }}{s}_{j}}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}\\ & =det\left({B}_{r}^{\left(j-1\right)}\right)\frac{{y}_{j}^{\mathrm{\top }}{s}_{j}}{‖{s}_{j}{‖}^{2}}\frac{‖{s}_{j}{‖}^{2}}{{s}_{j}^{\mathrm{\top }}{B}_{r}^{\left(j-1\right)}{s}_{j}}\\ & \ge det\left({B}_{r}^{\left(j-1\right)}\right)\frac{\lambda }{{\lambda }_{max}\left({B}_{r}^{\left(j-1\right)}\right)}\\ & \ge det\left({B}_{r}^{\left(j-1\right)}\right)\frac{\lambda }{tr\left({B}_{r}^{\left(j-1\right)}\right)}\\ & \ge det\left({B}_{r}^{\left(j-1\right)}\right)\frac{\lambda }{\left(d+M\right)\mathrm{\Lambda }}.\end{array}$

$\begin{array}{lrl}\text{(37)}& \det\left(I+{u}_{1}{v}_{1}^{\top }+{u}_{2}{v}_{2}^{\top }\right)=& \left(1+{u}_{1}^{\top }{v}_{1}\right)\left(1+{u}_{2}^{\top }{v}_{2}\right)-\left({u}_{1}^{\top }{v}_{2}\right)\left({v}_{1}^{\top }{u}_{2}\right)\end{array}$

$det\left({B}_{r}^{\left(0\right)}\right)={\left(\frac{‖{y}_{r}{‖}^{2}}{{s}_{r}^{\mathrm{\top }}{y}_{r}}\right)}^{d}\ge {\lambda }^{d},$

$det\left({B}_{r}\right)\ge \frac{{\lambda }^{d+M}}{\left(\left(d+M\right)\mathrm{\Lambda }{\right)}^{M}}.$

7.2 引理4的证明

${\lambda }_{max}\left({B}_{r}\right)\le tr\left({B}_{r}\right)\le \left(d+M\right)\mathrm{\Lambda }.$

${\lambda }_{min}\left({B}_{r}\right)\ge \frac{det\left({B}_{r}\right)}{{\lambda }_{max}\left({B}_{r}{\right)}^{d-1}}\ge \frac{{\lambda }^{d+M}}{\left(\left(d+M\right)\mathrm{\Lambda }{\right)}^{d+M-1}}.$

$\frac{1}{\left(d+M\right)\mathrm{\Lambda }}I⪯{H}_{r}⪯\frac{\left(\left(d+M\right)\mathrm{\Lambda }{\right)}^{d+M-1}}{{\lambda }^{d+M}}I.$

7.3 引理6的证明

$0={g}_{\mathcal{S}}\left({w}_{\ast }\right)\le {g}_{\mathcal{S}}\left(w-\frac{1}{\mathrm{\Lambda }}\nabla {g}_{\mathcal{S}}\left(w\right)\right)\le {g}_{\mathcal{S}}\left(w\right)-\frac{1}{2\mathrm{\Lambda }}\|\nabla {g}_{\mathcal{S}}\left(w\right)\|^{2}.$

$\begin{array}{rl}& \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}‖\mathrm{\nabla }{f}_{\mathcal{S}}\left(w\right)-\mathrm{\nabla }{f}_{\mathcal{S}}\left({w}_{\ast }\right){‖}^{2}\\ \le & \phantom{\rule{thinmathspace}{0ex}}\phantom{\rule{thinmathspace}{0ex}}2\mathrm{\Lambda }\left({f}_{\mathcal{S}}\left(w\right)-{f}_{\mathcal{S}}\left({w}_{\ast }\right)-\mathrm{\nabla }{f}_{\mathcal{S}}\left({w}_{\ast }{\right)}^{\mathrm{\top }}\left(w-{w}_{\ast }\right)\right).\end{array}$

$\begin{array}{lrl}\text{(38)}& {\binom{N}{b}}^{-1}\sum _{|\mathcal{S}|=b}\|\nabla {f}_{\mathcal{S}}\left(w\right)-\nabla {f}_{\mathcal{S}}\left({w}_{\ast }\right)\|^{2}\le & 2\mathrm{\Lambda }\left(f\left(w\right)-f\left({w}_{\ast }\right)\right).\end{array}$

$\begin{array}{lrl}\text{(39)}& {\mathbb{E}}_{k,t}\left[\|{v}_{t}\|^{2}\right]\le & 2{\mathbb{E}}_{k,t}\left[\|\nabla {f}_{\mathcal{S}}\left({x}_{t}\right)-\nabla {f}_{\mathcal{S}}\left({w}_{\ast }\right)\|^{2}\right]\\ & & \quad +2{\mathbb{E}}_{k,t}\left[\|\nabla {f}_{\mathcal{S}}\left({w}_{k}\right)-\nabla {f}_{\mathcal{S}}\left({w}_{\ast }\right)-{\mu }_{k}\|^{2}\right]\\ & \le & 2{\mathbb{E}}_{k,t}\left[\|\nabla {f}_{\mathcal{S}}\left({x}_{t}\right)-\nabla {f}_{\mathcal{S}}\left({w}_{\ast }\right)\|^{2}\right]\\ & & \quad +2{\mathbb{E}}_{k,t}\left[\|\nabla {f}_{\mathcal{S}}\left({w}_{k}\right)-\nabla {f}_{\mathcal{S}}\left({w}_{\ast }\right)\|^{2}\right]\\ & \le & 4\mathrm{\Lambda }\left(f\left({x}_{t}\right)-f\left({w}_{\ast }\right)+f\left({w}_{k}\right)-f\left({w}_{\ast }\right)\right).\end{array}$

8 探讨

Moritz, P., Nishihara, R., & Jordan, M. (2016, May). A linearly-convergent stochastic L-BFGS algorithm. In Artificial Intelligence and Statistics (pp. 249-258).