梯度下降的动量法是一种优化算法,用于加速梯度下降算法的收敛,特别是在具有高曲率、嘈杂梯度或有鞍点的情况下。动量法的核心思想是在梯度更新时引入一个动量项,累积之前的梯度更新方向,从而平滑优化路径,减少震荡并加快收敛速度。在动量法中,我们为参数更新引入了一个动量项,它代表了之前所有梯度的指数加权平均。
动量的物理意义:动量可以类比为物理学中的动量概念。在一个高低起伏的损失表面上,动量帮助模型在优化时更容易克服局部的阻力(如鞍点)和震荡现象。
累积效应:由于每次更新时都保留了一部分之前的更新量(通过动量项 $v_t$),当多次迭代的梯度朝着同一方向时,动量法会加速更新,因为这些更新会累积;反之,如果梯度方向不断变化,则动量法会平滑更新,防止过度调整。
动量法的优势
加快收敛速度:动量法能加快梯度下降在凹谷等方向的收敛速度,减少迭代次数。
减少振荡:动量法通过累积更新方向,减少了沿着梯度方向的振荡,尤其是在损失函数呈现较大变化的情况下。
处理鞍点:在鞍点(梯度为零但并非极小值)附近,标准梯度下降可能停滞,而动量法能帮助模型越过这些点,继续寻找全局最小值。
下面给出 C++ 代码,任务是用梯度下降的动量法拟合一个由 10 个参数决定的线性函数的参数值(线性且不考虑偏移量)。
#include <iostream>
#include <cstring>
#include <algorithm>
#include <random>
using namespace std;
// Number of linear coefficients to recover.
const int factor_num = 10;
// Number of synthetic training samples generated in main().
const int trainset_num = 100;
// Ground-truth coefficients, valid indices 1..factor_num (1-based; extra slack in size).
double x[factor_num + 10];
// Training data: row i holds features in columns 1..factor_num and the
// exact linear target in column factor_num + 1.
double train[trainset_num + 10][factor_num + 10];
// Current coefficient estimate, updated by gradient descent in main().
double pre_x[factor_num + 10];
// Learning rate; decayed multiplicatively each epoch in main().
double lr = 0.001;
// Per-coefficient momentum term carried between epochs.
double last_m[factor_num + 10];
// Mean squared error of the current estimate pre_x over the training set.
// Each sample's prediction is the dot product of its features (columns
// 1..factor_num) with pre_x; the target is stored in column factor_num + 1.
double getloss()
{
    double total = 0.0;
    for (int sample = 1; sample <= trainset_num; sample++)
    {
        double prediction = 0.0;
        for (int f = 1; f <= factor_num; f++)
        {
            prediction += train[sample][f] * pre_x[f];
        }
        const double residual = prediction - train[sample][factor_num + 1];
        total += residual * residual;
    }
    return total / trainset_num;
}
int main()
{
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dis(-10.0, 10.0), dis1(-10.0, 10.0), dis2(1.0, 5.0);
for (int i = 1; i <= factor_num; i++)
{
x[i] = dis(gen);
pre_x[i] = dis2(gen);
}
cout << "first x: ";
for (int i = 1; i <= factor_num; i++)
{
cout << pre_x[i] << ' ';
}
cout << endl
<< "ture x: ";
for (int i = 1; i <= factor_num; i++)
{
cout << x[i] << ' ';
}
for (int i = 1; i <= trainset_num; i++)
{
double ans = 0.0;
for (int j = 1; j <= factor_num; j++)
{
train[i][j] = dis1(gen);
ans += train[i][j] * x[j];
}
train[i][factor_num + 1] = ans;
}
cout << endl
<< "first loss: " << getloss() << endl;
int e = 10;
while (e--)
{
for (int k = 1; k <= factor_num; k++)
{
double q = 0.0;
for (int i = 1; i <= trainset_num; i++)
{
double p = 0.0;
for (int j = 1; j <= factor_num; j++)
{
p += train[i][j] * pre_x[j];
}
p -= train[i][factor_num + 1];
p *= pre_x[k];
q += p;
}
q /= trainset_num;
pre_x[k] = pre_x[k] - lr * (q * 0.8 + last_m[k] * 0.2);
last_m[k] = q;
}
lr *= 0.95;
if (e % 1 == 0)
{
cout << "loss: " << getloss() << endl;
}
}
cout << "after train: ";
for (int i = 1; i <= factor_num; i++)
{
cout << pre_x[i] << ' ';
}
}