Adam Algorithm and Python Implementation

Algorithm Introduction

The Adam algorithm sits at the end of a line of optimizers: SGD → SGDM (SGD with momentum) → NAG (SGD with Nesterov acceleration) → AdaGrad → AdaDelta → Adam → AdaMax. It is one of the most commonly used optimizers in neural network training: it converges quickly and is a large improvement over plain SGD, whose convergence can be slow and erratic. However, because Adam's effective step sizes keep adapting and tend to shrink, it may fail to converge to the true optimum; a common practical recipe is therefore to optimize with Adam in the early phase of training and switch to SGD later.
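For reference, both implementations below follow the standard Adam update from the original paper: with gradient $g_t$ at step $t$, decay rates $\beta_1, \beta_2$, step size $\alpha$, and a small constant $\epsilon$,

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2$$

$$\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t}, \qquad \theta_t = \theta_{t-1} - \alpha\,\frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}$$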

Code Implementation 1

We now demonstrate the algorithm on the following unconstrained convex optimization problem:
$$\min\; 5x_1^2 + 2x_2^2 + 3x_1 - 10x_2 + 4$$
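Setting the gradient to zero ($10x_1 + 3 = 0$, $4x_2 - 10 = 0$) gives the analytical optimum $x^* = (-0.3,\ 2.5)$ with objective value $-8.95$, which the numerical solution below can be checked against.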

# Adam implementation

import numpy
from matplotlib import pyplot as plt


# Zeroth-order information of the objective (function value)
def func(X):
    funcVal = 5 * X[0, 0] ** 2 + 2 * X[1, 0] ** 2 + 3 * X[0, 0] - 10 * X[1, 0] + 4
    return funcVal


# First-order information of the objective (gradient)
def grad(X):
    grad_x1 = 10 * X[0, 0] + 3
    grad_x2 = 4 * X[1, 0] - 10
    gradVec = numpy.array([[grad_x1], [grad_x2]])
    return gradVec


# Define the starting point of the iteration
def seed(n=2):
    seedVec = numpy.random.uniform(-100, 100, (n, 1))
    return seedVec


class Adam(object):

    def __init__(self, _func, _grad, _seed):
        '''
        _func: objective function to be minimized
        _grad: gradient of the objective function
        _seed: starting point of the iteration
        '''
        self.__func = _func
        self.__grad = _grad
        self.__seed = _seed

        self.__xPath = list()
        self.__JPath = list()


    def get_solu(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1.e-8, zeta=1.e-6, maxIter=3000000):
        '''
        Compute the numerical solution.
        alpha: step size
        beta1: exponential decay rate for the first-moment estimate
        beta2: exponential decay rate for the second-moment estimate
        epsilon: small positive constant to avoid division by zero
        zeta: convergence tolerance
        maxIter: maximum number of iterations
        '''
        self.__init_path()

        x = self.__init_x()
        JVal = self.__calc_JVal(x)
        self.__add_path(x, JVal)
        grad = self.__calc_grad(x)
        m, v = numpy.zeros(x.shape), numpy.zeros(x.shape)
        for k in range(1, maxIter + 1):
            # print("k: {:3d},   JVal: {}".format(k, JVal))
            if self.__converged1(grad, zeta):
                self.__print_MSG(x, JVal, k)
                return x, JVal, True

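            # Adam update: biased moment estimates, bias correction, then the parameter step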
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad * grad
            m_ = m / (1 - beta1 ** k)
            v_ = v / (1 - beta2 ** k)

            alpha_ = alpha / (numpy.sqrt(v_) + epsilon)
            d = -m_
            xNew = x + alpha_ * d
            JNew = self.__calc_JVal(xNew)
            self.__add_path(xNew, JNew)
            if self.__converged2(xNew - x, JNew - JVal, zeta ** 2):
                self.__print_MSG(xNew, JNew, k + 1)
                return xNew, JNew, True

            gNew = self.__calc_grad(xNew)
            x, JVal, grad = xNew, JNew, gNew
        else:  # for-else: reached only if the loop exhausts maxIter without returning
            if self.__converged1(grad, zeta):
                self.__print_MSG(x, JVal, maxIter)
                return x, JVal, True

        print("Adam not converged after {} steps!".format(maxIter))
        return x, JVal, False


    def get_path(self):
        return self.__xPath, self.__JPath


    def __converged1(self, grad, epsilon):
        if numpy.linalg.norm(grad, ord=numpy.inf) < epsilon:
            return True
        return False


    def __converged2(self, xDelta, JDelta, epsilon):
        val1 = numpy.linalg.norm(xDelta, ord=numpy.inf)
        val2 = numpy.abs(JDelta)
        if val1 < epsilon or val2 < epsilon:
            return True
        return False


    def __print_MSG(self, x, JVal, iterCnt):
        print("Iteration steps: {}".format(iterCnt))
        print("Solution:\n{}".format(x.flatten()))
        print("JVal: {}".format(JVal))


    def __calc_JVal(self, x):
        return self.__func(x)


    def __calc_grad(self, x):
        return self.__grad(x)


    def __init_x(self):
        return self.__seed


    def __init_path(self):
        self.__xPath.clear()
        self.__JPath.clear()


    def __add_path(self, x, JVal):
        self.__xPath.append(x)
        self.__JPath.append(JVal)


class AdamPlot(object):

    @staticmethod
    def plot_fig(adamObj):
        x, JVal, tab = adamObj.get_solu(0.1)
        xPath, JPath = adamObj.get_path()

        fig = plt.figure(figsize=(10, 4))
        ax1 = plt.subplot(1, 2, 1)
        ax2 = plt.subplot(1, 2, 2)

        ax1.plot(numpy.arange(len(JPath)), JPath, "k.", markersize=1)
        ax1.plot(0, JPath[0], "go", label="starting point")
        ax1.plot(len(JPath)-1, JPath[-1], "r*", label="solution")

        ax1.legend()
        ax1.set(xlabel="$iterCnt$", ylabel="$JVal$")

        x1 = numpy.linspace(-100, 100, 300)
        x2 = numpy.linspace(-100, 100, 300)
        x1, x2 = numpy.meshgrid(x1, x2)
        f = numpy.zeros(x1.shape)
        for i in range(x1.shape[0]):
            for j in range(x1.shape[1]):
                f[i, j] = func(numpy.array([[x1[i, j]], [x2[i, j]]]))
        ax2.contour(x1, x2, f, levels=36)
        x1Path = list(item[0] for item in xPath)
        x2Path = list(item[1] for item in xPath)
        ax2.plot(x1Path, x2Path, "k--", lw=2)
        ax2.plot(x1Path[0], x2Path[0], "go", label="starting point")
        ax2.plot(x1Path[-1], x2Path[-1], "r*", label="solution")
        ax2.set(xlabel="$x_1$", ylabel="$x_2$")
        ax2.legend()

        fig.tight_layout()
        # plt.show()
        fig.savefig("plot_fig.png")



if __name__ == "__main__":
    adamObj = Adam(func, grad, seed())

    AdamPlot.plot_fig(adamObj)

[Figure: objective value vs. iteration count (left) and the optimization path over the contour plot (right), saved as plot_fig.png]
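As an optional sanity check, the same problem can also be handed to scipy.optimize.minimize (assuming SciPy is installed); since func and grad above expect a column vector of shape (2, 1), they are wrapped here for SciPy's flat 1-D interface.

# Optional verification with SciPy (assumes scipy is available)
from scipy.optimize import minimize

res = minimize(
    fun=lambda z: func(z.reshape(-1, 1)),
    x0=numpy.zeros(2),
    jac=lambda z: grad(z.reshape(-1, 1)).flatten(),
    method="BFGS",
)
print(res.x)  # should be close to (-0.3, 2.5), matching the analytical optimum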

Code Implementation 2

import numpy as np
import matplotlib.pyplot as plt
# Generate synthetic data: y = 2x + 1 + Gaussian noise
np.random.seed(0)
X = np.random.rand(100, 1)
y = 2 * X + 1 + 0.1 * np.random.randn(*X.shape)

# Define hyperparameters
learning_rate = 0.1
epochs = 1000
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-8

# Initialize parameters
w = np.random.randn(1)[0]
b = np.random.randn(1)[0]


# Adam optimizer
class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.m = np.zeros((1, 2))  # first-moment estimate for [w, b]
        self.v = np.zeros((1, 2))  # second-moment estimate for [w, b]
        self.t = 0                 # time step

    def update(self, grad):
        self.t += 1
        self.m = self.beta_1 * self.m + (1 - self.beta_1) * grad
        self.v = self.beta_2 * self.v + (1 - self.beta_2) * grad ** 2
        m_hat = self.m / (1 - self.beta_1 ** self.t)
        v_hat = self.v / (1 - self.beta_2 ** self.t)
        delta_param = -self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return delta_param

# Training loop
optimizer = AdamOptimizer(learning_rate, beta_1, beta_2, epsilon)

myw = [w]  # track w over training for plotting
myb = [b]  # track b over training for plotting
for epoch in range(epochs):
    # Forward pass
    y_pred = w * X + b

    # Compute the loss (tracked for reference; not used in the update)
    loss = np.mean((y - y_pred) ** 2) / 2

    # Backward pass: gradients of loss = mean((y - y_pred)**2) / 2
    grad_w = -(y - y_pred).T.dot(X) / len(X)
    grad_b = -(y - y_pred).sum() / len(X)

    # Update the weight and bias with the step returned by Adam
    grad = np.array([grad_w[0][0], grad_b])
    param_update = optimizer.update(grad)  # returns -lr * m_hat / (sqrt(v_hat) + eps)

    w += param_update[0, 0]
    myw.append(w)
    b += param_update[0, 1]
    myb.append(b)



print("Final parameters:")
print(f"w: {w}, b: {b}")

plt.plot(myw)
plt.title('w')
plt.show()

plt.plot(myb)
plt.title('b')
plt.show()

[Figure: trajectories of w and b over training]
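As a quick cross-check, the parameters found by Adam can be compared against the closed-form least-squares fit (a sketch assuming NumPy's polyfit); since the data were generated as y = 2x + 1 + noise, both results should be close to w ≈ 2 and b ≈ 1.

# Closed-form least-squares fit for comparison
w_ls, b_ls = np.polyfit(X.ravel(), y.ravel(), 1)
print(f"least-squares fit: w: {w_ls}, b: {b_ls}")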

References

https://www.cnblogs.com/xxhbdk/p/15063793.html
Paper: Kingma, D. P. and Ba, J., "Adam: A Method for Stochastic Optimization".
