python实现tensorflow2.0的常见优化器

最新推荐文章于 2024-03-20 09:21:02 发布

广小辉

最新推荐文章于 2024-03-20 09:21:02 发布

阅读量1.1k

点赞数

分类专栏：人工智能系列推荐系统

本文链接：https://blog.csdn.net/Galbraith_/article/details/109092131

版权

人工智能系列推荐系统专栏收录该内容

11 篇文章 0 订阅

订阅专栏

tensorflow2.0中的常见优化器如下：
在这里插入图片描述
其中，adam是最常使用的，比如esmm论文中使用。
下面通过python实现几种常见的优化器。其中使用了tensorflow2.0 的tf.GradientTape来自动求微分。

数据集构造

build data

import tensorflow as tf
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Sample the target parabola y = 0.5 * x^2 - 2 * x + 0.5 on [-3, 7) in steps of 0.1.
x = np.arange(-3, 7, 0.1)
# Vectorised form of the same polynomial (same operation order per element).
y = 0.5 * x * x - 2 * x + 0.5
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=x, y=y, markers=True, dashes=False)
plt.grid(True)

在这里插入图片描述

# Wrap the NumPy samples as TensorFlow tensors (x first, then y).
x, y = map(tf.convert_to_tensor, (x, y))

SGD算法

计算损失函数(loss function， cost function)
计算损失函数关于模型参数 $\theta$ 的局部梯度【先求导数，然后带入点坐标】
沿着梯度方向进行下一次的迭代
直到梯度为0时，就达到了误差函数的最小值

在梯度下降中，一个最重要的参数就是学习率lr。
如果学习率太小，模型需要多次迭代才能达到最小值点，导致模型的训练时间变长；

如果学习率太大，模型在更新过程中，比较震荡，有可能会跳过最小值
在这里插入图片描述

# 损失函数

def sgd(a, b, x, y, lr=0.001):
    """Take one plain gradient-descent step on ``y_pred = a * x**2 + b * x``.

    The loss is the mean squared error over all of ``x`` / ``y`` passed in.

    Args:
        a: Current coefficient of ``x**2``.
        b: Current coefficient of ``x``.
        x: Input samples.
        y: Target values for ``x``.
        lr: Learning rate.

    Returns:
        The updated ``(a, b)``.
    """
    # A single non-persistent tape is enough: both gradients come from one
    # ``gradient`` call, after which the tape releases its resources.
    with tf.GradientTape() as tape:
        # Watch explicitly so this also works when a and b are plain tensors
        # rather than tf.Variable objects.
        tape.watch([a, b])
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        loss = tf.keras.losses.MeanSquaredError()(y, y_pred)
    dl_da, dl_db = tape.gradient(loss, [a, b])

    # theta <- theta - lr * dL/dtheta
    a = a - lr * dl_da
    b = b - lr * dl_db
    return a, b

def train(x, y):
    """Fit ``y_pred = a * x**2 + b * x`` to ``(x, y)`` by repeated ``sgd`` steps.

    ``a`` and ``b`` start from random normal float64 values. The loop runs at
    most ``MAX_ITER`` steps and stops early once the loss has changed by less
    than ``TOL`` on three steps (the counter is never reset, so those steps
    need not be consecutive).

    Returns:
        ``(a, b, cost_value_list)`` where the list holds the loss measured
        after every step.
    """
    # Training hyper-parameters.
    MAX_ITER = 500
    TOL = 1e-5

    def _mse(coef_a, coef_b):
        # Mean squared error of the current fit.
        fitted = tf.math.add(tf.math.multiply(coef_a, tf.math.pow(x, 2)), tf.math.multiply(coef_b, tf.math.pow(x, 1)))
        return tf.keras.losses.MeanSquaredError()(y, fitted)

    # Random initial model parameters.
    a = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    b = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    print('初始值为a={}, b={}'.format(a, b))

    cost_value_list = []
    small_steps = 0
    for _ in range(MAX_ITER):
        cost_before = _mse(a, b)
        a, b = sgd(a, b, x, y)
        cost_after = _mse(a, b)
        cost_value_list.append(cost_after)

        if abs(cost_before - cost_after) < TOL:
            small_steps += 1
        if small_steps == 3:
            break

    return a, b, cost_value_list
        
# Run the training loop defined above and plot the loss after each step.
a, b, cost_list = train(x, y)
print('最终值为a={}, b={}'.format(a, b))
# Convert the eager tensors to NumPy values so they can be plotted.
cost_list = [cost.numpy() for cost in cost_list]
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=list(range(len(cost_list))), y=cost_list)
plt.grid(True)

在这里插入图片描述

动量优化Momentum

梯度下降算法的变量更新公式只是用权重θ减去损失函数 $J(\theta)$ 相对于θ的局部梯度与学习率η的乘积：

方程为： θ=θ-η∇J(θ)

然而它并不关心以前的梯度是什么，如果局部梯度很小，那么这一次的更新会非常慢；

动量优化momentum很关心之前的梯度，在每次迭代时，它将动量矢量（累计历史梯度信息动量）m（乘以动量系数β）与局部梯度相加，并通过简单地减去或加上该动量矢量来更新权重。换句话说，梯度作用于加速度，不作用于速度，人为引入一个初速度βm；
在这里插入图片描述

def momentum(a, b, ma, mb, x, y, lr=0.001, beta=0.9):
    """Take one momentum-optimizer step on ``y_pred = a * x**2 + b * x``.

    Update rule, applied identically to both parameters:
    ``m <- beta * m + lr * grad`` followed by ``theta <- theta - m``.

    Args:
        a: Current coefficient of ``x**2``.
        b: Current coefficient of ``x``.
        ma: Momentum (velocity) accumulated for ``a``.
        mb: Momentum (velocity) accumulated for ``b``.
        x: Input samples.
        y: Target values for ``x``.
        lr: Learning rate.
        beta: Momentum decay coefficient.

    Returns:
        ``(a, b, ma, mb)`` after the step.
    """
    # One non-persistent tape yields both gradients in a single call.
    with tf.GradientTape() as tape:
        tape.watch([a, b])
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        loss = tf.keras.losses.MeanSquaredError()(y, y_pred)
    dl_da, dl_db = tape.gradient(loss, [a, b])

    ma = beta * ma + lr * dl_da
    mb = beta * mb + lr * dl_db

    # The learning rate is already folded into the momentum, so it must not
    # be applied again here (the original scaled b's step by lr a second time).
    a = a - ma
    b = b - mb
    return a, b, ma, mb

def train(x, y):
    """Fit ``y_pred = a * x**2 + b * x`` to ``(x, y)`` with ``momentum`` steps.

    ``a`` and ``b`` start from random normal float64 values and the momentum
    terms start at zero. The loop runs at most ``MAX_ITER`` steps and stops
    early once the loss has changed by less than ``TOL`` on three steps (the
    counter is never reset, so those steps need not be consecutive).

    Returns:
        ``(a, b, cost_value_list)`` where the list holds the loss measured
        after every step.
    """
    # Training hyper-parameters.
    MAX_ITER = 500
    TOL = 1e-5

    # Random initial model parameters.
    a = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    b = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    cost_value_list = []
    print('初始值为a={}, b={}'.format(a, b))
    # The velocity must start at zero; a non-zero start (0.9 in the original)
    # makes the first update an arbitrary jump unrelated to the gradient.
    ma = 0.0
    mb = 0.0

    patience = 0
    for _ in range(MAX_ITER):
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        pre_cost = tf.keras.losses.MeanSquaredError()(y, y_pred)
        a, b, ma, mb = momentum(a, b, ma, mb, x, y)
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        post_cost = tf.keras.losses.MeanSquaredError()(y, y_pred)

        cost_value_list.append(post_cost)
        cost_delta = abs(pre_cost - post_cost)
        if cost_delta < TOL:
            patience += 1
        if patience == 3:
            break

    return a, b, cost_value_list
        
# Run the training loop defined above and plot the loss after each step.
a, b, cost_list = train(x, y)
print('最终值为a={}, b={}'.format(a, b))
# Convert the eager tensors to NumPy values so they can be plotted.
cost_list = [cost.numpy() for cost in cost_list]
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=list(range(len(cost_list))), y=cost_list)
plt.grid(True)

在这里插入图片描述

Adagrad

在这里插入图片描述

def adagrad(a, b, sa, sb, x, y, lr=0.001, epison=0.00001):
    """Take one Adagrad step on ``y_pred = a * x**2 + b * x``.

    Each parameter keeps its own running sum of squared gradients and its
    step is divided by the square root of that sum:
    ``s <- s + grad**2`` followed by
    ``theta <- theta - lr * grad / sqrt(s + epison)``.

    Args:
        a: Current coefficient of ``x**2``.
        b: Current coefficient of ``x``.
        sa: Accumulated squared gradients for ``a``.
        sb: Accumulated squared gradients for ``b``.
        x: Input samples.
        y: Target values for ``x``.
        lr: Learning rate.
        epison: Small constant added under the square root to avoid dividing
            by zero.

    Returns:
        ``(a, b, sa, sb)`` after the step.
    """
    # One non-persistent tape yields both gradients in a single call.
    with tf.GradientTape() as tape:
        tape.watch([a, b])
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        loss = tf.keras.losses.MeanSquaredError()(y, y_pred)
    dl_da, dl_db = tape.gradient(loss, [a, b])

    # Each accumulator sums the square of its OWN gradient (the original
    # mixed dl_db * dl_da into sb).
    sa = sa + dl_da * dl_da
    sb = sb + dl_db * dl_db

    # Each parameter is scaled by its own accumulator (the original scaled b
    # by sa).
    a = a - lr * dl_da / tf.math.sqrt(sa + epison)
    b = b - lr * dl_db / tf.math.sqrt(sb + epison)
    return a, b, sa, sb

def train(x, y):
    """Fit ``y_pred = a * x**2 + b * x`` to ``(x, y)`` with ``adagrad`` steps.

    ``a`` and ``b`` start from random normal float64 values and both
    squared-gradient accumulators start at 0. The loop runs at most
    ``MAX_ITER`` steps and stops early once the loss has changed by less than
    ``TOL`` on three steps (the counter is never reset, so those steps need
    not be consecutive).

    Returns:
        ``(a, b, cost_value_list)`` where the list holds the loss measured
        after every step.
    """
    # Training hyper-parameters.
    MAX_ITER = 2000
    TOL = 1e-5

    def _mse(coef_a, coef_b):
        # Mean squared error of the current fit.
        fitted = tf.math.add(tf.math.multiply(coef_a, tf.math.pow(x, 2)), tf.math.multiply(coef_b, tf.math.pow(x, 1)))
        return tf.keras.losses.MeanSquaredError()(y, fitted)

    # Random initial model parameters.
    a = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    b = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    print('初始值为a={}, b={}'.format(a, b))

    # Optimizer state: accumulated squared gradients.
    sa, sb = 0, 0

    cost_value_list = []
    small_steps = 0
    for _ in range(MAX_ITER):
        cost_before = _mse(a, b)
        a, b, sa, sb = adagrad(a, b, sa, sb, x, y)
        cost_after = _mse(a, b)
        cost_value_list.append(cost_after)

        if abs(cost_before - cost_after) < TOL:
            small_steps += 1
        if small_steps == 3:
            break

    return a, b, cost_value_list
        
# Run the training loop defined above and plot the loss after each step.
a, b, cost_list = train(x, y)
print('最终值为a={}, b={}'.format(a, b))
# Convert the eager tensors to NumPy values so they can be plotted.
cost_list = [cost.numpy() for cost in cost_list]
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=list(range(len(cost_list))), y=cost_list)
plt.grid(True)

在这里插入图片描述
sa=sa+da* da，sb=sb+db*db 会随着轮数越来越大，然后导致学习率1/ np.sqrt(sa) 越来越小，权重更新得越慢。开始的时候，更新的多，越到后面，更新的越慢。【相当于自适应的gradient】

RMSProp

Adagrad的缺点：

越往后，模型的参数更新得越慢，并且可能始终无法收敛到全局最优；Adagrad更新权重时，学习率的缩放因子累积了自训练开始以来的所有梯度(sa积累了所有的da，sb积累了所有的db)
AdaGrad中：

sa = sa + da * da

sb = sb + db * db

a = a - lr * da / np.sqrt(sa+ epsilon)

b = b - lr * db / np.sqrt(sb+ epsilon)

就变化趋势来说：

随着轮数越来越多(steps 越来越大)，da，db越来越小(越来越平缓),但是sa和sb由于不断积累的原因，越来越大，归一化因子1 / np.sqrt(sa+ epsilon)越来越小，非常容易陷入局部最小值(lr很小，da很小，1 / np.sqrt(sa+ epsilon)也很小，因此它很难爬出局部最小值)

在这里插入图片描述

def rmsprop(a, b, sa, sb, x, y, lr=0.001, epison=0.00001, beta=0.9):
    """Take one RMSProp step on ``y_pred = a * x**2 + b * x``.

    Each parameter keeps an exponentially decayed average of its squared
    gradients and its step is divided by the square root of that average:
    ``s <- beta * s + (1 - beta) * grad**2`` followed by
    ``theta <- theta - lr * grad / sqrt(s + epison)``.

    Args:
        a: Current coefficient of ``x**2``.
        b: Current coefficient of ``x``.
        sa: Decayed average of squared gradients for ``a``.
        sb: Decayed average of squared gradients for ``b``.
        x: Input samples.
        y: Target values for ``x``.
        lr: Learning rate.
        epison: Small constant added under the square root to avoid dividing
            by zero.
        beta: Decay rate of the squared-gradient average.

    Returns:
        ``(a, b, sa, sb)`` after the step.
    """
    # One non-persistent tape yields both gradients in a single call.
    with tf.GradientTape() as tape:
        tape.watch([a, b])
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        loss = tf.keras.losses.MeanSquaredError()(y, y_pred)
    dl_da, dl_db = tape.gradient(loss, [a, b])

    sa = beta * sa + (1 - beta) * dl_da * dl_da
    sb = beta * sb + (1 - beta) * dl_db * dl_db

    # Each parameter is scaled by its own average (the original scaled b by
    # sa).
    a = a - lr * dl_da / tf.math.sqrt(sa + epison)
    b = b - lr * dl_db / tf.math.sqrt(sb + epison)
    return a, b, sa, sb

def train(x, y):
    """Fit ``y_pred = a * x**2 + b * x`` to ``(x, y)`` with ``rmsprop`` steps.

    ``a`` and ``b`` start from random normal float64 values and both
    squared-gradient averages start at 0. The loop runs at most ``MAX_ITER``
    steps and stops early once the loss has changed by less than ``TOL`` on
    three steps (the counter is never reset, so those steps need not be
    consecutive).

    Returns:
        ``(a, b, cost_value_list)`` where the list holds the loss measured
        after every step.
    """
    # Training hyper-parameters.
    MAX_ITER = 2000
    TOL = 1e-5

    def _mse(coef_a, coef_b):
        # Mean squared error of the current fit.
        fitted = tf.math.add(tf.math.multiply(coef_a, tf.math.pow(x, 2)), tf.math.multiply(coef_b, tf.math.pow(x, 1)))
        return tf.keras.losses.MeanSquaredError()(y, fitted)

    # Random initial model parameters.
    a = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    b = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    print('初始值为a={}, b={}'.format(a, b))

    # Optimizer state: decayed averages of squared gradients.
    sa, sb = 0, 0

    cost_value_list = []
    small_steps = 0
    for _ in range(MAX_ITER):
        cost_before = _mse(a, b)
        a, b, sa, sb = rmsprop(a, b, sa, sb, x, y)
        cost_after = _mse(a, b)
        cost_value_list.append(cost_after)

        if abs(cost_before - cost_after) < TOL:
            small_steps += 1
        if small_steps == 3:
            break

    return a, b, cost_value_list
        
# Run the training loop defined above and plot the loss after each step.
a, b, cost_list = train(x, y)
print('最终值为a={}, b={}'.format(a, b))
# Convert the eager tensors to NumPy values so they can be plotted.
cost_list = [cost.numpy() for cost in cost_list]
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=list(range(len(cost_list))), y=cost_list)
plt.grid(True)

在这里插入图片描述
sa = beta * sa + (1-beta) * da * da = 0.9 * 以前的 + 0.1 * 现在的（epsilon 只在开方时加入，用于防止除零）

sb = beta * sb + (1-beta) * db * db = 0.9 * 以前的 + 0.1 * 现在的（epsilon 只在开方时加入，用于防止除零）

梯度da，db总体上会随着训练的step越多越来越小，sa和sb也会越来越小。那么归一化因子 $\frac{1}{\sqrt{s + e}}$ 会越来越大。当损失函数处于局部最小值时，只要学习率足够大，容易爬出局部最小值。

Adam

Adam = momentum + RMSProp

momentum: 加入阻尼

RMSProp：在adagrad考虑梯度平方的基础上，加入阻尼（指数衰减）
动量：研究人员和从业人员都喜欢使用把球滚下山坡而向局部极小值更快滚动的类比法，但从本质上讲，我们必须知道的是，动量算法在相关方向上加速了随机梯度下降，以及抑制振荡。为了将动量引入我们的神经网络，我们将时间元素添加到过去时间步长的更新向量中，并将其添加到当前更新向量中。这样可以使球的动量增加一定程度。
在上面的公式中，θ是网络的参数（权重，偏差或者激活值），η是学习率，J是需要优化的目标函数，γ是常数项，也称为动量系数。v_{t-1}是上一个时间步的更新向量，v_t是当前时间步的更新向量。动量项γ通常取0.9或相近的值。
在这里插入图片描述

适应性学习率：通过将学习率降低

我们在AdaGrad，RMSProp和Adam和Adadelta中可以看到预定义时间调度schedule，可以将自适应学习率视为训练阶段的学习率调整。

RMSProp(均方根传播)的目的是解决AdaGrad学习率急剧下降、导致模型陷入局部最小值的问题。RMSProp更改学习率的速度比AdaGrad慢一些。
学习率随着时间自适应。
在这里插入图片描述

def adam(idx, a, b, ma, mb, sa, sb, x, y, lr=0.001, epison=0.00001,
         beta1=tf.constant(0.9, dtype=tf.dtypes.float64), beta2=tf.constant(0.999, dtype=tf.dtypes.float64)):
    """Take one Adam step on ``y_pred = a * x**2 + b * x``.

    For each parameter:
    ``m <- beta1 * m + (1 - beta1) * grad``,
    ``s <- beta2 * s + (1 - beta2) * grad**2``,
    ``m_hat = m / (1 - beta1**idx)``, ``s_hat = s / (1 - beta2**idx)``,
    ``theta <- theta - lr * m_hat / sqrt(s_hat + epison)``.

    Args:
        idx: 1-based step number used for bias correction. It must be >= 1,
            because 0 makes the correction denominator zero.
        a: Current coefficient of ``x**2``.
        b: Current coefficient of ``x``.
        ma: Running first moment (gradient average) for ``a``.
        mb: Running first moment for ``b``.
        sa: Running second moment (squared-gradient average) for ``a``.
        sb: Running second moment for ``b``.
        x: Input samples.
        y: Target values for ``x``.
        lr: Learning rate.
        epison: Small constant added under the square root to avoid dividing
            by zero.
        beta1: Decay rate of the first moment.
        beta2: Decay rate of the second moment.

    Returns:
        ``(a, b, ma, mb, sa, sb)`` where the moments are the raw,
        uncorrected running averages to pass into the next step.
    """
    # One non-persistent tape yields both gradients in a single call.
    with tf.GradientTape() as tape:
        tape.watch([a, b])
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        loss = tf.keras.losses.MeanSquaredError()(y, y_pred)
    dl_da, dl_db = tape.gradient(loss, [a, b])

    # Update the raw running moments.
    ma = beta1 * ma + (1 - beta1) * dl_da
    sa = beta2 * sa + (1 - beta2) * dl_da * dl_da
    mb = beta1 * mb + (1 - beta1) * dl_db
    sb = beta2 * sb + (1 - beta2) * dl_db * dl_db

    # Bias correction is applied to temporaries only. Writing the corrected
    # values back into the state (as the original did) would feed them into
    # the next step's averages and compound the correction.
    m_correction = 1 - tf.math.pow(beta1, idx)
    s_correction = 1 - tf.math.pow(beta2, idx)
    ma_hat = ma / m_correction
    sa_hat = sa / s_correction
    mb_hat = mb / m_correction
    sb_hat = sb / s_correction

    a = a - lr * ma_hat / tf.math.sqrt(sa_hat + epison)
    b = b - lr * mb_hat / tf.math.sqrt(sb_hat + epison)
    return a, b, ma, mb, sa, sb

def train(x, y):
    """Fit ``y_pred = a * x**2 + b * x`` to ``(x, y)`` with ``adam`` steps.

    ``a`` and ``b`` start from random normal float64 values and all moment
    estimates start at zero. The loop runs at most ``MAX_ITER`` steps, passing
    a 1-based step number to ``adam``, and stops early once the loss has
    changed by less than ``TOL`` on three steps (the counter is never reset,
    so those steps need not be consecutive).

    Returns:
        ``(a, b, cost_value_list)`` where the list holds the loss measured
        after every step.
    """
    # Training hyper-parameters.
    MAX_ITER = 1000
    TOL = 1e-5

    # Random initial model parameters.
    a = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    b = tf.random.normal(shape=(1,), dtype=tf.dtypes.float64)
    cost_value_list = []
    print('初始值为a={}, b={}'.format(a, b))
    # Both moments start at zero: Adam's bias correction assumes this, and the
    # original's 0.9 start for the first moment skewed the early updates.
    ma = tf.convert_to_tensor([0], dtype=tf.dtypes.float64)
    mb = tf.convert_to_tensor([0], dtype=tf.dtypes.float64)
    sa = tf.convert_to_tensor([0], dtype=tf.dtypes.float64)
    sb = tf.convert_to_tensor([0], dtype=tf.dtypes.float64)

    patience = 0
    # Steps are numbered 1..MAX_ITER inclusive; the original
    # `it = 1; while it < MAX_ITER` stopped one step short.
    for it in range(1, MAX_ITER + 1):
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        pre_cost = tf.keras.losses.MeanSquaredError()(y, y_pred)

        a, b, ma, mb, sa, sb = adam(it, a, b, ma, mb, sa, sb, x, y)
        y_pred = tf.math.add(tf.math.multiply(a, tf.math.pow(x, 2)), tf.math.multiply(b, tf.math.pow(x, 1)))
        post_cost = tf.keras.losses.MeanSquaredError()(y, y_pred)

        cost_value_list.append(post_cost)
        cost_delta = abs(pre_cost - post_cost)
        if cost_delta < TOL:
            patience += 1
        if patience == 3:
            break

    return a, b, cost_value_list
        
# Run the training loop defined above and plot the loss after each step.
a, b, cost_list = train(x, y)
print('最终值为a={}, b={}'.format(a, b))
# Convert the eager tensors to NumPy values so they can be plotted.
cost_list = [cost.numpy() for cost in cost_list]
# x / y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
sns.lineplot(x=list(range(len(cost_list))), y=cost_list)
plt.grid(True)

在这里插入图片描述

广小辉

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
python实现tensorflow2.0的常见优化器

tensorlow2.0中的常见优化器如下：其中，adam是最常使用的，比如esmm论文中使用。下面通过python实现几种常见的优化器。其中使用了tensorflow2.0 的tf.GradientTape来自动求微分。数据集构造build dataimport tensorflow as tfimport numpy as npimport seaborn as snsfrom matplotlib import pyplot as pltx = np.arange(-3, 7, 0
复制链接

扫一扫