1. 梯度可以理解为损失函数对待优化参数的导数。梯度下降每次更新参数时,从参数中减去"学习率 × 梯度",即 w ← w − learning_rate × ∂loss/∂w。
#coding:utf-8
#4.2学习率
#设损失函数为 loss=(w+1)^2,令w的初值为常数5。反向传播就是求最优的w,即使loss取得最小值时对应的w值。
import tensorflow as tf
import numpy as np
# Learning-rate demo (TensorFlow 1.x graph/session API): minimise
# loss = (w + 1)^2 by plain gradient descent, starting from w = 5.
# The minimum is at w = -1, where loss = 0.
LEARNING_RATE = 0.2  # fixed step size passed to the optimizer
STEPS = 40  # number of gradient-descent updates to run

# Parameter to optimise, initialised to 5.
w = tf.Variable(tf.constant(5, dtype=tf.float32))
# Loss function.
loss = tf.square(w + 1)
# Each run of train_step applies one gradient-descent update to w.
train_step = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
# Op that initialises every variable in the graph.
init_op = tf.global_variables_initializer()

# Create a session and train for STEPS rounds.
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(STEPS):
        sess.run(train_step)
        # Read w and loss together, after the update has been applied.
        w_val, loss_val = sess.run([w, loss])
        # i is zero-based, so i + 1 updates have been applied at this point.
        print("after %s steps: w is %f, loss is %f." % (i + 1, w_val, loss_val))
指数衰减学习率的代码如下:
#coding:utf-8
#4.2学习率 指数衰减学习率
#设损失函数为 loss=(w+1)^2,令w的初值为常数5。反向传播就是求最优的w,即使loss取得最小值时对应的w值。
#使用指数衰减学习率,在迭代的初期得到较高的下降速度,可以在较小的训练轮数下,得到较好的收敛度
import tensorflow as tf
import numpy as np
# Exponentially decaying learning-rate demo (TensorFlow 1.x graph/session
# API): minimise loss = (w + 1)^2 by gradient descent, starting from w = 5.
LEARNING_RATE_BASE = 0.1  # initial learning rate
LEARNING_RATE_DECAY = 0.99  # multiplicative decay factor
# Number of training steps between learning-rate updates; commonly set to
# (total number of samples) / BATCH_SIZE.
LEARNING_RATE_STEP = 1
STEPS = 40  # number of gradient-descent updates to run

# Counter of training steps run so far; starts at 0 and is excluded from
# training so the optimizer never tries to optimise it.
global_step = tf.Variable(0, trainable=False)
# Decayed rate: LEARNING_RATE_BASE *
#   LEARNING_RATE_DECAY ** (global_step / LEARNING_RATE_STEP).
# With staircase=True the exponent uses integer division, so the rate
# changes in discrete steps.
learning_rate = tf.train.exponential_decay(
    LEARNING_RATE_BASE,
    global_step,
    LEARNING_RATE_STEP,
    LEARNING_RATE_DECAY,
    staircase=True,
)
# Parameter to optimise, initialised to 5.
w = tf.Variable(tf.constant(5, dtype=tf.float32))
# Loss function.
loss = tf.square(w + 1)
# Passing global_step makes minimize() increment it on every update, which
# is what drives the learning-rate decay.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step
)
# Op that initialises every variable in the graph.
init_op = tf.global_variables_initializer()

# Create a session and train for STEPS rounds.
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(STEPS):
        sess.run(train_step)
        # Read all reported values in one call, after the update.
        learning_rate_val, global_step_val, w_val, loss_val = sess.run(
            [learning_rate, global_step, w, loss]
        )
        # i is zero-based, so i + 1 updates have been applied at this point
        # (this now matches the reported global_step).
        print("after %s steps: global_step is %f ,w is %f, learning_rate is %f ,loss is %f." % (i + 1, global_step_val, w_val, learning_rate_val, loss_val))