梯度更新的例子
下面是一个线性回归的小例子,梯度带可以给出梯度更新的过程。
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
TRAIN_STEPS = 20

# Prepare the training data: y = 2x + 10 plus Gaussian noise.
train_X = np.linspace(-1, 1, 100)
train_Y = 2 * train_X + np.random.randn(*train_X.shape) * 0.33 + 10

# Trainable parameters of the model y = w * x + b.
w = tf.Variable(initial_value=1.0)
b = tf.Variable(initial_value=1.0)

optimizer = tf.keras.optimizers.SGD(0.1)
mse = tf.keras.losses.MeanSquaredError()

print("w:", w.numpy())
print("b:", b.numpy())

for i in range(TRAIN_STEPS):
    print("epoch:", i)
    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as tape:
        logit = w * train_X + b
        loss = mse(train_Y, logit)
    # Compute d(loss)/dw and d(loss)/db, then let the optimizer update w and b.
    gradients = tape.gradient(target=loss, sources=[w, b])
    optimizer.apply_gradients(zip(gradients, [w, b]))
    print("w:", w.numpy())
    print("w_gradients:", gradients[0].numpy())
    print("b:", b.numpy())
    print("b_gradients:", gradients[1].numpy())

# Draw the noisy samples and the fitted line.
plt.plot(train_X, train_Y, "+")
plt.plot(train_X, w * train_X + b)
plt.show()
注意,其中的 w 和 b 一定要设置成是变量,这样才可以进行梯度更新。
得到结果如下:
w: 1.0
b: 1.0
epoch: 0
w: 1.0613171
w_gradients: -0.6131705
b: 2.8052635
b_gradients: -18.052635
epoch: 1
w: 1.1184638
w_gradients: -0.57146645
b: 4.2494745
b_gradients: -14.442109
epoch: 2
w: 1.1717236
w_gradients: -0.532599
b: 5.4048433
b_gradients: -11.553687
epoch: 3
w: 1.2213612
w_gradients: -0.49637514
b: 6.3291383
b_gradients: -9.242949
epoch: 4
w: 1.2676226
w_gradients: -0.4626148
b: 7.068574
b_gradients: -7.394359
epoch: 5
w: 1.3107377
w_gradients: -0.43115103
b: 7.660123
b_gradients: -5.915488
epoch: 6
w: 1.3509204
w_gradients: -0.4018268
b: 8.133362
b_gradients: -4.7323904
epoch: 7
w: 1.3883702
w_gradients: -0.3744971
b: 8.511953
b_gradients: -3.7859123
epoch: 8
w: 1.4232727
w_gradients: -0.34902623
b: 8.814826
b_gradients: -3.0287292
epoch: 9
w: 1.4558015
w_gradients: -0.32528776
b: 9.057124
b_gradients: -2.422984
epoch: 10
w: 1.4861178
w_gradients: -0.3031639
b: 9.250963
b_gradients: -1.9383876
epoch: 11
w: 1.5143723
w_gradients: -0.28254467
b: 9.406034
b_gradients: -1.5507096
epoch: 12
w: 1.5407051
w_gradients: -0.26332784
b: 9.530091
b_gradients: -1.2405672
epoch: 13
w: 1.5652469
w_gradients: -0.245418
b: 9.629336
b_gradients: -0.99245346
epoch: 14
w: 1.5881195
w_gradients: -0.22872624
b: 9.708733
b_gradients: -0.79396325
epoch: 15
w: 1.6094365
w_gradients: -0.21316983
b: 9.772249
b_gradients: -0.6351708
epoch: 16
w: 1.6293037
w_gradients: -0.1986714
b: 9.823063
b_gradients: -0.5081376
epoch: 17
w: 1.6478196
w_gradients: -0.185159
b: 9.863714
b_gradients: -0.40651026
epoch: 18
w: 1.6650763
w_gradients: -0.17256568
b: 9.8962345
b_gradients: -0.3252076
epoch: 19
w: 1.6811591
w_gradients: -0.16082886
b: 9.922251
b_gradients: -0.260167
GradientTape类的参数
persistent
bool 类型,决定梯度带在调用一次 gradient 函数之后是否保留其记录的资源。默认为 False,此时梯度带在第一次调用 gradient 后即被释放,只能求导一次;如果设置为 True,则可以在同一个梯度带上多次调用 gradient 函数(注意这里指的是多次求导,而不是求二阶导数)。
persistent=True 用的并不多,因为在深度学习中,我们一般把 gradients=tape.gradient(target=loss,sources=[w,b]) 中的 sources 用中括号括起来,或在有网络模型时,用 model.trainable_variables 即可。
在做 CycleGAN 的时候用过一次 persistent=True:
def train_step(real_x, real_y):
    """Run one CycleGAN optimization step on a pair of batches.

    `generator_g` maps X -> Y and `generator_f` maps Y -> X. The tape is
    persistent because `tape.gradient` is called four times (once per
    network) on the same recorded forward pass.

    Args:
        real_x: batch of samples from domain X.
        real_y: batch of samples from domain Y.
    """
    with tf.GradientTape(persistent=True) as tape:
        # Translate each batch into the other domain.
        generated_y = generator_g(real_x, training=True)
        generated_x = generator_f(real_y, training=True)

        # Discriminator outputs on real and generated samples.
        disc_real_x = discriminator_x(real_x, training=True)
        disc_real_y = discriminator_y(real_y, training=True)
        disc_fake_x = discriminator_x(generated_x, training=True)
        disc_fake_y = discriminator_y(generated_y, training=True)

        # Adversarial losses for the generators.
        loss_gen_g = generator_loss(disc_fake_y)
        loss_gen_f = generator_loss(disc_fake_x)

        # Each discriminator is scored on its OWN domain (real vs. generated).
        # Pairing disc_real_x with disc_fake_y would leave discriminator_x
        # without any gradient from its generated samples.
        loss_disc_x = discriminator_loss(disc_real_x, disc_fake_x)
        loss_disc_y = discriminator_loss(disc_real_y, disc_fake_y)

        # Cycle consistency: X -> Y -> X and Y -> X -> Y.
        cycled_x = generator_f(generated_y, training=True)
        cycled_y = generator_g(generated_x, training=True)
        total_cycle_loss = calc_cycle_loss(real_x, cycled_x) + calc_cycle_loss(real_y, cycled_y)

        # Identity mapping: each generator is fed a sample of its target domain.
        same_x = generator_f(real_x, training=True)
        same_y = generator_g(real_y, training=True)

        total_loss_gen_g = loss_gen_g + total_cycle_loss + identity_loss(real_y, same_y)
        total_loss_gen_f = loss_gen_f + total_cycle_loss + identity_loss(real_x, same_x)

    # Four separate gradient calls on one tape -- this is why persistent=True.
    grad_gen_g = tape.gradient(total_loss_gen_g, generator_g.trainable_variables)
    grad_gen_f = tape.gradient(total_loss_gen_f, generator_f.trainable_variables)
    grad_disc_x = tape.gradient(loss_disc_x, discriminator_x.trainable_variables)
    grad_disc_y = tape.gradient(loss_disc_y, discriminator_y.trainable_variables)

    generator_g_optimizer.apply_gradients(zip(grad_gen_g, generator_g.trainable_variables))
    generator_f_optimizer.apply_gradients(zip(grad_gen_f, generator_f.trainable_variables))
    discriminator_x_optimizer.apply_gradients(zip(grad_disc_x, discriminator_x.trainable_variables))
    discriminator_y_optimizer.apply_gradients(zip(grad_disc_y, discriminator_y.trainable_variables))
从上面代码中可以看得出,在使用很多次 gradient 函数的时候,令 persistent=True 是必要的。
watch_accessed_variables
bool 类型,默认为 True,它的作用是控制那些可以 trainable 的变量是否会被自动监视 watch。默认情况下为 True,也就是说,即使不使用 watch 方法,那些 trainable 变量也会自动被 watch;如果设置为 False,则所有要求导的变量都必须手动使用 watch 监视,否则对未被监视的变量求导并不会报错,而是得到 None(见下面例子输出中的 WARNING)。
仍用文章开头的线性回归举例,但这次,我们只更新 w 变量,而不更新 b 变量:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
TRAIN_STEPS = 20

# Prepare the training data: y = 2x + 10 plus Gaussian noise.
train_X = np.linspace(-1, 1, 100)
train_Y = 2 * train_X + np.random.randn(*train_X.shape) * 0.33 + 10

w = tf.Variable(initial_value=1.0)
b = tf.Variable(initial_value=1.0)

optimizer = tf.keras.optimizers.SGD(0.1)
mse = tf.keras.losses.MeanSquaredError()

print("w:", w.numpy())
print("b:", b.numpy())

for i in range(TRAIN_STEPS):
    print("epoch:", i)
    # Automatic watching is switched off, so only the explicitly watched
    # variable `w` is tracked by the tape.
    with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(w)
        logit = w * train_X + b
        loss = mse(train_Y, logit)
    # `b` is not watched, so its gradient comes back as None.
    gradients = tape.gradient(target=loss, sources=[w, b])
    variable_list = tape.watched_variables()
    print(variable_list)
    # The optimizer warns about the missing gradient for `b` and updates only `w`.
    optimizer.apply_gradients(zip(gradients, [w, b]))
    print("w:", w.numpy())
    print("b:", b.numpy())

# Draw the noisy samples and the fitted line.
plt.plot(train_X, train_Y, "+")
plt.plot(train_X, w * train_X + b)
plt.show()
在上面的代码中,我们令 watch_accessed_variables=False,也就是不让梯度带自动监视任何变量,然后我们加入了 tape.watch(w),即只监视变量 w。这样 b 的梯度为 None,优化器会给出警告并跳过 b,只有 w 被更新,所以得到结果为:
w: 1.0
b: 1.0
epoch: 0
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.0>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.0733174
b: 1.0
epoch: 1
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.0733174>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.1416482
b: 1.0
epoch: 2
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.1416482>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.2053316
b: 1.0
epoch: 3
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.2053316>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.2646837
b: 1.0
epoch: 4
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.2646837>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.3199991
b: 1.0
epoch: 5
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.3199991>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.3715522
b: 1.0
epoch: 6
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.3715522>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.419599
b: 1.0
epoch: 7
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.419599>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.4643781
b: 1.0
epoch: 8
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.4643781>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.5061115
b: 1.0
epoch: 9
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.5061115>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.5450065
b: 1.0
epoch: 10
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.5450065>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.5812562
b: 1.0
epoch: 11
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.5812562>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.6150403
b: 1.0
epoch: 12
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.6150403>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.6465267
b: 1.0
epoch: 13
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.6465267>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.6758716
b: 1.0
epoch: 14
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.6758716>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.7032206
b: 1.0
epoch: 15
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.7032206>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.7287096
b: 1.0
epoch: 16
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.7287096>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.7524649
b: 1.0
epoch: 17
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.7524649>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.7746046
b: 1.0
epoch: 18
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.7746046>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.7952384
b: 1.0
epoch: 19
(<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=1.7952384>,)
WARNING:tensorflow:Gradients do not exist for variables ['Variable:0'] when minimizing the loss.
w: 1.8144689
b: 1.0
得到图像:
watched_variables 函数
返回被这个梯度带所监视追踪的变量。
注意:如果要在调用 gradient 函数之后再使用这个函数,应令 persistent=True(非持久的梯度带在第一次调用 gradient 后即释放资源);也可以在调用 gradient 之前使用它。
x = tf.Variable(initial_value=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
y = tf.Variable(initial_value=[[2.0, 4.0, 6.0], [8.0, 10.0, 12.0]])

with tf.GradientTape(persistent=True) as g:
    g.watch([x, y])
    z = tf.pow(x, 2) + tf.pow(y, 2)

# Gradients of z with respect to both x and y (a list of two tensors).
dz_dx = g.gradient(z, [x, y])

# Query which variables the tape has been tracking.
variable_list = g.watched_variables()
print(variable_list)
(<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
[4., 5., 6.]], dtype=float32)>, <tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 4., 6.],
[ 8., 10., 12.]], dtype=float32)>)
debug
TypeError: zip argument #2 must support iteration
在只有一个变量的情况下可能出现这个错误,如:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
TRAIN_STEPS = 20

# Prepare the training data: y = 2x plus Gaussian noise.
train_X = np.linspace(-1, 1, 100)
train_Y = 2 * train_X + np.random.randn(*train_X.shape) * 0.33

w = tf.Variable(initial_value=1.0)

optimizer = tf.keras.optimizers.SGD(0.1)
mse = tf.keras.losses.MeanSquaredError()

print("w:", w.numpy())

for i in range(TRAIN_STEPS):
    print("epoch:", i)
    with tf.GradientTape() as tape:
        logit = w * train_X
        loss = mse(train_Y, logit)
    gradients = tape.gradient(target=loss, sources=[w])  # compute gradients
    # Intentionally wrong (reproduces the TypeError): the second argument of
    # zip is the bare variable `w` instead of the list `[w]`.
    optimizer.apply_gradients(zip(gradients, w))
    print("w:", w.numpy())
这是因为在更新梯度时,w 没有用中括号括起来,应改为:
# Wrap the variable in a list so that zip can pair it with the gradient list.
optimizer.apply_gradients(zip(gradients, [w]))
TypeError: Cannot iterate over a scalar tensor
在只有一个变量的情况下可能出现这个错误,如:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
TRAIN_STEPS = 20

# Prepare the training data: y = 2x plus Gaussian noise.
train_X = np.linspace(-1, 1, 100)
train_Y = 2 * train_X + np.random.randn(*train_X.shape) * 0.33

w = tf.Variable(initial_value=1.0)

optimizer = tf.keras.optimizers.SGD(0.1)
mse = tf.keras.losses.MeanSquaredError()

print("w:", w.numpy())

for i in range(TRAIN_STEPS):
    print("epoch:", i)
    with tf.GradientTape() as tape:
        logit = w * train_X
        loss = mse(train_Y, logit)
    # Intentionally wrong (reproduces the TypeError): `sources` is the bare
    # variable, so `gradients` is a single scalar tensor rather than a list
    # and zip cannot iterate over it.
    gradients = tape.gradient(target=loss, sources=w)
    optimizer.apply_gradients(zip(gradients, [w]))
    print("w:", w.numpy())
这是因为在计算梯度时,w 没有用中括号括起来,这样一来得到的 gradients 就是一个标量而不是一个列表,所以应改为:
gradients=tape.gradient(target=loss,sources=[w])
InvalidArgumentError: var and grad do not have the same shape[2,2] [2] [Op:ResourceApplyAdam]
在执行下面代码时,会报错。
# NOTE: `opt` is an optimizer created outside this snippet.
x = tf.Variable(tf.ones((2, 2)))
with tf.GradientTape() as t:
    z = x**2
# Derivative of z with respect to the original input tensor x
dz_dx = t.gradient(z, x)
# Intentionally wrong (reproduces the InvalidArgumentError): zip iterates over
# the rows of `dz_dx`, so a row of shape (2,) is paired with x of shape (2, 2).
opt.apply_gradients(zip(dz_dx, [x]))
InvalidArgumentError: var and grad do not have the same shape[2,2] [2] [Op:ResourceApplyAdam]
它的意思是说变量 x 和梯度 dz_dx 的形状不同。
让我们打印一下 zip(dz_dx, [x]):
x = tf.Variable(tf.ones((2, 2)))
with tf.GradientTape() as t:
    z = x**2
# Derivative of z with respect to the original input tensor x
dz_dx = t.gradient(z, x)
# Show what zip actually pairs together when the gradient is passed bare.
for i, j in zip(dz_dx, [x]):
    print('i: ', i, '\n')
    print('j: ', j, '\n')
i: tf.Tensor([2. 2.], shape=(2,), dtype=float32)
j: <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[1., 1.],
[1., 1.]], dtype=float32)>
发现只取到了梯度中的第一行:zip 会沿第 0 维遍历 dz_dx,所以与 x 配对的是形状为 (2,) 的第一行,而不是形状为 (2, 2) 的整个梯度。
如果在 dz_dx 处也加上中括号,就会得到:
x = tf.Variable(tf.ones((2, 2)))
with tf.GradientTape() as t:
    z = x**2
# Derivative of z with respect to the original input tensor x
dz_dx = t.gradient(z, x)
# Wrapping the gradient in a list makes zip pair the whole tensor with x.
for i, j in zip([dz_dx], [x]):
    print('i: ', i, '\n')
    print('j: ', j, '\n')
i: tf.Tensor(
[[2. 2.]
[2. 2.]], shape=(2, 2), dtype=float32)
j: <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[1., 1.],
[1., 1.]], dtype=float32)>
这样一来变量和梯度的形状就一样了。
所以可以将代码改为:
# NOTE: `opt` is an optimizer created outside this snippet.
x = tf.Variable(tf.ones((2, 2)))
with tf.GradientTape() as t:
    z = x**2
# Derivative of z with respect to the original input tensor x
dz_dx = t.gradient(z, x)
# Both arguments are lists, so the full (2, 2) gradient is paired with x.
opt.apply_gradients(zip([dz_dx], [x]))
print('x: ', x, '\n')
因为这里只有一个变量,所以其实不用 zip() 函数也可以,即将代码改为:
# NOTE: `opt` is an optimizer created outside this snippet.
x = tf.Variable(tf.ones((2, 2)))
with tf.GradientTape() as t:
    z = x**2
# Derivative of z with respect to the original input tensor x
dz_dx = t.gradient(z, x)
# A single (gradient, variable) pair can be passed directly as a one-item list.
opt.apply_gradients([(dz_dx, x)])
print('x: ', x, '\n')
当然了,如果只用单纯的梯度下降法,直接用 x.assign_sub(dz_dx*0.01) 代替 opt.apply_gradients([(dz_dx, x)]) 也是可以的(注意不要写成 x = x - dz_dx*0.01,那样 x 会被重新绑定为一个普通张量,而不再是变量),但如果想用动量法或者 Adam 啥的……就需要单独编函数来完成了。
AttributeError: ‘tensorflow.python.framework.ops.EagerTensor’ object has no attribute ‘_in_graph_mode’
在Tensorflow2.0之DeepDream(深度梦境)一文中,当定义梯度更新函数并调用时会出现这个错误:
def train_step(img, tile_size=512):
    """Accumulate tiled DeepDream gradients for `img` and apply them.

    This is the version that reproduces the AttributeError discussed in the
    surrounding text: `img` is passed to `opt.apply_gradients` without being
    a `tf.Variable`.

    Args:
        img: the image to optimize.
        tile_size: side length of the square tiles the image is split into.

    Returns:
        The normalized gradients, with the random roll undone.
    """
    shift_down, shift_right, img_rolled = random_roll(img, tile_size)

    # Initialize the image gradients to zero.
    gradients = tf.zeros_like(img_rolled)

    # Skip the last tile, unless there's only one tile.
    xs = tf.range(0, img_rolled.shape[0], tile_size)[:-1]
    if not tf.cast(len(xs), bool):
        xs = tf.constant([0])
    ys = tf.range(0, img_rolled.shape[1], tile_size)[:-1]
    if not tf.cast(len(ys), bool):
        ys = tf.constant([0])

    for x in xs:
        for y in ys:
            # Calculate the gradients for this tile.
            with tf.GradientTape() as tape:
                # This needs gradients relative to `img_rolled`.
                # `GradientTape` only watches `tf.Variable`s by default.
                tape.watch(img_rolled)

                # Extract a tile out of the image.
                img_tile = img_rolled[x:x+tile_size, y:y+tile_size]
                loss = calc_loss(img_tile, dream_model)

            # Update the image gradients for this tile.
            gradients = gradients + tape.gradient(loss, img_rolled)

    # Undo the random shift applied to the image and its gradients.
    gradients = tf.roll(tf.roll(gradients, -shift_right, axis=1), -shift_down, axis=0)

    # Normalize the gradients.
    gradients /= tf.math.reduce_std(gradients) + 1e-8

    # Intentionally left as-is: this call fails with the AttributeError when
    # `img` is a plain tensor; it must be a tf.Variable.
    opt.apply_gradients([(-gradients, img)])
    # NOTE(review): this rebinds only the local name `img`; the clipped value
    # is neither returned nor written back -- confirm whether that is intended.
    img = tf.clip_by_value(img, -1, 1)
    return gradients
原因是没有在函数中将 img 设置为变量,所以要在函数开头就定义 img = tf.Variable(img)。