TensorFlow-变量保存和恢复

最新推荐文章于 2023-06-24 23:10:19 发布

coderpai

最新推荐文章于 2023-06-24 23:10:19 发布

阅读量864

点赞数

分类专栏： Tensorflow 文章标签： TensorFlow

本文链接：https://blog.csdn.net/CoderPai/article/details/80302371

版权

Tensorflow 专栏收录该内容

48 篇文章 3 订阅

订阅专栏

作者：chen_h
微信号 & QQ：862251340
微信公众号：coderpai

变量（Variables）

# Rank 0 tensors (scalars)
# NOTE: the second positional parameter of tf.Variable is `trainable`, not the
# dtype, so the dtype must be passed by keyword for it to take effect.
fruit = tf.Variable("Orange", dtype=tf.string)
quantity = tf.Variable(2, dtype=tf.int16)
price = tf.Variable(3.23, dtype=tf.float32)

# Rank 1 tensors
strings = tf.Variable(["Fruit", "orange"], dtype=tf.string)
prices = tf.Variable([3.23, 4.02], dtype=tf.float64)

# Rank 2 tensor
answers = tf.Variable([[False, True], [False, False]], dtype=tf.bool)

当你训练一个模型的时候，我们需要使用变量去存储训练的参数，比如权重和偏差项，超参数，比如学习率，步数等等信息。

但是，定义变量最好的方式是使用 tf.get_variable() 函数。当我们设计的网络非常深的时候，这个 API 允许我们重复使用一些变量。

import tensorflow as tf
import numpy as np

v1 = tf.get_variable("v1", [5, 5, 3])   # A tensor with shape (5, 5, 3) filled with random values
v2 = tf.get_variable("v2", initializer=tf.constant(2))    # 2, int32 scalar (dtype follows the constant)
v3 = tf.get_variable("v3", initializer=tf.constant([[2, 3], [4, 5]]))  # [[2, 3], [4, 5]]

# Each call needs its own name: asking tf.get_variable for an existing name
# without reuse raises a ValueError.
v4 = tf.get_variable("v4", [3, 2], initializer=tf.zeros_initializer)
v5 = tf.get_variable("v5", [3, 2], initializer=tf.ones_initializer)

# [[ 1.  2.], [ 3.  4.], [ 5.  6.]]
v6 = tf.get_variable("v6", [3, 2], initializer=tf.constant_initializer([1, 2, 3, 4, 5, 6]))

W = tf.get_variable("W", [784, 256], initializer=tf.truncated_normal_initializer(stddev=np.sqrt(2.0 / 784)))
Z = tf.get_variable("z", [4, 5], initializer=tf.random_uniform_initializer(-1, 1))

以下程序，我们定义了这些东西：

定义变量并且初始化；
定义一个操作 op 去更新这些变量；
显式地初始化变量；
对变量进行检索操作；

import tensorflow as tf
import numpy as np 

### Using variables
# Define the variables together with their initializers
weights = tf.get_variable("W", [784, 256], initializer=tf.truncated_normal_initializer(stddev=np.sqrt(2.0 / 784)))
biases = tf.get_variable("z", [256], initializer=tf.zeros_initializer) 

# Scalar counter whose initial value is the constant 0
counter = tf.get_variable("counter", initializer=tf.constant(0)) 

# Add an Op that assigns counter + 1 back to counter.
# NOTE: the op is only built here; this snippet never passes it to sess.run.
increment = tf.assign(counter , counter + 1)

# Op that initializes the global variables defined above
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    # Execute the init_op to initialize all variables
    sess.run(init_op)

    # Retrieve the value of a variable
    b = sess.run(biases)
    print(b)

保存一个检查点（checkpoint）

在训练期间，我们的变量可以保存到磁盘。然后这些变量可以被重新加载到模型进行训练，也可以被作为一个接口进行使用。

import tensorflow as tf
import numpy as np 

# Create some variables 
v1 = tf.get_variable("v1", shape=[3], initializer = tf.zeros_initializer)
v2 = tf.get_variable("v2", shape=[5], initializer = tf.zeros_initializer)

# Create the ops that add 1 to v1 and subtract 1 from v2
inc_v1 = v1.assign(v1+1)
dec_v2 = v2.assign(v2-1)
init_op = tf.global_variables_initializer()

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

with tf.Session() as sess:
  sess.run(init_op)
  # Run each update op once so the saved values differ from the initial zeros
  inc_v1.op.run()
  dec_v2.op.run()

  # Save the variables to disk.
  save_path = saver.save(sess, "/tmp/model.ckpt")

恢复一个检查点（checkpoint）

import tensorflow as tf
import numpy as np 

# Create some variables. 
# We do not need to provide initializer or init_op if it is restored from a checkpoint.
# The names ("v1", "v2") and shapes ([3], [5]) are the same as in the saving snippet.
v1 = tf.get_variable("v1", shape=[3])
v2 = tf.get_variable("v2", shape=[5])

saver = tf.train.Saver()

with tf.Session() as sess:
  # Restore variables from disk.
  saver.restore(sess, "/tmp/model.ckpt")

  # Check the values of the variables
  print("v1 : %s" % v1.eval())
  print("v2 : %s" % v2.eval())

如果你只想保存模型的其中一些变量，那么你可以如下操作：

import tensorflow as tf
import numpy as np 


v1 = tf.get_variable("v1", [3], initializer = tf.zeros_initializer)
v2 = tf.get_variable("v2", [5], initializer = tf.zeros_initializer)

# Build a saver that handles only v2, mapped to the checkpoint key "v2"
saver = tf.train.Saver({"v2": v2})

with tf.Session() as sess:
  # Initialize v1 since the saver will not.
  v1.initializer.run()
  # Restore v2 from the checkpoint file
  saver.restore(sess, "/tmp/model.ckpt")

定期加载模型并保存检查点

这个示例代码在开始时先加载模型（如果存在检查点），并且在训练期间定期地保存模型。

import tensorflow as tf
import os

def loadmodel(session, saver, checkpoint_dir):
    """Initialize all global variables, then restore a checkpoint if one is recorded.

    Args:
        session: session used to run the initializer and the restore.
        saver: object whose ``restore`` method loads the checkpoint.
        checkpoint_dir: directory that is queried for checkpoint state.

    Returns:
        True when a checkpoint path was found and restored, False otherwise.
    """
    session.run(tf.global_variables_initializer())
    state = tf.train.get_checkpoint_state(checkpoint_dir)
    if not (state and state.model_checkpoint_path):
        return False

    # Keep only the file name and re-anchor it under checkpoint_dir.
    checkpoint_file = os.path.basename(state.model_checkpoint_path)
    saver.restore(session, os.path.join(checkpoint_dir, checkpoint_file))
    return True


def save(session, saver, checkpoint_dir, step):
    """Save a checkpoint under the path prefix ``<checkpoint_dir>/model``.

    Args:
        session: session passed through to ``saver.save``.
        saver: object whose ``save`` method writes the checkpoint.
        checkpoint_dir: directory that receives the checkpoint files.
        step: value forwarded to ``saver.save`` as ``global_step``.
    """
    saver.save(session, os.path.join(checkpoint_dir, "model"), global_step=step)


with tf.Session() as session:
    saver = tf.train.Saver()
    ...  # placeholder: code elided by the article
    # Initialize the variables and restore from ./checkpoint when a checkpoint exists
    loadmodel(session, saver, "./checkpoint")
    ...
    for i in range(10000):
        ...  # placeholder: presumably one training step
        # Write a checkpoint every 1000 iterations (including i == 0)
        if (i % 1000 == 0):
           save(session, saver, "./checkpoint", i)

可训练与不可训练参数

在迁移学习中，我们可能会从检查点加载模型，但模型的有些部分可能不用训练，所以我们可以通过设置 trainable=False 来进行控制。

# Variable excluded from training via trainable=False.
# '!' is not a legal character in a TensorFlow variable/op name, so the
# variable is named 'CNN_W1'.
freezed_W = tf.get_variable('CNN_W1', [5, 5, 3, 32], trainable=False,
                            initializer=tf.truncated_normal_initializer(stddev=0.02))
...
loadmodel(session, saver, "./checkpoint")

在一些问题中，我们可能需要同时训练几个多层的深度网络。针对不同的可训练参数，我们会使用不同的优化器和不同的损失函数。

import tensorflow as tf

def scope_variables(name):
    """Return the trainable variables created under the variable scope ``name``.

    The result is meant to be used as an optimizer ``var_list``, so it is taken
    from the TRAINABLE_VARIABLES collection: variables created with
    ``trainable=False`` are not in that collection and must not be updated.

    Args:
        name: variable scope name, resolved relative to the current scope.

    Returns:
        List of the trainable variables whose names match the scope prefix.
    """
    # Entering the scope yields its fully-qualified name, which is then used
    # as the filter prefix for the collection lookup.
    with tf.variable_scope(name):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=tf.get_variable_scope().name)

# Model parameters for the discriminator network
with tf.variable_scope("discriminator"):
   v1 = tf.get_variable("v1", [3], initializer=tf.zeros_initializer)
   ...  # placeholder: remaining discriminator variables elided by the article

# Model parameters for the generator network
with tf.variable_scope("generator"):
   v2 = tf.get_variable("v2", [2], initializer=tf.zeros_initializer)
   ...  # placeholder: remaining generator variables elided by the article

# Collect the variables registered under the "discriminator" scope
discriminator_variables = scope_variables("discriminator")

# Collect the variables registered under the "generator" scope
generator_variables = scope_variables("generator")

# 2 optimizers each for different networks
# NOTE: discriminator_optimizer, generator_optimizer, d_loss and g_loss are not
# defined in this snippet; they are assumed to be created elsewhere.
# Each minimize call is restricted to its own network's variables via var_list.
train_discriminator = discriminator_optimizer.minimize(d_loss, 
                              var_list=discriminator_variables)
train_generator = generator_optimizer.minimize(g_loss, 
                              var_list=generator_variables)

作用域（scoping）

我们可以使用作用域来创建两个不同的网络层，使得他们有各自不同的参数。比如，CNN1 和 CNN2 有自己的权重 w 和偏差项 b。

import tensorflow as tf

def conv2d(input, output_dim, filter_h=5, filter_w=5, stride_h=2, stride_w=2, stddev=0.02):
    """Apply a 2-D convolution followed by a bias term.

    The variables 'w' and 'biases' are obtained with tf.get_variable, so they
    are created in (or looked up from) whatever variable scope is active at
    call time.

    Args:
        input: input tensor; its last dimension is used as the kernel's
            input-channel count.
        output_dim: number of output channels (also the bias length).
        filter_h, filter_w: kernel height and width.
        stride_h, stride_w: strides placed in positions 1 and 2 of the
            4-element strides list.
        stddev: standard deviation of the truncated-normal kernel initializer.

    Returns:
        The convolution output with the bias added, reshaped to the static
        shape of the convolution result.
    """
    in_channels = input.get_shape()[-1]
    kernel_shape = [filter_h, filter_w, in_channels, output_dim]
    w = tf.get_variable('w', kernel_shape,
                        initializer=tf.truncated_normal_initializer(stddev=stddev))

    strides = [1, stride_h, stride_w, 1]
    conv = tf.nn.conv2d(input, w, strides=strides, padding='SAME')

    biases = tf.get_variable('biases', [output_dim], initializer=tf.constant_initializer(0.0))
    with_bias = tf.nn.bias_add(conv, biases)
    return tf.reshape(with_bias, conv.get_shape())

input1 = tf.random_normal([1,10,10,32])
input2 = tf.random_normal([1,20,20,32])

# Each scope gets its own 'w' and 'biases': "conv1/w" vs "conv2/w", etc.
with tf.variable_scope("conv1"):
    cnn1 = conv2d(input1, 16)

# Bind the second layer to its own name so it does not overwrite cnn1.
with tf.variable_scope("conv2"):
    cnn2 = conv2d(input2, 16)

变量共享

在研究变量共享之前，我们首先描述 tf.Variable 是如何工作的。tf.Variable 总是会创建一个新的变量，即使给定了相同的名称。

# tf.Variable always creates a new variable, even when given the same name.
v1 = tf.Variable(10, name="name1")
v2 = tf.Variable(10, name="name1")
# Two distinct Python objects ...
assert(v1 is not v2)
# ... and two distinct graph names: the second one gets a numeric suffix.
print(v1.name)  # name1:0
print(v2.name)  # name1_1:0

如果名称为 name1 已经存在，那么 TensorFlow 会在命名后面添加上 _1，_2 等，用来保证命名的唯一性。

因此，当我们调用下面的 affine 方法，我们创建了 2 组不同的权重 w 和偏差项 b，也就是说每个 affine 都有他们自己的权重 w 和偏差项 b。

def affine(x, shape):
    """Build ``relu(x @ W + b)`` with freshly created variables.

    W and b are created with tf.Variable, so every call creates a new pair.

    Args:
        x: input tensor multiplied by W.
        shape: shape of W; ``shape[1]`` is also the length of b.

    Returns:
        The ReLU-activated affine output tensor.
    """
    weights = tf.Variable(tf.truncated_normal(shape))
    bias = tf.Variable(tf.zeros([shape[1]]))
    return tf.nn.relu(tf.matmul(x, weights) + bias)

x = tf.placeholder(tf.float32, [None, 784])
# Both blocks use the scope name "n1", yet affine builds its parameters with
# tf.Variable, so each call still ends up with its own W and b.
with tf.variable_scope("n1"):
    n1 = affine(x, [784, 500])

with tf.variable_scope("n1"):
    n2 = affine(x, [784, 500])

有时候，在一个复杂的网络中，我们想要共享一个图层或者参数，那么我们如何修改一下刚刚的 affine 函数，使得能共享相同的权重 w 和偏差项 b。

def affine_reuseable(x, shape):
    """Build ``relu(x @ W + b)`` with variables that can be shared.

    "W" and "b" are obtained with tf.get_variable, so whether they are created
    or reused is decided by the variable scope active at call time.

    Args:
        x: input tensor multiplied by W.
        shape: shape of W; ``shape[1]`` is also the length of b.

    Returns:
        The ReLU-activated affine output tensor.
    """
    weight_init = tf.random_normal_initializer()
    W = tf.get_variable("W", shape, initializer=weight_init)

    bias_init = tf.constant_initializer(0.0)
    b = tf.get_variable("b", [shape[1]], initializer=bias_init)

    return tf.nn.relu(tf.matmul(x, W) + b)

x = tf.placeholder(tf.float32, [None, 784])
# First call: the scope "n2" is entered without reuse, so "W" and "b" are requested fresh.
with tf.variable_scope("n2"):
    nn1 = affine_reuseable(x, [784, 500])

# Second call: same scope name with reuse=True, so tf.get_variable looks up the existing "W" and "b".
with tf.variable_scope("n2", reuse=True):
    nn2 = affine_reuseable(x, [784, 500])

如果一个变量给定的 scope/name 存在，则 tf.get_variable 将会返回现在存在的变量，而不会去重新创建一个。

# Excerpt from affine_reuseable; `shape` is that function's parameter.
W = tf.get_variable("W", shape, initializer=tf.random_normal_initializer())

因此，在第二次调用 affine_reuseable 函数的时候，tf.get_variable 返回的是已经存在的权重 w 和偏差项 b。

# Excerpt: the second affine_reuseable call, made with reuse=True on scope "n2".
with tf.variable_scope("n2", reuse=True):
    nn2 = affine_reuseable(x, [784, 500])

重用（Reuse）

但是，TensorFlow 希望开发人员自己清楚哪些变量已经被创建了。在调用 tf.get_variable 之前，开发人员需要明确知道是否应该设置 reuse 这个标志。在调用 tf.get_variable 时，下面的这两种情况会引发异常：

如果标志 reuse 设置为 False 或者 None（默认），且变量已经存在了。
如果标志 reuse 设置为 True，但是变量不存在。

如下程序：

# Case 1: reuse is not set, but the variable already exists.
with tf.variable_scope("foo"):
    v = tf.get_variable("v", [1])
    v1 = tf.get_variable("v")
    # Raises ValueError("... v already exists ...").

# Case 2: reuse=True, but the variable was never created.
# A name that has not been created is used here; "foo/v" exists after case 1,
# so requesting it with reuse=True would not raise.
with tf.variable_scope("foo", reuse=True):
    v = tf.get_variable("u", [1])
    # Raises ValueError("... u does not exist ...").

对于 reuse 标志我们可能需要按照如下使用：

with tf.variable_scope("foo"):
    v = tf.get_variable("v2", [1]) # Create a new variable.

with tf.variable_scope("foo", reuse=True):
    v1 = tf.get_variable("v2")  # reuse/share the variable "foo/v2".
# Sharing means both names are bound to the very same object, so compare by
# identity; `==` may be overloaded to build a tensor instead of a bool.
assert v1 is v

with tf.variable_scope("foo") as scope:
    v = tf.get_variable("v3", [1])
    # Switch the already-open scope to reuse mode for the following lookups.
    scope.reuse_variables()
    v1 = tf.get_variable("v3")
assert v1 is v

我们可以直接重用作用域对象，而不必再次写出作用域的名称：

# NOTE: my_image_filter, input1 and input2 are not defined in this snippet;
# they are assumed to be provided elsewhere.
with tf.variable_scope("model") as scope:
  output1 = my_image_filter(input1)
# Re-enter the captured scope object in reuse mode for the second call.
with tf.variable_scope(scope, reuse=True):  # Can use scope instead of "model"
  output2 = my_image_filter(input2)

嵌套范围

# Nested variable scopes are joined with "/" to form the variable's name prefix.
with tf.variable_scope("foo"):
    with tf.variable_scope("bar"):
        v = tf.get_variable("v", [1])
        assert v.name == "foo/bar/v:0"

变量共享的注意事项

很多的开发者都熟悉使用 tf.name_scope 和 tf.Variable 方法。但是，这些 API 不适合于共享变量。例如，下面的 tf.get_variable 不会使用由 tf.name_scope 创建的命名前缀。

# Same pair of creation calls under a name_scope ...
with tf.name_scope("foo1"):
    v1 = tf.get_variable("v", [1])
    v2 = tf.Variable(1, name="v2")

# ... and under a variable_scope, to compare the resulting names.
with tf.variable_scope("foo2"):
    v3 = tf.get_variable("v", [1])
    v4 = tf.Variable(1, name="v2")

# Only the tf.get_variable call inside the name_scope loses its prefix.
print(v1.name)  # v:0 (Unexpected!)
print(v2.name)  # foo1/v2:0
print(v3.name)  # foo2/v:0  
print(v4.name)  # foo2/v2:0

为了避免这些问题，我们最好采用如下方法：

不要对共享变量使用 tf.name_scope 和 tf.Variable；
总是使用 tf.variable_scope 来定义共享变量的范围；
使用 tf.get_variable 来创建和检索共享变量；

# Recommended pattern: create the variable inside a variable_scope ...
with tf.variable_scope("foo"):
    v = tf.get_variable("v2", [1])    # Create a new variable

# ... and fetch it again by re-entering the same scope with reuse=True.
with tf.variable_scope("foo", reuse=True):
    v1 = tf.get_variable("v2")        # Reuse a variable created before.

分配（Assignment）

# Scalar variable initialized to zero
v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())

v1 = v.assign_add(1)  # 1.0
# NOTE: the op returned by this assign is discarded and never passed to
# session.run, so it is not executed by the code below.
v.assign(v1)          # 1.0

with tf.Session() as session:
    tf.global_variables_initializer().run()
    # NOTE(review): v and v1 are fetched in the same run call; confirm that the
    # read of v is ordered after the assign_add if the printed values matter.
    value, value1 = session.run([v, v1])
    print(value, value1)

# 1.0 1.0