《TensorFlow实战Google深度学习框架》 Chapter 10: Accelerating TensorFlow Computation


1. Basic GPU Operations

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TensorFlow log noise
import tensorflow as tf

tf.reset_default_graph()

a = tf.constant([1.0, 2.0, 3.0], shape=[3], name='a')
b = tf.constant([1.0, 2.0, 3.0], shape=[3], name='b')
c = a + b

# log_device_placement records the device on which each operation is run.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))

"""
On a machine without a GPU this outputs:
Device mapping: no known devices.
add: (Add): /job:localhost/replica:0/task:0/device:CPU:0
a: (Const): /job:localhost/replica:0/task:0/device:CPU:0
b: (Const): /job:localhost/replica:0/task:0/device:CPU:0
[2. 4. 6.]
[Finished in 3.0s]
"""

[2. 4. 6.]

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import tensorflow as tf

# Use tf.device to pin operations to a specific device.
with tf.device('/cpu:0'):
    a = tf.constant([1.0, 2.0, 3.0], shape=[3], name='a')
    b = tf.constant([1.0, 2.0, 3.0], shape=[3], name='b')
with tf.device('/gpu:0'):  # or '/gpu:1' on a multi-GPU machine
    c = a + b

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
print(sess.run(c))

[2. 4. 6.]
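
To check which devices TensorFlow can actually see on the current machine, TF 1.x ships a small utility in tensorflow.python.client; a minimal sketch (not part of the book's example):

# List the devices TensorFlow has detected on this machine.
from tensorflow.python.client import device_lib

for device in device_lib.list_local_devices():
    # Each entry describes one device, e.g. /device:CPU:0 or /device:GPU:0.
    print(device.name, device.device_type, device.memory_limit)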

tf.reset_default_graph()

a_cpu = tf.Variable(0, name="a_cpu")
with tf.device('/gpu:0'):
    a_gpu = tf.Variable(0, name="a_gpu")

# With allow_soft_placement set to True, an operation that cannot run on the GPU is
# automatically placed on the CPU instead (a_gpu here is an int32 variable, which has
# no GPU kernel, so it falls back to the CPU).
# log_device_placement records the device on which each operation is run.
graph_path = r"D:\task1\tensorflow_google\chapter10\logs_and_models/show01"
summary_writer = tf.summary.FileWriter(graph_path, tf.get_default_graph())
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
sess.run(tf.global_variables_initializer())
summary_writer.close()
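
Besides device placement, tf.ConfigProto also exposes GPU memory options. A minimal sketch (assuming a TF 1.x environment; not taken from the text above) that stops TensorFlow from reserving all GPU memory up front:

import tensorflow as tf

config = tf.ConfigProto(allow_soft_placement=True)
# Grow GPU memory usage on demand instead of grabbing all of it at startup.
config.gpu_options.allow_growth = True
# Alternatively, cap the fraction of GPU memory this process may use:
# config.gpu_options.per_process_gpu_memory_fraction = 0.4
sess = tf.Session(config=config)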


2. Multi-GPU Parallelism

Source file: "D:\task1\tensorflow_google\chapter10\mnist_multi_gpu_train.py"

# Multi-GPU parallel training
# https://blog.csdn.net/s_sunnyy/article/details/70999462

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from datetime import datetime
import os
import time

import tensorflow as tf
import mnist_inference

# Parameters used when training the network.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.001
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 1000
MOVING_AVERAGE_DECAY = 0.99
N_GPU = 4

# Paths for log and model output.
LOG_SAVE_PATH = "logs/show02"
MODEL_SAVE_PATH = "models/"
MODEL_NAME = "model.ckpt"
DATA_PATH = "tfrecords/output"


# Build the input queue that provides the training data; see Chapter 7 for details.
def get_input():
    filename_queue = tf.train.string_input_producer([DATA_PATH])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Parsing schema for the serialized TFRecord examples.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    # Decode the image and the label.
    decoded_image = tf.decode_raw(features['image_raw'], tf.uint8)
    reshaped_image = tf.reshape(decoded_image, [784])
    retyped_image = tf.cast(reshaped_image, tf.float32)
    label = tf.cast(features['label'], tf.int32)

    # Build the shuffled batching queue and return it.
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * BATCH_SIZE
    return tf.train.shuffle_batch(
        [retyped_image, label],
        batch_size=BATCH_SIZE,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)

# Define the loss function.
def get_loss(x, y_, regularizer, scope, reuse_variables=None):
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        y = mnist_inference.inference(x, regularizer)
    cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=y_))
    regularization_loss = tf.add_n(tf.get_collection('losses', scope))
    loss = cross_entropy + regularization_loss
    return loss

# Average each variable's gradients across all GPUs.
def average_gradients(tower_grads):
    average_grads = []

    # Iterate over all variables and the gradients computed for them on different GPUs.
    for grad_and_vars in zip(*tower_grads):
        # Average this variable's gradients over all GPUs.
        grads = []
        for g, _ in grad_and_vars:
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        # Pair the variable with its averaged gradient.
        average_grads.append(grad_and_var)
    # Return the averaged gradients of all variables; they will be used to update the variables.
    return average_grads

# Main training procedure.
def main(argv=None):
    # Keep simple operations on the CPU; only the network training runs on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Basic training setup.
        x, y_ = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)

        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)

        opt = tf.train.GradientDescentOptimizer(learning_rate)

        tower_grads = []
        reuse_variables = False
        # Run the optimization process on the different GPUs.
        for i in range(N_GPU):
            # Pin this tower's computation to one GPU.
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                    reuse_variables = True
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)

        # Average the gradients of the variables.
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram('gradients_on_average/%s' % var.op.name, grad)

        # Update the parameters with the averaged gradients.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Maintain moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)
        # Each training step updates the variables and refreshes their moving averages.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=True)) as sess:
            # Initialize all variables and start the input queue runners.
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(LOG_SAVE_PATH, sess.graph)

            for step in range(TRAINING_STEPS):
                # Run one training step and record how long it takes.
                start_time = time.time()
                _, loss_value = sess.run([train_op, cur_loss])
                duration = time.time() - start_time

                # Periodically report training progress and measure the training speed.
                if step != 0 and step % 10 == 0:
                    # Number of training examples processed in this step.
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / N_GPU

                    # Print the training information.
                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

                    # Visualize the training process with TensorBoard.
                    summary = sess.run(summary_op)
                    summary_writer.add_summary(summary, step)

                # Periodically save the current model.
                if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                    checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                    saver.save(sess, checkpoint_path, global_step=step)

            coord.request_stop()
            coord.join(threads)

if __name__ == '__main__':
    tf.app.run()

2019-06-12 09:27:59.312603: step 940, loss = 3.64 (39999.1 examples/sec; 0.003 sec/batch)
2019-06-12 09:27:59.487603: step 950, loss = 2.05 (26666.9 examples/sec; 0.004 sec/batch)
2019-06-12 09:27:59.649603: step 960, loss = 1.52 (39999.1 examples/sec; 0.003 sec/batch)
2019-06-12 09:27:59.804603: step 970, loss = 2.06 (44444.1 examples/sec; 0.002 sec/batch)
2019-06-12 09:27:59.972603: step 980, loss = 1.54 (40000.0 examples/sec; 0.002 sec/batch)
2019-06-12 09:28:00.130603: step 990, loss = 1.15 (49999.2 examples/sec; 0.002 sec/batch)
Process finished with exit code 0
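
A quick sanity check of average_gradients (a hypothetical snippet, not in the original script; it assumes the function defined above is available in the same file):

import tensorflow as tf

v = tf.Variable(1.0, name="v")
# Pretend two GPU towers produced gradients 2.0 and 4.0 for the same variable.
tower_grads = [[(tf.constant(2.0), v)],
               [(tf.constant(4.0), v)]]
averaged = average_gradients(tower_grads)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(averaged[0][0]))  # expected: 3.0, the mean of the two tower gradients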


3. Distributed TensorFlow

3.1 Creating a Local Cluster

import tensorflow as tf

c = tf.constant("Hello, distributed TensorFlow!")
# Create a local TensorFlow cluster
server = tf.train.Server.create_local_server()
# Create a session on this cluster
sess = tf.Session(server.target)
print(sess.run(c))

b'Hello, distributed TensorFlow!'
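
The in-process server exposes a grpc:// address through server.target; a small follow-up check (not in the original) that another local session can connect to the same server:

# Continues the snippet above, where server and c are already defined.
print(server.target)   # e.g. grpc://localhost:<port chosen by the OS>
sess2 = tf.Session(server.target)
print(sess2.run(c))    # evaluates the same constant through the new session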

3.2 Running a TensorFlow Cluster with Two Tasks Locally

  • Run inside the PyCharm TensorFlow virtual environment.
  • If "Could not start gRPC server" appears, the port is probably already in use; switch to a different port (see the helper sketch below).
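
A hypothetical helper (not from the book) for finding a free local port when the gRPC server cannot bind:

import socket

def find_free_port():
    # Binding to port 0 asks the OS for any currently unused port.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(("localhost", 0))
    port = s.getsockname()[1]
    s.close()
    return port

print(find_free_port())  # use the returned port in the ClusterSpec below
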
# Code for the first task
import tensorflow as tf
c = tf.constant("Hello from server1!")
# Create a cluster with two tasks: one on local port 2200, the other on local port 2201.
cluster = tf.train.ClusterSpec({"local": ["localhost:2200", "localhost:2201"]})
server = tf.train.Server(cluster, job_name="local", task_index=0)
sess = tf.Session(server.target, config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))
server.join()
# Keeps printing "CreateSession still waiting for response from worker: /job:local/replica:0/task:1"
# until the second task starts, then prints b'Hello from server1!'

2019-06-25 15:55:09.574812: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-06-25 15:55:09.574812: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job local -> {0 -> localhost:2200, 1 -> localhost:2201}
2019-06-25 15:55:09.574812: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Started server with target: grpc://localhost:2200
2019-06-25 15:55:19.590412: I tensorflow/core/distributed_runtime/master.cc:267] CreateSession still waiting for response from worker: /job:local/replica:0/task:1
2019-06-25 15:55:29.595812: I tensorflow/core/distributed_runtime/master.cc:267] CreateSession still waiting for response from worker: /job:local/replica:0/task:1
Const: (Const): /job:local/replica:0/task:0/device:CPU:0
b'Hello from server1!'
2019-06-25 15:55:45.416213: I tensorflow/core/distributed_runtime/master_session.cc:1192] Start master session f775e55f5a886db9 with config: log_device_placement: true
2019-06-25 15:55:45.416213: I tensorflow/core/common_runtime/placer.cc:1059] Const: (Const)/job:local/replica:0/task:0/device:CPU:0

# Code for the second task
import tensorflow as tf
c = tf.constant("Hello from server2!")
cluster = tf.train.ClusterSpec({"local": ["localhost:2200", "localhost:2201"]})
server = tf.train.Server(cluster, job_name="local", task_index=1)
sess = tf.Session(server.target, config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))
server.join()

2019-06-25 15:55:45.369413: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-06-25 15:55:45.369413: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job local -> {0 -> localhost:2200, 1 -> localhost:2201}
2019-06-25 15:55:45.369413: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Started server with target: grpc://localhost:2201
2019-06-25 15:55:45.385013: I tensorflow/core/distributed_runtime/master_session.cc:1192] Start master session e5a4851a517699de with config: log_device_placement: true
Const: (Const): /job:local/replica:0/task:0/device:CPU:0
b'Hello from server2!'
2019-06-25 15:55:45.400613: I tensorflow/core/common_runtime/placer.cc:1059] Const: (Const)/job:local/replica:0/task:0/device:CPU:0

tf.device specifies which task an operation runs on. Modify the second task as follows:

# Code for the second task
import tensorflow as tf

with tf.device("/job:local/task:1"):
    c = tf.constant("Hello from server2!")
cluster = tf.train.ClusterSpec({"local": ["localhost:2200", "localhost:2201"]})
server = tf.train.Server(cluster, job_name="local", task_index=1)
sess = tf.Session(server.target, config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))
server.join()

2019-06-25 16:00:56.925414: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-06-25 16:00:56.925414: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job local -> {0 -> localhost:2200, 1 -> localhost:2201}
2019-06-25 16:00:56.925414: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Started server with target: grpc://localhost:2201
Const: (Const): /job:local/replica:0/task:1/device:CPU:0
b'Hello from server2!'
2019-06-25 16:00:56.956614: I tensorflow/core/distributed_runtime/master_session.cc:1192] Start master session e1c1de10fadc4153 with config: log_device_placement: true
2019-06-25 16:00:56.956614: I tensorflow/core/common_runtime/placer.cc:1059] Const: (Const)/job:local/replica:0/task:1/device:CPU:0
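
A hypothetical variation (not in the original) that pins the two constants to different tasks and lets one session combine them. Run it in place of the first task's program, with the second task's server (port 2201) already started:

import tensorflow as tf

cluster = tf.train.ClusterSpec({"local": ["localhost:2200", "localhost:2201"]})
server = tf.train.Server(cluster, job_name="local", task_index=0)

with tf.device("/job:local/task:0"):
    a = tf.constant(1.0)
with tf.device("/job:local/task:1"):
    b = tf.constant(2.0)
c = a + b  # TensorFlow decides where to place the add

with tf.Session(server.target,
                config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))  # 3.0, computed across the two tasks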

3.3 Distributed TensorFlow Model Training

3.3.1 Distributed Training in Asynchronous Mode

  • One parameter server and two workers
# 《TensorFlow实战Google深度学习框架》Chapter 10: accelerating TensorFlow computation
# Example program for the asynchronous update mode
import warnings
warnings.filterwarnings("ignore")
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import mnist_inference

# Network training parameters.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.01
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 5000
MOVING_AVERAGE_DECAY = 0.99

# Paths for log and model output.
LOG_SAVE_PATH = "logs/show03"
MODEL_SAVE_PATH = "models/model_asynchronous_distributed"
MODEL_NAME = "model.ckpt"
DATA_PATH = "./MNIST_data"
# DATA_PATH = "tfrecords/output"

# Command-line flags that configure the run.
FLAGS = tf.app.flags.FLAGS

port0 = 2200
port1 = 2201
# Whether this process is a parameter server or a worker.
tf.app.flags.DEFINE_string('job_name', 'worker', ' "ps" or "worker" ')
# Addresses of the parameter servers in the cluster.
tf.app.flags.DEFINE_string(
    'ps_hosts', 'tf-ps0:%s,tf-ps1:%s'%(port0, port1),
    'Comma-separated list of hostname:port for the parameter server jobs. e.g. "tf-ps0:%s,tf-ps1:%s" '%(port0, port1))
# Addresses of the workers in the cluster.
tf.app.flags.DEFINE_string(
    'worker_hosts', 'tf-worker0:%s,tf-worker1:%s'%(port0, port1),
    'Comma-separated list of hostname:port for the worker jobs. e.g. "tf-worker0:%s,tf-worker1:%s" '%(port0, port1))
# Task ID of the current process.
tf.app.flags.DEFINE_integer('task_id', 0, 'Task ID of the worker/replica running the training.')

# Build the TensorFlow graph and return the operations to run at each training step.
def build_model(x, y_, is_chief):
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    # Compute the forward pass with the mnist_inference.py code given in Section 5.5.
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Compute the loss and define the backward pass.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)

    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Define the operations to run at each training step.
    if is_chief:
        # Maintain moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        with tf.control_dependencies([variables_averages_op, train_op]):
            train_op = tf.no_op()
    return global_step, loss, train_op


def main(argv=None):
    # Parse the flags and configure the TensorFlow cluster with tf.train.ClusterSpec.
    ps_hosts = FLAGS.ps_hosts.split(',')
    print(ps_hosts)
    worker_hosts = FLAGS.worker_hosts.split(',')
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create a tf.train.Server from the ClusterSpec and the current task.
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    # A parameter server only manages the TensorFlow variables and does not run the
    # training itself; server.join() blocks on this statement indefinitely.
    if FLAGS.job_name == 'ps':
        with tf.device("/cpu:0"):
            server.join()

    # Define the operations that the workers run.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)

    device_setter = tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % FLAGS.task_id, cluster=cluster)
    with tf.device(device_setter):

        # Define the inputs and get the operations to run at each training step.
        x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')
        global_step, loss, train_op = build_model(x, y_, is_chief)

        # Saver used to write model checkpoints.
        saver = tf.train.Saver()
        # Summary op for logging.
        summary_op = tf.summary.merge_all()
        # Variable initialization op.
        init_op = tf.global_variables_initializer()
        # tf.train.Supervisor handles the common bookkeeping of training deep models.
        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir=MODEL_SAVE_PATH,
            init_op=init_op,
            summary_op=summary_op,
            saver=saver,
            global_step=global_step,
            save_model_secs=60,
            save_summaries_secs=60)

        sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Create the session through the Supervisor.
        sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        step = 0
        start_time = time.time()

        # Run the training iterations.
        while not sv.should_stop():
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, global_step_value = sess.run([train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            if global_step_value >= TRAINING_STEPS: break

            # Periodically print training information.
            if step > 0 and step % 100 == 0:
                duration = time.time() - start_time
                sec_per_batch = duration / global_step_value
                format_str = "After %d training steps (%d global steps), loss on training batch is %g.  (%.3f sec/batch)"
                print(format_str % (step, global_step_value, loss_value, sec_per_batch))
            step += 1
    sv.stop()

if __name__ == "__main__":
#     FLAGS.job_name = 'ps'
#     FLAGS.task_id = 0
#     FLAGS.ps_hosts = 'tf-ps0:2200'
#     FLAGS.worker_hosts = 'tf-worker0:2200,tf-worker1:2200'
    # python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='ps' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2200' --FLAGS.worker_hosts='tf-worker0:2200,tf-worker1:2200'
    tf.app.run()
  • Start the parameter server; run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='ps' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2200' --FLAGS.worker_hosts='tf-worker0:2200,tf-worker1:2200'

Output:

['tf-ps0:2200']
2019-06-26 10:54:30.359640: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-06-26 10:54:30.359640: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2200}
2019-06-26 10:54:30.359640: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job worker -> {0 -> tf-worker0:2200, 1 -> tf-worker1:2200}
2019-06-26 10:54:30.375240: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Started server with target: grpc://localhost:2200

  • Start the first worker (task 0); run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='worker' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2200' --FLAGS.worker_hosts='tf-worker0:2200,tf-worker1:2200'

Output:

After 700 training steps (1383 global steps), loss on training batch is 0.647718. (0.020 sec/batch)
After 800 training steps (1578 global steps), loss on training batch is 0.413686. (0.020 sec/batch)
After 900 training steps (1773 global steps), loss on training batch is 0.539272. (0.020 sec/batch)
After 1000 training steps (1968 global steps), loss on training batch is 0.439053. (0.019 sec/batch)
Process finished with exit code -1073741819 (0xC0000005)

  • Start the second worker (task 1); run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='worker' --FLAGS.task_id=1 --FLAGS.ps_hosts='tf-ps0:2200' --FLAGS.worker_hosts='tf-worker0:2200,tf-worker1:2200'

Output:

After 3500 training steps (4554 global steps), loss on training batch is 0.428365. (0.020 sec/batch)
After 3600 training steps (4654 global steps), loss on training batch is 0.307349. (0.020 sec/batch)
After 3700 training steps (4754 global steps), loss on training batch is 0.457617. (0.020 sec/batch)
After 3800 training steps (4854 global steps), loss on training batch is 0.299042. (0.020 sec/batch)
After 3900 training steps (4954 global steps), loss on training batch is 0.344163. (0.020 sec/batch)
2019-06-26 12:41:38.586070: W tensorflow/core/distributed_runtime/master_session.cc:1363] Timeout for closing worker session
Process finished with exit code 0


3.3.2 Distributed Training in Synchronous Mode

  • One parameter server and two workers
# 《TensorFlow实战Google深度学习框架》Chapter 10: accelerating TensorFlow computation
# Example program for the synchronous update mode

import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import mnist_inference

# Network training parameters.
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.8
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 10000
MOVING_AVERAGE_DECAY = 0.99

LOG_SAVE_PATH = "logs/show03"
MODEL_SAVE_PATH = "models/model_synchronous_distributed"
MODEL_NAME = "model.ckpt"
DATA_PATH = "./MNIST_data"
# DATA_PATH = "tfrecords/output"

port0='2203'
port1='2204'

# Set up the flags in the same way as in the asynchronous example.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string('job_name', 'worker', ' "ps" or "worker" ')
tf.app.flags.DEFINE_string(
    'ps_hosts', 'tf-ps0:%s,tf-ps1:%s'%(port0, port1),
    'Comma-separated list of hostname:port for the parameter server jobs. e.g. "tf-ps0:%s,tf-ps1:%s" '%(port0, port1))
tf.app.flags.DEFINE_string(
    'worker_hosts', 'tf-worker0:%s,tf-worker1:%s'%(port0, port1),
    'Comma-separated list of hostname:port for the worker jobs. e.g. "tf-worker0:%s,tf-worker1:%s" '%(port0, port1))
tf.app.flags.DEFINE_integer('task_id', 0, 'Task ID of the worker/replica running the training.')

# Build the TensorFlow graph in the same way as in the asynchronous example; the only
# difference is that tf.train.SyncReplicasOptimizer handles the synchronous update.
def build_model(x, y_, n_workers, is_chief):
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = mnist_inference.inference(x, regularizer)
    global_step = tf.Variable(0, trainable=False)

    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)

    # Implement synchronous updates with tf.train.SyncReplicasOptimizer.
    opt = tf.train.SyncReplicasOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate),
        replicas_to_aggregate=n_workers,
        total_num_replicas=n_workers)

    train_op = opt.minimize(loss, global_step=global_step)
    if is_chief:
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        with tf.control_dependencies([variables_averages_op, train_op]):
            train_op = tf.no_op()

    return global_step, loss, train_op, opt


def main(argv=None):
    # Create the TensorFlow cluster in the same way as in the asynchronous example.
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)
    n_workers = len(worker_hosts)

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(
        cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        with tf.device("/cpu:0"):
            server.join()

    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)

    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_id, cluster=cluster)):
        x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')
        global_step, loss, train_op, opt = build_model(x, y_, n_workers, is_chief)
        # Declare the same auxiliary operations as in the asynchronous example.
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()

        # In synchronous mode, the chief worker coordinates the gradients computed by the
        # different workers and applies the final parameter update, which requires some
        # extra initialization work on the chief.
        if is_chief:
            # Get the queue that coordinates the workers; the chief must start it before the parameters can be updated.
            chief_queue_runner = opt.get_chief_queue_runner()
            # Operation that initializes the token queue for synchronous updates.
            init_tokens_op = opt.get_init_tokens_op(0)

        # Declare tf.train.Supervisor as in the asynchronous example.
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=MODEL_SAVE_PATH,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=saver,
                                 global_step=global_step,
                                 save_model_secs=60,
                                 save_summaries_secs=60)
        sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        # On the chief worker, start the coordination queue and run the token initialization.
        if is_chief:
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(init_tokens_op)

        # Run the training iterations as in the asynchronous example.
        step = 0
        start_time = time.time()
        while not sv.should_stop():
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            _, loss_value, global_step_value = sess.run([train_op, loss, global_step], feed_dict={x: xs, y_: ys})
            if global_step_value >= TRAINING_STEPS: break

            if step > 0 and step % 100 == 0:
                duration = time.time() - start_time
                sec_per_batch = duration / (global_step_value * n_workers)
                format_str = "After %d training steps (%d global steps), loss on training batch is %g.  (%.3f sec/batch)"
                print(format_str % (step, global_step_value, loss_value, sec_per_batch))
            step += 1
    sv.stop()

if __name__ == "__main__":
#     FLAGS.job_name = 'ps'
#     FLAGS.task_id = 0
#     FLAGS.ps_hosts = 'tf-ps0:2203'
#     FLAGS.worker_hosts = 'tf-worker0:2203,tf-worker1:2203'
    # python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='ps' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2200' --FLAGS.worker_hosts='tf-worker0:2200,tf-worker1:2200'
    tf.app.run()
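
As a side note, the same synchronous update can also be driven without tf.train.Supervisor (which is deprecated in later TF 1.x releases): SyncReplicasOptimizer provides a session hook for tf.train.MonitoredTrainingSession. A rough sketch only, reusing opt, server, is_chief, train_op, x, y_ and mnist from the script above:

# Sketch: replaces the Supervisor / chief_queue_runner bookkeeping above.
sync_replicas_hook = opt.make_session_run_hook(is_chief)
with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=is_chief,
                                       checkpoint_dir=MODEL_SAVE_PATH,
                                       hooks=[sync_replicas_hook]) as mon_sess:
    while not mon_sess.should_stop():
        xs, ys = mnist.train.next_batch(BATCH_SIZE)
        mon_sess.run(train_op, feed_dict={x: xs, y_: ys})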

Start the parameter server; run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='ps' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2203' --FLAGS.worker_hosts='tf-worker0:2203,tf-worker1:2203'

Output:

2019-06-26 14:33:55.551686: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-06-26 14:33:55.554686: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2203}
2019-06-26 14:33:55.554686: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:252] Initialize GrpcChannelCache for job worker -> {0 -> tf-worker0:2203, 1 -> tf-worker1:2203}
2019-06-26 14:33:55.559686: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:391] Started server with target: grpc://localhost:2203
Process finished with exit code -1073741819 (0xC0000005)

Start the first worker (task 0); run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='worker' --FLAGS.task_id=0 --FLAGS.ps_hosts='tf-ps0:2203' --FLAGS.worker_hosts='tf-worker0:2203,tf-worker1:2203'

Output:

After 100 training steps (480 global steps), loss on training batch is 0.200888. (0.005 sec/batch)
After 200 training steps (580 global steps), loss on training batch is 0.218024. (0.007 sec/batch)
After 300 training steps (680 global steps), loss on training batch is 0.251224. (0.009 sec/batch)
After 400 training steps (780 global steps), loss on training batch is 0.17771. (0.011 sec/batch)
After 500 training steps (880 global steps), loss on training batch is 0.231131. (0.012 sec/batch)
After 600 training steps (980 global steps), loss on training batch is 0.231515. (0.013 sec/batch)

Start the second worker (task 1); run in cmd:

python -W ignore mnist_train_asynchronous_distributed.py --FLAGS.job_name='worker' --FLAGS.task_id=1 --FLAGS.ps_hosts='tf-ps0:2203' --FLAGS.worker_hosts='tf-worker0:2203,tf-worker1:2203'

Output:

After 100 training steps (480 global steps), loss on training batch is 0.245051. (0.005 sec/batch)
After 200 training steps (580 global steps), loss on training batch is 0.215788. (0.007 sec/batch)
After 300 training steps (680 global steps), loss on training batch is 0.215349. (0.009 sec/batch)
After 400 training steps (780 global steps), loss on training batch is 0.178966. (0.011 sec/batch)
After 500 training steps (880 global steps), loss on training batch is 0.213862. (0.012 sec/batch)
After 600 training steps (980 global steps), loss on training batch is 0.17688. (0.013 sec/batch)


3.4 Running Distributed TensorFlow with Caicloud

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import caicloud_dist_tensorflow_base as caicloud