# TensorFlow: notes on moving from single-machine multi-GPU to distributed training

import argparse
import sys

import tensorflow as tf

import model
import icdar

import time
import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim

# --- Training / data-pipeline configuration flags ---
tf.app.flags.DEFINE_integer('input_size', 512, '')
tf.app.flags.DEFINE_integer('batch_size_per_gpu', 14, '')
tf.app.flags.DEFINE_integer('num_readers', 2, '')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, '')
tf.app.flags.DEFINE_integer('max_steps', 100000, '')
tf.app.flags.DEFINE_float('moving_average_decay', 0.997, '')
tf.app.flags.DEFINE_integer('num_gpus', 1, '')
tf.app.flags.DEFINE_string('checkpoint_path', ' ', '')
# Typo fixed in help text: "resotre" -> "restore".
tf.app.flags.DEFINE_boolean('restore', False, 'whether to restore from checkpoint')
tf.app.flags.DEFINE_integer('save_checkpoint_steps', 1000, '')
tf.app.flags.DEFINE_integer('save_summary_steps', 100, '')
tf.app.flags.DEFINE_string('pretrained_model_path', None, '')
# NOTE(review): main() reads FLAGS.geometry, which is not defined here —
# presumably it is defined as a flag inside the imported icdar module
# (as in the original EAST code base); verify, otherwise add it here.

# --- Cluster topology flags for distributed training ---
tf.app.flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
                         'One of "ps", "worker", "controller", "". Empty for local '
                         'training')
tf.app.flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of task within the job')

FLAGS = tf.app.flags.FLAGS

def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
   """Build one GPU tower's model graph and return its losses.

   Args:
      images: image batch slice fed to this tower.
      score_maps: ground-truth score maps for the slice.
      geo_maps: ground-truth geometry maps for the slice.
      training_masks: per-pixel loss masks for the slice.
      reuse_variables: None for the first tower, truthy afterwards so all
         towers share a single set of model variables.

   Returns:
      (total_loss, model_loss) tensors for this tower.

   NOTE(review): the body was elided in the original post — as written,
   total_loss and model_loss are undefined and calling this raises
   NameError. Presumably it should call model.model(...) / model.loss(...)
   under a variable scope with reuse=reuse_variables; restore the body
   from EAST's multigpu_train.py.
   """
   return total_loss, model_loss

def average_gradients(tower_grads):
    """Average gradients across towers (synchronous multi-GPU reduction).

    Args:
        tower_grads: list over towers, each element a list of
            (gradient, variable) tuples as returned by
            Optimizer.compute_gradients. The variables are shared, so the
            inner lists are aligned position-by-position.

    Returns:
        A single list of (gradient, variable) tuples where each gradient
        is the mean of that variable's gradients over all towers.

    NOTE(review): the body was elided in the original post (it returned an
    undefined name); this is the canonical implementation from the
    TensorFlow CIFAR-10 multi-GPU tutorial.
    """
    average_grads = []
    # zip(*...) groups the i-th (grad, var) pair of every tower together.
    for grad_and_vars in zip(*tower_grads):
        # Add a leading "tower" axis so the grads can be concatenated.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        # Stack along the tower axis and average over it.
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers; keep the first tower's handle.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads

def main(argv=None):
  """Distributed (between-graph replication) training entry point.

  Parses the cluster spec from flags, starts a PS or worker server, builds
  the multi-GPU tower graph on the local worker, and trains with
  synchronous gradient aggregation across workers.

  Fixes vs. the original post:
    * the mon_sess.run(...) call had been dedented to module level, where
      `data` and `mon_sess` are undefined — it now runs inside the loop;
    * the nested `while should_stop` / `for max_steps` loops never
      re-checked should_stop — collapsed into one loop;
    * StopAtStepHook(last_step=10) was a leftover debug value — now uses
      FLAGS.max_steps;
    * `tower_grads` actually accumulated losses, not gradients — renamed;
    * checkpoint_dir was a literal " " — now FLAGS.checkpoint_path;
    * unused `saver` removed (MonitoredTrainingSession checkpoints itself).
  """
  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")

  # Create a cluster from the parameter server and worker hosts.
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

  # Create and start a server for the local task.
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
    # Parameter servers only serve variables; block forever.
    server.join()
    return

  if FLAGS.job_name != "worker":
    # '' (local) and 'controller' modes are not implemented in this script.
    tf.logging.warning('job_name %r: nothing to do, exiting.', FLAGS.job_name)
    return

  # replica_device_setter pins variables on the PS and ops on this worker.
  with tf.device(tf.train.replica_device_setter(
      ps_device="/job:ps/cpu:0",
      worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
      cluster=cluster)):
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Model inputs; spatial dims are dynamic (NHWC).
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
    input_score_maps = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_score_maps')
    # RBOX geometry uses 5 channels, the alternative (QUAD) uses 8.
    # NOTE(review): FLAGS.geometry is presumably defined in icdar.py — verify.
    if FLAGS.geometry == 'RBOX':
      input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 5], name='input_geo_maps')
    else:
      input_geo_maps = tf.placeholder(tf.float32, shape=[None, None, None, 8], name='input_geo_maps')
    input_training_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_training_masks')

    # Split the feed batch evenly across the local GPUs.
    gpus = list(range(FLAGS.num_gpus))
    input_images_split = tf.split(input_images, len(gpus))
    input_score_maps_split = tf.split(input_score_maps, len(gpus))
    input_geo_maps_split = tf.split(input_geo_maps, len(gpus))
    input_training_masks_split = tf.split(input_training_masks, len(gpus))

    num_workers = len(worker_hosts)

    # Synchronous data parallelism across workers: gradients from all
    # workers are aggregated before a single update is applied.
    opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=num_workers,
        total_num_replicas=num_workers,
        name="sync_replicas")

    # Sum of per-tower losses; minimizing the sum backprops through every
    # tower in one step (the original misleadingly called this tower_grads).
    summed_loss = 0
    reuse_variables = None
    for gpu_id in gpus:
      with tf.device("/job:worker/replica:0/task:%d/gpu:%d" % (FLAGS.task_index, gpu_id)):
        with tf.name_scope('model_%d' % gpu_id) as scope:
          iis = input_images_split[gpu_id]
          isms = input_score_maps_split[gpu_id]
          igms = input_geo_maps_split[gpu_id]
          itms = input_training_masks_split[gpu_id]
          total_loss, model_loss = tower_loss(iis, isms, igms, itms, reuse_variables)
          # NOTE(review): only the last tower's UPDATE_OPS are kept, matching
          # the original EAST code (batch-norm stats from a single tower).
          batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
          reuse_variables = True  # share variables with subsequent towers
          summed_loss += total_loss

    apply_gradient_op = opt.minimize(summed_loss, global_step=global_step)

    # Make the train op also run the batch-norm moving-average updates.
    with tf.control_dependencies([apply_gradient_op, batch_norm_updates_op]):
      train_op = tf.no_op(name='train_op')

    # The chief worker (task 0) initializes the sync-replicas queues.
    sync_replicas_hook = opt.make_session_run_hook(FLAGS.task_index == 0)
    hooks = [
        tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        sync_replicas_hook,
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': summed_loss},
                                   every_n_iter=1),
    ]

    config = tf.ConfigProto()
    config.allow_soft_placement = True

    # MonitoredTrainingSession handles initialization, checkpoint
    # save/restore, hook dispatch and clean shutdown (it also handles
    # AbortedError when a PS is preempted).
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           is_chief=(FLAGS.task_index == 0),
                                           checkpoint_dir=FLAGS.checkpoint_path,
                                           hooks=hooks,
                                           config=config) as mon_sess:
      data_generator = icdar.get_batch(num_workers=FLAGS.num_readers,
                                       input_size=FLAGS.input_size,
                                       batch_size=FLAGS.batch_size_per_gpu * len(gpus))

      step = 0
      start = time.time()
      while not mon_sess.should_stop():
        # data: (images, image_fns, score_maps, geo_maps, training_masks);
        # index 1 (file names) is not fed to the graph — TODO confirm
        # against icdar.get_batch.
        data = next(data_generator)
        mon_sess.run(train_op, feed_dict={input_images: data[0],
                                          input_score_maps: data[2],
                                          input_geo_maps: data[3],
                                          input_training_masks: data[4]})
        step += 1
        if step % 100 == 0:
          print('step %d, %.2f s elapsed' % (step, time.time() - start))

if __name__ == "__main__":

  tf.app.run()

 

Issue log:

1. In

       with tf.device(tf.train.replica_device_setter(
               ps_device="/job:ps/cpu:0",
               worker_device="/job:worker/replica:0/task:%d" % FLAGS.task_index,
               cluster=cluster)):

   pay close attention to the worker_device setting.

2. This program never explicitly initializes its variables, while both the
   TensorFlow benchmark and the distributed MNIST examples do (the benchmark
   in particular handles it thoroughly). This is something the program still
   needs to improve.

3. The handling of global_step also needs attention.

4. tf.summary and related tooling should be used more extensively.

Update:

Configure tf.logging first, and only then call tf.logging.info and friends.

If an expert happens to read this, advice is very welcome!

(end of post)