Running Distributed TensorFlow in Docker

1. Install Docker

sudo yum install yum-utils device-mapper-persistent-data lvm2
# Add the official Docker CE repository (required before docker-ce can be installed)
sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
sudo yum install docker-ce docker-ce-cli containerd.io
sudo systemctl start docker
sudo systemctl enable docker
 
# Allow the non-root user deploy to run docker commands
sudo groupadd docker
sudo usermod -a -G docker deploy
# Group membership takes effect after deploy logs in again; as a shortcut,
# open up the Docker socket (convenient, but loosens security)
sudo chmod a+rw /var/run/docker.sock
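
To verify that the deploy user can actually talk to the Docker daemon, a quick check (hello-world is Docker's small built-in test image):

docker version
docker run hello-world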

2. Install TensorFlow (CPU)

Pull the image:

docker pull tensorflow/tensorflow

Output once the download finishes:
Status: Downloaded newer image for tensorflow/tensorflow:latest
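
You can confirm the image is available locally with:

docker images tensorflow/tensorflow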

3. Distributed training (1 parameter server + 1 worker)

(1) Create the containers

## Node 1 acts as the parameter server, node 2 as the worker; create one container on each
## Node 1 (e.g. 172.16.128.228)
docker run --name ps -it -p 2222:2222 -v ~/tensorflow:/test/data tensorflow/tensorflow
## Node 2 (e.g. 172.16.128.229)
docker run --name worker -it -p 2222:2222 -v ~/tensorflow:/test/data tensorflow/tensorflow
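
The -v flag maps ~/tensorflow on the host to /test/data inside the container, so the commands below assume distribute.py (listed in section 5) has been saved to ~/tensorflow on each node, and that you run them from the mounted directory inside each container:

cd /test/data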

(2) Run the code

## Node 1 (e.g. 172.16.128.228)
python distribute.py --ps_hosts=172.16.128.228:2222 --worker_hosts=172.16.128.229:2222 --job_name=ps --task_index=0
## Node 2 (e.g. 172.16.128.229)
python distribute.py --ps_hosts=172.16.128.228:2222 --worker_hosts=172.16.128.229:2222 --job_name=worker --task_index=0
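
For reference, these flags map directly onto the tf.train.ClusterSpec that distribute.py builds internally:

cluster = tf.train.ClusterSpec({
    "ps":     ["172.16.128.228:2222"],   # job_name=ps,     task_index=0
    "worker": ["172.16.128.229:2222"],   # job_name=worker, task_index=0
})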

4. Distributed training (1 parameter server + 2 workers)

(1) Create the containers

## Node 1 acts as both parameter server and worker, node 2 as a second worker
## Node 1 (e.g. 172.16.128.228), parameter server
docker run --name ps -it -p 2222:2222 -v ~/tensorflow:/test/data tensorflow/tensorflow
## Node 1 (e.g. 172.16.128.228), worker (port 2223 to avoid clashing with the ps container)
docker run --name worker -it -p 2223:2223 -v ~/tensorflow:/test/data tensorflow/tensorflow
## Node 2 (e.g. 172.16.128.229)
docker run --name worker -it -p 2222:2222 -v ~/tensorflow:/test/data tensorflow/tensorflow

(2) Run the code

## Node 1 (e.g. 172.16.128.228), parameter server
python distribute.py --ps_hosts=172.16.128.228:2222 --worker_hosts=172.16.128.228:2223,172.16.128.229:2222 --job_name=ps --task_index=0
## Node 1 (e.g. 172.16.128.228), worker
python distribute.py --ps_hosts=172.16.128.228:2222 --worker_hosts=172.16.128.228:2223,172.16.128.229:2222 --job_name=worker --task_index=0
## Node 2 (e.g. 172.16.128.229), worker
python distribute.py --ps_hosts=172.16.128.228:2222 --worker_hosts=172.16.128.228:2223,172.16.128.229:2222 --job_name=worker --task_index=1
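
Likewise, the ClusterSpec these flags produce, with one task index per worker entry:

cluster = tf.train.ClusterSpec({
    "ps":     ["172.16.128.228:2222"],    # job_name=ps,     task_index=0
    "worker": ["172.16.128.228:2223",     # job_name=worker, task_index=0 (node 1)
               "172.16.128.229:2222"],    # job_name=worker, task_index=1 (node 2)
})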

5. Test code (distribute.py)

#coding=utf-8
import numpy as np
import tensorflow as tf

# Define parameters
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('learning_rate', 0.00003, 'Initial learning rate.')
tf.app.flags.DEFINE_integer('steps_to_validate', 1000,
                     'Steps to validate and print loss')

# For distributed
tf.app.flags.DEFINE_string("ps_hosts", "172.16.128.228:2222",
                           "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("worker_hosts", "172.16.128.229:2222",
                           "Comma-separated list of hostname:port pairs")
tf.app.flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
tf.app.flags.DEFINE_integer("issync", 0, "是否采用分布式的同步模式,1表示同步模式,0表示异步模式")

# Hyperparameters
learning_rate = FLAGS.learning_rate
steps_to_validate = FLAGS.steps_to_validate

def main(_):
  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")
  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
  server = tf.train.Server(cluster,job_name=FLAGS.job_name,task_index=FLAGS.task_index)

  issync = FLAGS.issync
  if FLAGS.job_name == "ps":
    server.join()
  elif FLAGS.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
      global_step = tf.Variable(0, name='global_step', trainable=False)

      # Placeholders for the scalar input x and its target value y
      input = tf.placeholder("float")
      label = tf.placeholder("float")

      weight = tf.get_variable("weight", [1], tf.float32, initializer=tf.random_normal_initializer())
      tf.summary.histogram('weight', weight)
      biase = tf.get_variable("biase", [1], tf.float32, initializer=tf.random_normal_initializer())
      tf.summary.histogram('biase', biase)
      # Linear model: pred = weight * x + biase
      pred = tf.multiply(input, weight) + biase

      loss_value = loss(label, pred)
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)

      grads_and_vars = optimizer.compute_gradients(loss_value)
      if issync == 1:
        # Synchronous mode: aggregate gradients from all workers before
        # applying an update (the legacy replica_id argument was removed from
        # tf.train.SyncReplicasOptimizer in TF 1.x, so it is omitted here)
        rep_op = tf.train.SyncReplicasOptimizer(optimizer,
                                                replicas_to_aggregate=len(worker_hosts),
                                                total_num_replicas=len(worker_hosts),
                                                use_locking=True)
        train_op = rep_op.apply_gradients(grads_and_vars,
                                          global_step=global_step)
        init_token_op = rep_op.get_init_tokens_op()
        chief_queue_runner = rep_op.get_chief_queue_runner()
      else:
        # Asynchronous mode: each worker applies its gradients independently
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)


      init_op = tf.global_variables_initializer()  # tf.initialize_all_variables() is deprecated
      
      saver = tf.train.Saver()
      tf.summary.scalar('cost', loss_value)
      summary_op = tf.summary.merge_all()
 
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                            logdir="./logs/",
                            init_op=init_op,
                            summary_op=summary_op,
                            saver=saver,
                            global_step=global_step,
                            save_model_secs=60)

    with sv.prepare_or_wait_for_session(server.target) as sess:
      # In synchronous mode the chief worker starts the queue runners and
      # feeds the initial tokens
      if FLAGS.task_index == 0 and issync == 1:
        sv.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_token_op)
      step = 0
      while step < 100000:
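        # Synthetic data: y = 2x + 10 plus Gaussian noise, so training should
        # converge toward weight ≈ 2 and biase ≈ 10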
        train_x = np.random.randn(1)
        train_y = 2 * train_x + np.random.randn(1) * 0.33  + 10
        _, loss_v, step = sess.run([train_op, loss_value,global_step], feed_dict={input:train_x, label:train_y})
        if step % steps_to_validate == 0:
          w,b = sess.run([weight,biase])
          print("step: %d, weight: %f, biase: %f, loss: %f" %(step, w, b, loss_v))

    sv.stop()

def loss(label, pred):
  # Squared error between target and prediction (scalar regression)
  return tf.square(label - pred)



if __name__ == "__main__":
  tf.app.run()
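
The script can be smoke-tested on a single host before involving Docker or a second machine; these are just the two commands from section 3 pointed at localhost (ports 2222 and 2223 assumed free):

## Terminal 1: parameter server
python distribute.py --ps_hosts=localhost:2222 --worker_hosts=localhost:2223 --job_name=ps --task_index=0
## Terminal 2: worker
python distribute.py --ps_hosts=localhost:2222 --worker_hosts=localhost:2223 --job_name=worker --task_index=0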