参考文献:
https://blog.csdn.net/CodeMaster_/article/details/76223835
https://blog.csdn.net/u012436149/article/details/53140869
分布式的主要概念有:集群(cluster)、服务器(server)、任务(task / job)。
先用 tf.train.ClusterSpec 定义好集群和各任务对应的服务器,
再用 tf.train.replica_device_setter 指定 ps 和 worker 服务器即可。
实例:
异步
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
BATCH_SIZE = 100                      # examples per training mini-batch
LEARNING_RATE_BASE = .01              # initial learning rate for exponential decay
LEARNING_RATE_DECAY = .99             # decay factor applied once per decay period
REGULARIZATION_RATE = .0001           # L2 regularization strength
TRAINING_STEPS = 20000                # global-step limit used by StopAtStepHook
MOVING_AVERAGE_DECAY = .99            # decay for the ExponentialMovingAverage of weights
MODEL_SAVE_PATH = 'logs/log_async'    # checkpoint directory for MonitoredTrainingSession
DATA_PATH = 'MNIST_data'              # where input_data reads / downloads MNIST
FLAGS = tf.app.flags.FLAGS
# Values defined through FLAGS can be overridden on the command line.
# Arguments: name, default value, help text.
tf.app.flags.DEFINE_string('job_name','worker','parameter_server or worker')
tf.app.flags.DEFINE_string('ps_hosts','localhost:1111,localhost:1112','the list of parameter host name')
tf.app.flags.DEFINE_string('worker_hosts','localhost:2222,localhost:2223','the list of worker host name')
tf.app.flags.DEFINE_integer('task_id',0,'task id of worker hosts of ps hosts')
def get_weight_variable(shape, regularizer):
    """Create (or reuse) a 'weights' variable of `shape` in the current variable scope.

    If `regularizer` is given, its L2 penalty on the weights is added to the
    'losses' collection, where the loss construction picks it up via tf.add_n.
    """
    weights = tf.get_variable("weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
    # `is not None` (identity test) is the correct idiom, not `!= None`.
    if regularizer is not None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights
#实现两层的全连接神经网络
def inference(input_tensor, regularizer):
    """Forward pass of a two-layer fully connected network (784 -> 500 -> 10).

    Returns the unscaled logits of the output layer.
    """
    with tf.variable_scope('layer1'):
        w1 = get_weight_variable([784, 500], regularizer)
        b1 = tf.get_variable("biases", [500], initializer=tf.constant_initializer(0.0))
        hidden = tf.nn.relu(tf.matmul(input_tensor, w1) + b1)
    with tf.variable_scope('layer2'):
        w2 = get_weight_variable([500, 10], regularizer)
        b2 = tf.get_variable("biases", [10], initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(hidden, w2) + b2
    return logits
def build_model(x, y_, is_chief):
    """Build the (asynchronous) training graph: inference, loss, SGD step.

    Args:
        x: input placeholder, shape [None, 784].
        y_: one-hot label placeholder, shape [None, 10].
        is_chief: whether this worker also maintains moving averages of the
            trainable variables.

    Returns:
        (global_step, loss, train_op)
    """
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = inference(x, regularizer)
    # BUG FIX: the module is tf.contrib.framework (singular), not
    # tf.contrib.frameworks -- the original line raised AttributeError.
    global_step = tf.contrib.framework.get_or_create_global_step()
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1)))
    # Total loss = data loss + L2 penalties collected by get_weight_variable.
    loss = cross_entropy + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        60000 / BATCH_SIZE,   # decay period: roughly one epoch of MNIST
        LEARNING_RATE_DECAY
    )
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    # Only the chief worker maintains exponential moving averages.
    if is_chief:
        ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        ema_op = ema.apply(tf.trainable_variables())
        train_op = tf.group(train_op, ema_op)
    return global_step, loss, train_op
def main(_):
    """Entry point: run this process as a ps or worker task (async training)."""
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    # Define the cluster. 'ps' and 'worker' are the job names; ps_hosts and
    # worker_hosts are lists of "address:port" strings (e.g. 127.0.0.1:1001)
    # naming the servers that run each job; task_id indexes into the list.
    cluster = tf.train.ClusterSpec(
        {
            'ps':ps_hosts,
            'worker':worker_hosts
        }
    )
    # The in-process server that executes this task of the cluster.
    server = tf.train.Server(cluster,job_name=FLAGS.job_name,task_index=FLAGS.task_id)
    # A parameter server blocks here, listening for parameter updates
    # sent by the workers and applying them.
    if FLAGS.job_name == 'ps':
        with tf.device('/cpu:0'):
            server.join()
    # Only one worker (task 0, the "chief") initializes variables and writes
    # checkpoints; the other workers wait for initialization to finish.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH,one_hot=True)
    # Full signature for reference: tf.train.replica_device_setter(ps_tasks=0,
    #   ps_device='/job:ps', worker_device='/job:worker', merge_devices=True,
    #   cluster=None, ps_ops=None)
    # Pins variables to the ps job and ops to this worker, so data can flow
    # between the two kinds of servers automatically.
    device_setter = tf.train.replica_device_setter(
        worker_device='/job:worker/task:%d'%FLAGS.task_id,
        cluster=cluster
    )
    with tf.device(device_setter):
        x = tf.placeholder(tf.float32,[None,784],name='x-input')
        y_ = tf.placeholder(tf.float32,[None,10],name='y-output')
        global_step,loss,train_op = build_model(x,y_,is_chief)
        # Stop training automatically once global_step reaches TRAINING_STEPS.
        hooks = [tf.train.StopAtStepHook(last_step=TRAINING_STEPS)]
        config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=True)
        # master: target of the server executing the distributed task.
        # is_chief: whether this task initializes the variables (task 0 here).
        # checkpoint_dir: where variable values are stored.
        # hooks: stopping conditions; save_checkpoint_secs: checkpoint frequency.
        # This session supervises the data exchange between the servers.
        with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            checkpoint_dir=MODEL_SAVE_PATH,
            hooks=hooks,
            save_checkpoint_secs=60,
            config=config
        ) as sess:
            print('session start:')
            step = 0
            # NOTE(review): sess.should_stop() returns True (not False, as the
            # original comment claimed) once a stop condition is met; the loop
            # therefore runs while it is False.
            while not sess.should_stop():
                xs,ys = mnist.train.next_batch(BATCH_SIZE)
                _,loss_value,global_step_val = sess.run([train_op,loss,global_step],feed_dict={x:xs,y_:ys})
                if step > 0 and step % 10 == 0:
                    print('global_step:{},step:{},loss:{}'.format(global_step_val,step,loss_value))
                step += 1
if __name__ == '__main__':
    # tf.app.run() parses the command-line FLAGS and then invokes main().
    tf.app.run()
分别运行ps,worker1,worker2
python train.py
--job_name='ps'
--task_id=0
python train.py
--job_name='worker'
--task_id=0
python train.py
--job_name='worker'
--task_id=1
同步的代码和异步大同小异,使用了SyncReplicasOptimizer和sync_replicas_hook实现了worker间的同步更新和协调。
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
BATCH_SIZE = 100                      # examples per training mini-batch
LEARNING_RATE_BASE = .01              # initial learning rate for exponential decay
LEARNING_RATE_DECAY = .99             # decay factor applied once per decay period
REGULARIZATION_RATE = .0001           # L2 regularization strength
TRAINING_STEPS = 20000                # global-step limit used by StopAtStepHook
MOVING_AVERAGE_DECAY = .99            # decay for the ExponentialMovingAverage of weights
# BUG FIX: was 'logs/log_async', which collides with the asynchronous script's
# checkpoints (a different graph, so restore would fail); use its own dir.
MODEL_SAVE_PATH = 'logs/log_sync'
DATA_PATH = 'MNIST_data'              # where input_data reads / downloads MNIST
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name','worker','parameter_server or worker')
# BUG FIX: ps_hosts defaulted to the same ports as worker_hosts (2222/2223),
# so a ps and a worker could never bind on one machine; mirror the async
# script's defaults (1111/1112 for ps).
tf.app.flags.DEFINE_string('ps_hosts','localhost:1111,localhost:1112','the list of parameter host name')
tf.app.flags.DEFINE_string('worker_hosts','localhost:2222,localhost:2223','the list of worker host name')
tf.app.flags.DEFINE_integer('task_id',0,'task id of worker hosts of ps hosts')
def get_weight_variable(shape, regularizer):
    """Create (or reuse) a 'weights' variable of `shape` in the current variable scope.

    If `regularizer` is given, its L2 penalty on the weights is added to the
    'losses' collection, where the loss construction picks it up via tf.add_n.
    """
    weights = tf.get_variable("weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
    # `is not None` (identity test) is the correct idiom, not `!= None`.
    if regularizer is not None:
        tf.add_to_collection('losses', regularizer(weights))
    return weights
def inference(input_tensor, regularizer):
    """Forward pass of a two-layer fully connected network (784 -> 500 -> 10).

    Returns the unscaled logits of the output layer.
    """
    with tf.variable_scope('layer1'):
        w1 = get_weight_variable([784, 500], regularizer)
        b1 = tf.get_variable("biases", [500], initializer=tf.constant_initializer(0.0))
        hidden = tf.nn.relu(tf.matmul(input_tensor, w1) + b1)
    with tf.variable_scope('layer2'):
        w2 = get_weight_variable([500, 10], regularizer)
        b2 = tf.get_variable("biases", [10], initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(hidden, w2) + b2
    return logits
def build_model(x, y_, n_workers, is_chief):
    """Build the synchronous training graph.

    Args:
        x: input placeholder, shape [None, 784].
        y_: one-hot label placeholder, shape [None, 10].
        n_workers: number of worker replicas whose gradients are aggregated
            before each parameter update.
        is_chief: whether this worker also maintains moving averages of the
            trainable variables.

    Returns:
        (global_step, loss, train_op, sync_replicas_hook)
    """
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = inference(x, regularizer)
    # BUG FIX: the module is tf.contrib.framework (singular), not
    # tf.contrib.frameworks -- the original line raised AttributeError.
    global_step = tf.contrib.framework.get_or_create_global_step()
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1)))
    # Total loss = data loss + L2 penalties collected by get_weight_variable.
    loss = cross_entropy + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        60000 / BATCH_SIZE,   # decay period: roughly one epoch of MNIST
        LEARNING_RATE_DECAY
    )
    # SyncReplicasOptimizer aggregates n_workers gradients before applying a
    # single update; its session-run hook coordinates the workers.
    opt = tf.train.SyncReplicasOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate),
        replicas_to_aggregate=n_workers,
        total_num_replicas=n_workers)
    sync_replicas_hook = opt.make_session_run_hook(is_chief)
    train_op = opt.minimize(loss, global_step=global_step)
    # Only the chief worker maintains exponential moving averages.
    if is_chief:
        ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        ema_op = ema.apply(tf.trainable_variables())
        train_op = tf.group(train_op, ema_op)
    return global_step, loss, train_op, sync_replicas_hook
def main(_):
    """Entry point: run this process as a ps or worker task (sync training)."""
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    # 'ps' and 'worker' are job names; each maps to a list of "address:port"
    # strings, and task_id indexes into the list for this process.
    cluster = tf.train.ClusterSpec(
        {
            'ps': ps_hosts,
            'worker': worker_hosts
        }
    )
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_id)
    # A parameter server blocks here forever, serving variable updates.
    if FLAGS.job_name == 'ps':
        with tf.device('/cpu:0'):
            server.join()
    # Task 0 is the chief: it initializes variables and writes checkpoints.
    is_chief = (FLAGS.task_id == 0)
    mnist = input_data.read_data_sets(DATA_PATH, one_hot=True)
    # Pin variables to the ps job and ops to this worker.
    device_setter = tf.train.replica_device_setter(
        worker_device='/job:worker/task:%d' % FLAGS.task_id,
        cluster=cluster
    )
    with tf.device(device_setter):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, 10], name='y-output')
        # BUG FIX: the original called build_model(x, y_, is_chief), omitting
        # the required n_workers argument (a TypeError at runtime). Pass the
        # number of worker replicas so gradients from all of them are
        # aggregated each step.
        global_step, loss, train_op, sync_replicas_hook = build_model(
            x, y_, len(worker_hosts), is_chief)
        # sync_replicas_hook coordinates the synchronous updates between
        # workers; StopAtStepHook ends training at TRAINING_STEPS.
        hooks = [sync_replicas_hook, tf.train.StopAtStepHook(last_step=TRAINING_STEPS)]
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
        with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            checkpoint_dir=MODEL_SAVE_PATH,
            hooks=hooks,
            save_checkpoint_secs=60,
            config=config
        ) as sess:
            print('session start:')
            step = 0
            # should_stop() turns True once a hook requests a stop.
            while not sess.should_stop():
                xs, ys = mnist.train.next_batch(BATCH_SIZE)
                _, loss_value, global_step_val = sess.run(
                    [train_op, loss, global_step], feed_dict={x: xs, y_: ys})
                if step > 0 and step % 10 == 0:
                    print('global_step:{},step:{},loss:{}'.format(global_step_val, step, loss_value))
                step += 1
if __name__ == '__main__':
    # tf.app.run() parses the command-line FLAGS and then invokes main().
    tf.app.run()