Here is a TensorFlow multi-GPU demo (TF 1.8). It trains on the MNIST dataset and can be run directly.
Note: ops that cross GPUs seem to misbehave: in the snippet at the bottom of this post, c != a + b. I'd welcome an explanation! (A guessed cause and a workaround sketch follow that snippet.)
import tensorflow as tf
import numpy as np
import os
import argparse


def arg_config():
    parser = argparse.ArgumentParser()
    # comma-separated physical GPU ids, no spaces
    parser.add_argument('-gpu', type=str, required=False, default='1,2')
    args = parser.parse_args()
    # config
    log_device_placement = True    # log which device each op is placed on
    allow_soft_placement = True    # if a requested device does not exist, let TF fall back automatically
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.90, allow_growth=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu  # physical GPU ids to expose
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    return args, config
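One subtlety worth calling out: CUDA_VISIBLE_DEVICES renumbers the devices, so with -gpu 1,2 TensorFlow still sees the two physical cards as logical /gpu:0 and /gpu:1. That is why the training loop below can simply use /gpu:{i} for i in range(gpu_num). A quick way to confirm the mapping (standard TF 1.x API, run after the environment variable is set):

# Quick check of which logical GPUs TF can see. With '-gpu 1,2' this
# should print ['/device:GPU:0', '/device:GPU:1'].
from tensorflow.python.client import device_lib
print([d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU'])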
def data_loader():
    mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    train_data = mnist.train.images  # np.array of shape (55000, 784)
    train_data = np.reshape(train_data, newshape=(-1, 28, 28, 1))
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    ds = tf.data.Dataset.from_tensor_slices((train_data.astype(np.float32), train_labels))
    ds = ds.repeat().batch(2000)
    it = ds.make_one_shot_iterator()
    dt, lb = it.get_next()
    return dt, lb
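To sanity-check the input pipeline on its own, you can fetch one batch and look at the shapes (a standalone sketch, assuming the data_loader above):

# Standalone sanity check for the input pipeline: pull one batch and
# confirm the shapes match what the model expects.
dt, lb = data_loader()
with tf.Session() as sess:
    images, labels = sess.run([dt, lb])
    print(images.shape, images.dtype)  # (2000, 28, 28, 1) float32
    print(labels.shape, labels.dtype)  # (2000,) int32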
# multi-GPU model: all variables live on the CPU, compute runs on the GPUs
class Model(object):
    def __init__(self):
        pass

    @staticmethod
    def var_on_cpu(name, shape, initializer=tf.truncated_normal_initializer(stddev=0.1)):
        with tf.device('/cpu:0'):
            var = tf.get_variable(name, shape, tf.float32, initializer)
        return var

    def conv2d(self, inputs, filters, kernel_size, strides=(1, 1), name=None):
        kernel = self.var_on_cpu(name + '/kernel', shape=kernel_size + (inputs.shape[3].value, filters))
        bias = self.var_on_cpu(name + '/bias', shape=(filters,))
        conv = tf.nn.conv2d(input=inputs, filter=kernel, strides=(1,) + strides + (1,),
                            padding='SAME', name=name)
        bias_add = tf.nn.bias_add(conv, bias)
        return tf.nn.relu(bias_add)

    def dense(self, inputs, units, name, activation=None):
        var = self.var_on_cpu(name + '/kernel', shape=(inputs.shape[1].value, units))
        bias = self.var_on_cpu(name + '/bias', shape=(units,))
        ds = tf.matmul(inputs, var) + bias
        if activation is not None:
            return activation(ds)
        return ds

    def model(self, data, training=True, scope_name='haha'):
        with tf.variable_scope(name_or_scope=scope_name, reuse=tf.AUTO_REUSE):
            conv1 = self.conv2d(inputs=data, filters=32, kernel_size=(5, 5), name='conv1')
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
            conv2 = self.conv2d(inputs=pool1, filters=64, kernel_size=(5, 5), name='conv2')
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
            pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
            dense = self.dense(inputs=pool2_flat, units=1024, name='dense1', activation=tf.nn.relu)
            dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=training)
            # logits layer
            logits = self.dense(inputs=dropout, units=10, name='dense2')
        return logits

    def get_loss(self, data, labels, training=True):
        outs = self.model(data, training=training)
        ls = tf.losses.sparse_softmax_cross_entropy(labels, outs)
        return tf.reduce_mean(ls)
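Pinning every variable to the CPU is what lets the GPU towers share one set of weights: together with tf.AUTO_REUSE, each tower that calls model() reuses the same CPU-resident variables instead of allocating its own copies. A minimal sketch of that reuse behavior (the scope name 'reuse_check' is just for illustration):

# Under AUTO_REUSE, the second get_variable call returns the very same
# variable object, so there is exactly one CPU-resident copy of 'w'.
m = Model()
with tf.variable_scope('reuse_check', reuse=tf.AUTO_REUSE):
    v1 = m.var_on_cpu('w', shape=(3,))
with tf.variable_scope('reuse_check', reuse=tf.AUTO_REUSE):
    v2 = m.var_on_cpu('w', shape=(3,))
print(v1 is v2)  # True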
def average_grads(tower):
    """Average the per-GPU (grad, var) lists element-wise across towers."""
    averaged_grads = []
    for grads_and_vars in zip(*tower):
        # grads_and_vars holds one (grad, var) pair per tower, all for the same variable
        grads = []
        for g, _ in grads_and_vars:
            expanded_grad = tf.expand_dims(g, 0, 'expand_grads')
            grads.append(expanded_grad)
        grad = tf.concat(values=grads, axis=0)
        grad = tf.reduce_mean(input_tensor=grad, axis=0, keepdims=False)
        g_and_v = (grad, grads_and_vars[0][1])
        averaged_grads.append(g_and_v)
    return averaged_grads
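average_grads takes one (grad, var) list per tower and reduces them to a single list whose gradients are the element-wise mean across towers. A toy check with two fake towers and one variable (the values are made up):

# Two towers, one shared variable: the averaged gradient should be the
# element-wise mean of [1, 2] and [3, 4], i.e. [2, 3].
v = tf.Variable([0.0, 0.0])
tower = [[(tf.constant([1.0, 2.0]), v)],
         [(tf.constant([3.0, 4.0]), v)]]
avg = average_grads(tower)
with tf.Session() as sess:
    print(sess.run(avg[0][0]))  # [2. 3.]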
def train_multi_gpu():
    with tf.device('/cpu:0'):
        args, config = arg_config()
        gpu_num = len(args.gpu.split(','))
        global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
        model = Model()
        data_all, label_all = data_loader()
        # split one big batch into one shard per GPU
        data = tf.split(data_all, gpu_num)
        label = tf.split(label_all, gpu_num)
        optimizer = tf.train.MomentumOptimizer(0.01, 0.9)
        tower = []
        with tf.variable_scope('gpu_vars'):
            for i in range(gpu_num):
                with tf.device("/gpu:{}".format(i)), tf.name_scope('tower_{}'.format(i)):
                    loss_op = model.get_loss(data[i], label[i])
                    tf.add_to_collection(name='total_loss', value=loss_op)
                    grads_and_vars = optimizer.compute_gradients(loss_op, tf.trainable_variables())
                    tower.append(grads_and_vars)
        # average the tower gradients on the CPU and apply them once
        mean_grads_and_vars = average_grads(tower)
        total_loss_op = tf.get_collection('total_loss', 'gpu_vars')  # list with one loss per tower
        with tf.control_dependencies([g for g, _ in mean_grads_and_vars]):
            train_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step, name='optimizer')
    print('running...')
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        step = 0
        while step < 1000:
            _, loss = sess.run([train_op, total_loss_op])
            print(step, loss)
            step += 1


if __name__ == "__main__":
    train_multi_gpu()
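The snippet below is the minimal repro of the c != a + b problem mentioned at the top. Run it as a separate script rather than appending it to the training code above: arg_config() sets CUDA_VISIBLE_DEVICES, which renumbers the devices, while this snippet assumes the physical /gpu:1 and /gpu:2 are visible.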
# repro: c != a + b when a and b live on different GPUs
with tf.device('/gpu:1'), tf.variable_scope('haha', reuse=tf.AUTO_REUSE):
    a = tf.get_variable(name='a', shape=[3], dtype=tf.float32, initializer=tf.truncated_normal_initializer())
with tf.device('/gpu:2'):
    b = tf.constant(value=[1, 2, 3], dtype=tf.float32, shape=[3], name='b')
with tf.device('/gpu:1'):
    c = tf.add(a, b)

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    print('c: ', sess.run([c]))
    print('b: ', sess.run([b]))
    print('a: ', sess.run([a]))
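A hedged guess at why c != a + b: a lives on /gpu:1 and b on /gpu:2, so computing c forces TF to copy b from one GPU to the other. One plausible suspect is that direct device-to-device copy, which depends on peer-to-peer access between the two cards (you can inspect the link topology with nvidia-smi topo -m). I can't confirm that is the cause here, but a common workaround is to stage the tensor through host memory, sketched below:

# Workaround sketch (assumption: the wrong value comes from the direct
# GPU2 -> GPU1 copy). Forcing b through the CPU makes the transfer go
# GPU2 -> host -> GPU1, which does not rely on peer-to-peer access.
with tf.device('/cpu:0'):
    b_host = tf.identity(b)  # explicit staging copy in host memory
with tf.device('/gpu:1'):
    c_fixed = tf.add(a, b_host)

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    print('c_fixed: ', sess.run(c_fixed))  # should now equal a + b

If c_fixed comes out right while c stays wrong, that would point at the cross-GPU copy rather than the add itself.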