# Reference: https://www.tensorflow.org/tutorials/deep_cnn/
# 多机多卡(未验证) -- multi-machine, multi-GPU (unverified):
# coding=utf-8
'''
Created on Jan 4, 2017
@author: colinliang
tensorflow 单机多卡程序示例,
参考: tensorflow示例cifar10_multi_gpu_train.py
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
def _allocate_variable(name, shape, initializer, dtype=tf.float32):
    """Obtain a variable through ``tf.get_variable`` under a CPU device scope.

    After the variable is obtained, its op name and device string are
    printed in the form ``<op name>: <device>``.

    Args:
        name: Variable name forwarded to ``tf.get_variable``.
        shape: Variable shape forwarded to ``tf.get_variable``.
        initializer: Initializer forwarded to ``tf.get_variable``.
        dtype: Variable dtype forwarded to ``tf.get_variable``; defaults to
            ``tf.float32``.

    Returns:
        The variable returned by ``tf.get_variable``.
    """
    # Original author's notes on placement (translated):
    # - TensorFlow handles cross-device communication for variables
    #   automatically, so a variable may live on either the GPU or the CPU.
    # - On a single machine with a single GPU, keeping everything on the GPU
    #   is faster (no explicit device needed; let TF place it).
    # - On a single machine with multiple GPUs, the CPU is slightly faster;
    #   the gap may be small because the author's two GPUs are linked via
    #   SLI, so GPU-to-GPU communication is reasonably quick.
    # Alternative noted by the author: ``with tf.device(None):`` leaves the
    # variable on the current default device instead of forcing the CPU.
    with tf.device('/cpu:0'):  # force placement in host (main) memory
        variable = tf.get_variable(
            name, shape, initializer=initializer, dtype=dtype)

    # Report where the variable ended up.
    placement = (variable.op.name, variable.device)
    print('%s: %s' % placement)
    return variable
# 创建网络 y=xw+b
def tower(input_tensor, target_tensor, scope, dims=[]):
for i, d in enumerate(dims):
with tf.variable_scope('affine%d' % i) as varscope: # 仅仅用于生成变量的全名,与存放设备无关
w = _allocate_variable('w', shape=[input_tensor.get_shape()[1], d], initializer=tf.truncated_normal_initializer(0, 1));
b = _allocate_variable('b', shape=[], initializer=tf.zeros_initializer);
input_tensor = tf.matmul(input_tensor, w) + b;
input_tensor = tf.nn.relu(input_tensor)
with tf.variable_scope('affine_last') as varscope: # 仅仅用于生成变量的全名,与存放设备无关<