在学习 TensorFlow 分布式训练时，运行实验代码遇到了以下错误：
InternalError: Blas GEMM launch failed : a.shape=(100, 784), b.shape=(784, 10), m=100, n=10...
查看了几种解决方案（https://blog.csdn.net/Vinsuan1993/article/details/81142855 ），都不起作用，电脑重启后仍然出错。
最后通过做如下修改,错误解决,然后再将代码改回去,错误未复现。
# Workaround snippet: the computed dimension IMAGE_PIXELS * IMAGE_PIXELS is
# replaced by the literal 784 (28 * 28). The commented-out lines show the
# statements as they were before the change.
IMAGE_PIXELS = 28
# Before: x = tf.placeholder(dtype=tf.float32, shape=[None, IMAGE_PIXELS * IMAGE_PIXELS])
x = tf.placeholder(dtype=tf.float32, shape=[None, 784])
# Before: hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, HIDDEN_UNITS],stddev=1.0 / IMAGE_PIXELS), name='hid_w')
hid_w = tf.Variable(tf.truncated_normal([784, HIDDEN_UNITS],stddev=1.0 / IMAGE_PIXELS), name='hid_w')
我知道这种办法有点笨，但是病急乱投医之下确实管用了，具体原因我也不清楚。由于把代码改回去之后错误也没有再复现，这个修改很可能并不是真正的解决原因。这篇博客主要记录自己当时碰到这个问题是怎么解决的，避免忘记，大家勿喷。
最后附实验源码:
"""
python single_device_single_gpu_mnist.py --job_name=ps --task_index=0
python single_device_single_gpu_mnist.py --job_name=worker --task_index=0
python single_device_single_gpu_mnist.py --job_name=worker --task_index=1
"""
# encoding:utf-8
import math
import tempfile
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
# Command-line flags, declared through the TF1 `tf.app.flags` wrapper.
flags = tf.app.flags
flags.DEFINE_string('job_name', None, 'job name: worker or ps')
flags.DEFINE_integer('task_index', None, 'Index of task within the job')
flags.DEFINE_string('gpu_available', "0,1", 'gpu_available')
# Help text (Chinese): "whether to use the synchronous distributed mode;
# 1 means synchronous, 0 means asynchronous". The flag is declared here but
# not read anywhere in the visible part of the script.
flags.DEFINE_integer("issync", None, "是否采用分布式的同步模式,1表示同步模式,0表示异步模式")
FLAGS = flags.FLAGS

# Restrict and order the GPUs visible to this process.
# Fix: the variable is named CUDA_DEVICE_ORDER; the previous spelling
# "CUDA_DEVICES_ORDER" is not a recognized name, so the PCI-bus ordering was
# never applied and the indices in --gpu_available could refer to other GPUs
# than the ones intended.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_available
# Cluster layout: one parameter-server task and one worker task, both
# addressed on the same host with different ports.
cluster = tf.train.ClusterSpec(
    dict(
        ps=["101.6.66.3:22221"],
        worker=["101.6.66.3:22225"],
    )
)

# Hyperparameters and data location.
DATA_DIR = "./mnist"    # directory handed to the MNIST loader
IMAGE_PIXELS = 28       # image side length; inputs are IMAGE_PIXELS ** 2 wide
HIDDEN_UNITS = 100      # width of the single hidden layer
BATCH_SIZE = 100        # examples per training batch
TRAIN_STEPS = 10000     # declared, but not read in the visible part of the script
LR = 0.01               # learning rate passed to the optimizer
# Validate the command-line input before starting anything.
if FLAGS.job_name not in ["worker", "ps"]:
    raise ValueError("--------FLAGS.job_name input error,must be 'worker' or 'ps'")

# The valid task indices are defined by the cluster spec, not by a fixed
# range: ask the spec how many tasks the selected job declares. A missing
# flag (None) is rejected as well.
num_tasks = cluster.num_tasks(FLAGS.job_name)
if FLAGS.task_index is None or not 0 <= FLAGS.task_index < num_tasks:
    raise ValueError(
        "--------FLAGS.task_index input error, must be an int in [0, {})".format(num_tasks))

# Start the in-process server for the task selected on the command line.
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
if FLAGS.job_name == "ps":
    # A parameter server only serves variables; block here and never continue.
    server.join()

# Only workers get this far, so the dataset is loaded after the ps branch
# instead of being fetched by a process that never uses it.
mnist = input_data.read_data_sets(DATA_DIR, one_hot=True)
# ---- Training graph ----
# Task 0 is treated as the chief. NOTE(review): `is_chief` is computed here
# but not consumed in the visible part of the script.
is_chief = (FLAGS.task_index == 0)
target = server.target
worker_device = "/job:worker/task:{}".format(FLAGS.task_index)
ps_device = "/job:ps/task:0/device:CPU:0"

# replica_device_setter places variables on the ps device and the remaining
# ops on this worker.
with tf.device(tf.train.replica_device_setter(worker_device=worker_device,
                                              ps_device=ps_device,
                                              cluster=cluster)):
    # Inputs: flattened images and one-hot labels over 10 classes.
    x = tf.placeholder(dtype=tf.float32, shape=[None, IMAGE_PIXELS * IMAGE_PIXELS])
    y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])
    # Global step counter, incremented by the optimizer on every update.
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Variables: one hidden layer followed by a 10-way softmax layer.
    hid_w = tf.Variable(
        tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, HIDDEN_UNITS],
                            stddev=1.0 / IMAGE_PIXELS),
        name='hid_w')
    # Fix: size the bias from HIDDEN_UNITS instead of a hard-coded 100 so it
    # stays consistent with hid_w when the hidden width is changed.
    hid_b = tf.Variable(tf.zeros([HIDDEN_UNITS]), name='hid_b')
    sm_w = tf.Variable(
        tf.truncated_normal([HIDDEN_UNITS, 10],
                            stddev=1.0 / math.sqrt(HIDDEN_UNITS)),
        name='sm_w')
    sm_b = tf.Variable(tf.zeros([10]), name='sm_b')

    # Ops: forward pass, loss and training step.
    hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
    hid = tf.nn.relu(hid_lin)
    y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
    # Probabilities are clipped away from zero so that log() stays finite.
    cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
    opt = tf.train.AdamOptimizer(LR)
    train_step = opt.minimize(cross_entropy, global_step=global_step)
# ---- Create the session and run a single training step ----
config = tf.ConfigProto()
config.gpu_options.allow_growth = True   # allocate GPU memory on demand
config.log_device_placement = True       # log the device chosen for each op

# The body of the `with` statement is indented again; as pasted, the
# statements sat at column 0, which is an IndentationError.
# NOTE(review): every worker runs the variable initializer here; confirm this
# is intended before adding more workers to the cluster.
with tf.Session(target, config=config) as sess:
    sess.run(tf.global_variables_initializer())
    print("worker {} : session initialization complete.".format(FLAGS.task_index))

    # One batch, one optimizer update: enough to exercise the matmul path.
    batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
    train_feed = {x: batch_xs, y_: batch_ys}
    _, step = sess.run([train_step, global_step], feed_dict=train_feed)
    print("successfully!")