记录一次失败的TensorFlow调试经历

ljyfree

已于 2022-11-03 18:01:32 修改

阅读量846

点赞数

分类专栏： Linux hpc 文章标签： tensorflow 深度学习 python

于 2022-11-03 16:39:38 首次发布

本文链接：https://blog.csdn.net/ljyfree/article/details/127673067

版权

Linux 同时被 2 个专栏收录

25 篇文章 1 订阅

订阅专栏

hpc

2 篇文章 0 订阅

订阅专栏

最后没有调通是令人沮丧的，但是并非一无所获

选例

发现《TensorFlow分布式MNIST手写字体识别实例》
直接给了代码，就它了

软件版本

Ubuntu20.04用pip3 安装tensorflow和tensorflow-datasets后，发现下载数据报错

>>> tfds.load('mnist', data_dir='/root/mnist/MNIST_data', with_info=True)
2022-11-03 11:04:23.302791: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".

改为安装指定版本，问题解决

pip3 install tensorflow==2.3.0

文件缺失

运行时遇到

Traceback (most recent call last):
  File "distributed.py", line 6, in <module>
    from tensorflow.examples.tutorials.mnist import input_data
ModuleNotFoundError: No module named 'tensorflow.examples.tutorials'

# git clone https://github.com/tensorflow/tensorflow
# cp -r tensorflow/tensorflow/examples/ /usr/local/lib/python3.8/dist-packages/tensorflow/

#链接：https://pan.baidu.com/s/1UTt6VE5KLIh-6J-OZqTkuw 提取码：sbh0 
#下载后解压缩
# cp -r tutorials /usr/local/lib/python3.8/dist-packages/tensorflow/examples/

修改

调试中遇到的问题包括

#报错一
python2的print "..."语句需要改为python3的print("...")函数调用

#报错二
Traceback (most recent call last):
  File "distributed.py", line 8, in <module>
    flags = tf.app.flags
AttributeError: module 'tensorflow' has no attribute 'app'

解决方法
-import tensorflow as tf
+import tensorflow.compat.v1 as tf


#报错三
Traceback (most recent call last):
  File "distributed.py", line 122, in <module>
    tf.app.run()
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/platform/app.py", line 36, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "distributed.py", line 56, in main
    with tf.device(tf.train.replica_device_setter(
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/ops.py", line 5501, in device
    raise RuntimeError(
RuntimeError: tf.device does not support functions when eager execution is enabled.
解决方法
def main(unused_argv):
+    if tf.__version__ >='2.0.0':
+        tf.compat.v1.disable_eager_execution()

最终修改

# diff -ruN distributed.py_bak distributed.py
--- distributed.py_bak  2022-11-03 16:30:25.538814566 +0800
+++ distributed.py      2022-11-03 15:24:09.920309171 +0800
@@ -2,22 +2,22 @@
 import math
 import tempfile
 import time
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
 flags = tf.app.flags
 IMAGE_PIXELS = 28
 # 定义默认训练参数和数据路径
-flags.DEFINE_string('data_dir', '/tmp/mnist-data', 'Directory  for storing mnist data')
+flags.DEFINE_string('data_dir', '/root/tensorflow_share', 'Directory  for storing mnist data')
 flags.DEFINE_integer('hidden_units', 100, 'Number of units in the hidden layer of the NN')
 flags.DEFINE_integer('train_steps', 10000, 'Number of training steps to perform')
 flags.DEFINE_integer('batch_size', 100, 'Training batch size ')
 flags.DEFINE_float('learning_rate', 0.01, 'Learning rate')
 # 定义分布式参数
 # 参数服务器parameter server节点
-flags.DEFINE_string('ps_hosts', '192.168.32.145:22221', 'Comma-separated list of hostname:port pairs')
+flags.DEFINE_string('ps_hosts', '5.5.5.254:22221', 'Comma-separated list of hostname:port pairs')
 # 两个worker节点
-flags.DEFINE_string('worker_hosts', '192.168.32.146:22221,192.168.32.160:22221',
+flags.DEFINE_string('worker_hosts', '5.5.5.1:22221,5.5.5.2:22221',
                     'Comma-separated list of hostname:port pairs')
 # 设置job name参数
 flags.DEFINE_string('job_name', None, 'job name: worker or ps')
@@ -30,16 +30,18 @@
 
 
 def main(unused_argv):
+    if tf.__version__ >='2.0.0':
+        tf.compat.v1.disable_eager_execution()
     mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 
     if FLAGS.job_name is None or FLAGS.job_name == '':
         raise ValueError('Must specify an explicit job_name !')
     else:
-        print 'job_name : %s' % FLAGS.job_name
+        print('job_name : %s' % FLAGS.job_name)
     if FLAGS.task_index is None or FLAGS.task_index == '':
         raise ValueError('Must specify an explicit task_index!')
     else:
-        print 'task_index : %d' % FLAGS.task_index
+        print('task_index : %d' % FLAGS.task_index)
 
     ps_spec = FLAGS.ps_hosts.split(',')
     worker_spec = FLAGS.worker_hosts.split(',')
@@ -85,14 +87,15 @@
                                  global_step=global_step)
 
         if is_chief:
-            print 'Worker %d: Initailizing session...' % FLAGS.task_index
+            print('Worker %d: Initailizing session...' % FLAGS.task_index)
         else:
-            print 'Worker %d: Waiting for session to be initaialized...' % FLAGS.task_index
-        sess = sv.prepare_or_wait_for_session(server.target)
-        print 'Worker %d: Session initialization  complete.' % FLAGS.task_index
+            print('Worker %d: Waiting for session to be initaialized...' % FLAGS.task_index)
+        config = tf.ConfigProto(allow_soft_placement = True)
+        sess = sv.prepare_or_wait_for_session(server.target,config=config)
+        print('Worker %d: Session initialization  complete.' % FLAGS.task_index)
 
         time_begin = time.time()
-        print 'Traing begins @ %f' % time_begin
+        print('Traing begins @ %f' % time_begin)
 
         local_step = 0
         while True:
@@ -103,19 +106,19 @@
             local_step += 1
 
             now = time.time()
-            print '%f: Worker %d: traing step %d dome (global step:%d)' % (now, FLAGS.task_index, local_step, step)
+            print('%f: Worker %d: traing step %d dome (global step:%d)' % (now, FLAGS.task_index, local_step, step))
 
             if step >= FLAGS.train_steps:
                 break
 
         time_end = time.time()
-        print 'Training ends @ %f' % time_end
+        print('Training ends @ %f' % time_end)
         train_time = time_end - time_begin
-        print 'Training elapsed time:%f s' % train_time
+        print('Training elapsed time:%f s' % train_time)
 
         val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
         val_xent = sess.run(cross_entropy, feed_dict=val_feed)
-        print 'After %d training step(s), validation cross entropy = %g' % (FLAGS.train_steps, val_xent)
+        print('After %d training step(s), validation cross entropy = %g' % (FLAGS.train_steps, val_xent))
     sess.close()
 
 if __name__ == '__main__':

最后卡在

ps节点运行正常
worker节点运行报错

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/client/session.py", line 1378, in _do_call
    return fn(*args)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/client/session.py", line 1361, in _run_fn
    return self._call_tf_sessionrun(options, feed_dict, fetch_list,
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/client/session.py", line 1454, in _call_tf_sessionrun
    return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Converting GraphDef to Graph has failed. The binary trying to import the GraphDef was built when GraphDef version was 440. The GraphDef was produced by a binary built when GraphDef version was 1205. The difference between these versions is larger than TensorFlow's forward compatibility guarantee. The following error might be due to the binary trying to import the GraphDef being too old: NodeDef mentions attr 'validate_shape' not in Op<name=AssignVariableOp; signature=resource:resource, value:dtype -> ; attr=dtype:type; is_stateful=true>; NodeDef: {{node Adam/AssignVariableOp_1}}. (Check whether your GraphDef-interpreting binary is up to date with your GraphDef-generating binary.).

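看上去是版本相差太多了…
从报错信息看，导入GraphDef的一端GraphDef版本是440，生成GraphDef的一端是1205，说明各节点上的TensorFlow版本很可能不一致，需要先确认所有节点都装了同一版本
源文件提交于2017年，妥妥的是tensorflow 1.x的代码
而Ubuntu20.04（Python 3.8）上pip3能装的最低版本是2.2.0，装不了1.x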

# pip3 install tensorflow==1.10.0
ERROR: Could not find a version that satisfies the requirement tensorflow==1.10.0 (from versions: 2.2.0, 2.2.1, 2.2.2, 2.2.3, 2.3.0, 2.3.1, 2.3.2, 2.3.3, 2.3.4, 2.4.0, 2.4.1, 2.4.2, 2.4.3, 2.4.4, 2.5.0, 2.5.1, 2.5.2, 2.5.3, 2.6.0rc0, 2.6.0rc1, 2.6.0rc2, 2.6.0, 2.6.1, 2.6.2, 2.6.3, 2.6.4, 2.6.5, 2.7.0rc0, 2.7.0rc1, 2.7.0, 2.7.1, 2.7.2, 2.7.3, 2.7.4, 2.8.0rc0, 2.8.0rc1, 2.8.0, 2.8.1, 2.8.2, 2.8.3, 2.9.0rc0, 2.9.0rc1, 2.9.0rc2, 2.9.0, 2.9.1, 2.9.2, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2)
ERROR: No matching distribution found for tensorflow==1.10.0