爬坑tensorflow多gpu并行训练时loss为nan

最新推荐文章于 2023-07-13 17:35:36 发布

潜行隐耀

最新推荐文章于 2023-07-13 17:35:36 发布

阅读量1.9k

点赞数

分类专栏： tensorflow

本文链接：https://blog.csdn.net/PanYHHH/article/details/106578699

版权

tensorflow 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

一、前言

有关tensorflow多gpu并行训练的知识点可以参考这篇博客，本文主要是记录一下自己训练时遇到的几个易错点，代码使用的分布式策略是数据并行中的同步方式。

文章最后会放上训练（含验证）的代码。

二、遇到的坑

1. 数据划分不当导致输出结果为nan

划分数据到不同的gpu上的代码片段如下:

inputs = tf.placeholder(dtype=tf.float32, shape=[None, image_size, image_size, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])

with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:{}'.format(i)):
            # Give GPU i its own contiguous slice of the fed batch:
            # rows [i * current_batch_size, (i + 1) * current_batch_size).
            start = i * current_batch_size
            stop = (i + 1) * current_batch_size
            _inputs = inputs[start:stop]
            _labels = labels[start:stop]

其中current_batch_size为每块gpu上的数据的batch大小。本文使用2块gpu，所以它的值为训练时总batch_size的一半（一般情况下为总batch_size除以gpu数量）。如果该值过大，靠后的gpu在切片时起始下标会超出实际数据的范围，划分出来的四维数组的第一个维度为0；对空数组求平均得到的loss就是nan。

2. 数据最后一个batch过小导致划分出错

我们知道模型是一个batch一个batch分批次读取数据的，当样本总数不能被batch_size整除时，最后一个batch的大小会小于预定的值。所以当上述代码中的current_batch_size为固定值时，最后一个batch的数据划分会出错（靠后的gpu可能分不到任何数据），导致输出结果为nan。

解决方案是将current_batch_size设置为一个占位符

# Per-GPU sample count, fed at run time so it can follow the actual batch size.
current_batch_size = tf.placeholder(dtype=tf.int32)

然后每次训练时，将其设为当前batch实际样本数的一半（即实际样本数除以gpu数量）。验证时同理。

# Materialise the next batch, then derive the per-GPU share from its real size.
batch_x, batch_y = sess.run([image_batch, label_batch])
c_batch_size = batch_x.shape[0] // 2
train_dict = {
    inputs: batch_x,
    labels: batch_y,
    is_training: True,
    keep_prob: 0.6,
    current_batch_size: c_batch_size,
}

最后附上训练（含验证）过程的代码：

# -*- coding: utf-8 -*-
import tensorflow as tf
from utils.preprocess import preprocess_for_train, preprocess_for_test
from utils.parse_functions import parser_tfrecords, parse_list
from utils.generate_list import images_and_labels_list
from tensorflow.contrib.slim import nets
import os
import numpy as np
import time
from prepared_functions import average_gradients

slim = tf.contrib.slim

# Select the GPUs used for the multi-GPU parallel run.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

# ---- Network / training parameters ----
num_gpus = 2
num_classes = 10
image_size = 32
batch_size = 256
valid_batch_size = 256

shuffle_buffer = 50000
NUM_EPOCHS = 200
# NOTE(review): starter_learning_rate is not referenced anywhere in the visible
# script; the schedule is built from `boundaries` and `values` instead.
starter_learning_rate = 0.001
boundaries = [10000, 25000]
values = [0.001, 0.0001, 0.00001]

# ---- Paths ----
resnet_model_path = './resnet_v2_50.ckpt'
models_path = './models/temporary/train-model.ckpt'  # checkpoint prefix passed to the saver

logs_dir = "./logs/temporary"
logs_train_dir = os.path.join(logs_dir, "train")
logs_valid_dir = os.path.join(logs_dir, "valid")

train_tfrecords = ["./tfrecords/cifar10/train.tfrecords"]
test_tfrecords = ["./tfrecords/cifar10/test.tfrecords"]

# os.makedirs creates missing parent directories (os.mkdir fails on the nested
# "./logs/temporary" when "./logs" does not exist yet) and exist_ok makes reruns
# harmless. The checkpoint directory is created as well so that saving a model
# does not fail on a missing parent directory.
for dir_name in [logs_dir, logs_train_dir, logs_valid_dir, os.path.dirname(models_path)]:
    os.makedirs(dir_name, exist_ok=True)
"""
""Datasets
"""
# Training pipeline: parse records -> preprocess -> shuffle -> batch -> repeat.
# Batching happens before repeat, so the last batch of an epoch can be smaller
# than batch_size.
# NOTE(review): training images go through preprocess_for_test while the
# imported preprocess_for_train is unused in the visible script -- confirm
# this is intentional.
dataset = (
    tf.data.TFRecordDataset(train_tfrecords)
    .map(parser_tfrecords)
    .map(lambda image, label: (preprocess_for_test(image), label))
    .shuffle(shuffle_buffer)
    .batch(batch_size)
    .repeat(NUM_EPOCHS)
)
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()
# Integer class ids -> one-hot rows of width num_classes.
label_batch = tf.one_hot(indices=tf.cast(label_batch, tf.int32), depth=num_classes)

# Validation pipeline: same parsing/preprocessing, no shuffling and a single pass.
val_dataset = (
    tf.data.TFRecordDataset(test_tfrecords)
    .map(parser_tfrecords)
    .map(lambda image, label: (preprocess_for_test(image), label))
    .batch(valid_batch_size)
)
val_iterator = val_dataset.make_initializable_iterator()
val_image_batch, val_label_batch = val_iterator.get_next()
val_label_batch = tf.one_hot(indices=tf.cast(val_label_batch, tf.int32), depth=num_classes)


"""
""placeholder
"""
# Whole-batch inputs; the towers slice their own share out of these.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, image_size, image_size, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])
keep_prob = tf.placeholder(dtype=tf.float32)
is_training = tf.placeholder(dtype=tf.bool)
# Number of samples each GPU takes from the fed batch; supplied per step.
current_batch_size = tf.placeholder(dtype=tf.int32)

# Learning rate: piecewise-constant schedule keyed on the global step.
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.piecewise_constant(
    global_step, boundaries=boundaries, values=values)

# Optimizer shared by every tower.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Per-tower gradient lists; they are combined after all towers are built.
tower_grads = []
# Per-tower scalar metrics. The exported `loss` and `accuracy` are averaged over
# every tower so that both describe the same samples; taking the loss from the
# last tower and the accuracy from the first one would report two different
# slices of the batch.
tower_losses = []
tower_accuracies = []

with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:{}'.format(i)):
            # Slice this tower's share out of the fed batch.
            _inputs = inputs[i * current_batch_size:(i + 1) * current_batch_size]
            _labels = labels[i * current_batch_size:(i + 1) * current_batch_size]

            # ---- Inference ----
            with slim.arg_scope(nets.resnet_v2.resnet_arg_scope()):
                net, end_points = nets.resnet_v2.resnet_v2_50(_inputs, num_classes=None, is_training=is_training)
            with tf.variable_scope('Logits'):
                net = tf.squeeze(net, axis=[1, 2])
                net = slim.dropout(net, keep_prob=keep_prob, scope='scope')
                logits = slim.fully_connected(net, num_outputs=num_classes,
                                              activation_fn=None, scope='fc')
            # Later towers reuse the variables created by the first one.
            tf.get_variable_scope().reuse_variables()

            # ---- Loss ----
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=_labels)
            tower_loss = tf.reduce_mean(losses)
            tower_losses.append(tower_loss)
            # Collect this tower's gradients for later averaging.
            grads = optimizer.compute_gradients(tower_loss)
            tower_grads.append(grads)

            # ---- Accuracy ----
            correct_prediction = tf.equal(
                tf.cast(tf.argmax(logits, 1), tf.int32), tf.cast(tf.argmax(_labels, 1), tf.int32))
            tower_accuracies.append(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))

# Every tower receives the same number of samples, so the plain mean of the
# per-tower means equals the mean over all samples that were fed to a tower.
loss = tf.reduce_mean(tf.stack(tower_losses))
accuracy = tf.reduce_mean(tf.stack(tower_accuracies))
"""
""Optimizer
"""
# Combine the per-tower gradients with the project helper
# (presumably an element-wise average across towers -- see prepared_functions).
grads = average_gradients(tower_grads)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
# Run the collected update ops together with every optimizer step.
with tf.control_dependencies(update_ops):
    train_step = optimizer.apply_gradients(grads, global_step=global_step)

# Scalar summaries, then one merged op that evaluates all of them.
for summary_tag, summary_tensor in (("lr", learning_rate),
                                    ("accuracy", accuracy),
                                    ("loss", loss)):
    tf.summary.scalar(summary_tag, summary_tensor)
summary_op = tf.summary.merge_all()
"""
" Restore resnet50
"""
# Comma-separated scope prefixes whose variables are NOT loaded from the
# pretrained checkpoint.
checkpoint_exclude_scopes = 'Logits'
# Always a list, so the filter below also works when nothing is excluded
# (iterating over None would raise TypeError).
exclusions = []
if checkpoint_exclude_scopes:
    exclusions = [
        scope.strip() for scope in checkpoint_exclude_scopes.split(',')]
# Keep every model variable whose op name does not start with an excluded prefix.
variables_to_restore = [
    var for var in slim.get_model_variables()
    if not any(var.op.name.startswith(exclusion) for exclusion in exclusions)
]
# saver_restore loads only the filtered variables; saver covers the default
# (full) variable set and is used for the training checkpoints.
saver_restore = tf.train.Saver(var_list=variables_to_restore)
saver = tf.train.Saver()
"""
" Open Session
"""
init = tf.global_variables_initializer()
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(init)
    # Restore after the global init so the loaded weights are the ones kept.
    saver_restore.restore(sess, resnet_model_path)

    train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)  # training log
    valid_writer = tf.summary.FileWriter(logs_valid_dir, sess.graph)  # validation log
    # Initialise the training-data iterator.
    sess.run(iterator.initializer)
    step = 0
    best_acc = 0.0
    start_time = time.time()
    try:
        while True:
            # Only the data fetch signals end-of-data, so only it is guarded.
            try:
                batch_x, batch_y = sess.run([image_batch, label_batch])
            except tf.errors.OutOfRangeError:
                # Training data exhausted: keep a final checkpoint and stop.
                saver.save(sess=sess, save_path=models_path, global_step=step)
                break

            # Per-GPU share of the batch that was actually read.
            c_batch_size = batch_x.shape[0] // num_gpus
            if c_batch_size == 0:
                # Fewer samples than GPUs: every tower slice would be empty and
                # the mean loss over an empty slice is NaN, so skip this batch.
                continue

            # Count a step only once a usable batch is available, so `step`
            # equals the number of optimizer steps that were run.
            step += 1
            train_dict = {inputs: batch_x,
                          labels: batch_y,
                          is_training: True,
                          keep_prob: 0.6,
                          current_batch_size: c_batch_size}
            summary_str, _, loss_t, acc_t = sess.run([summary_op, train_step, loss, accuracy],
                                                     feed_dict=train_dict)
            if step % 10 == 0:
                print('Step: {},Train_Acc: {:.4f},Loss: {:.8f}'.format(step, acc_t, loss_t))
                train_writer.add_summary(summary_str, step)
            if step % 100 == 0:  # periodic validation pass
                sess.run(val_iterator.initializer)
                acc_reg = []
                loss_reg = []
                while True:
                    try:
                        val_x, val_y = sess.run([val_image_batch, val_label_batch])
                    except tf.errors.OutOfRangeError:
                        break
                    val_c_batch_size = val_x.shape[0] // num_gpus
                    if val_c_batch_size == 0:
                        continue
                    valid_dict = {inputs: val_x,
                                  labels: val_y,
                                  is_training: False,
                                  keep_prob: 1.,
                                  current_batch_size: val_c_batch_size}
                    loss_v, acc_v = sess.run([loss, accuracy], feed_dict=valid_dict)
                    acc_reg.append(acc_v)
                    loss_reg.append(loss_v)
                # Unweighted mean over the validation batches.
                avg_acc = np.mean(np.array(acc_reg))
                avg_loss = np.mean(np.array(loss_reg))
                # Log one aggregated point per validation pass instead of one
                # point per batch at the same step.
                valid_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="accuracy", simple_value=float(avg_acc)),
                    tf.Summary.Value(tag="loss", simple_value=float(avg_loss)),
                ])
                valid_writer.add_summary(valid_summary, step)
                print('------------------------------------------------------')
                print('Valid-----> ,Valid_Acc: {:.4f}, Valid_Loss: {:.7f}'.format(avg_acc, avg_loss))
                print('------------------------------------------------------')
                # Keep a checkpoint whenever validation accuracy improves.
                if avg_acc > best_acc:
                    best_acc = avg_acc
                    saver.save(sess=sess, save_path=models_path, global_step=step)
                    print("模型保存成功")
                    print("Save the best model with val_acc %0.4f" % best_acc)
                else:
                    print("Val_acc stay with val_acc %0.4f" % best_acc)
    finally:
        # Flush and close the event files even if training stops on an error.
        train_writer.close()
        valid_writer.close()
    end_time = time.time()
    print("总共用时：", end_time - start_time)
    print('Ended......')

潜行隐耀

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
爬坑tensorflow多gpu并行训练时loss为nan

一、前言有关tensorflow多gpu分布式的知识点介绍可以参考这篇博客，本文主要是记录一下自己训练时遇到的几个易错点，代码使用的分布式策略是数据并行中的同步方式。文章最后会放上完整代码，包括训练及在训练过程中验证的过程。二、易错点1. 数据划分不当导致loss为nan划分数据到不同的gpu上的代码片段如下:inputs = tf.placeholder(tf.float32, [None, image_size, image_size, 3])labels = tf.plac
复制链接

扫一扫

专栏目录