一、前言
有关tensorflow多gpu并行训练的知识点可以参考这篇博客,本文主要是记录一下自己训练时遇到的几个易错点,代码使用的分布式策略是数据并行中的同步方式。
文章最后会放上训练(含验证)的代码。
二、遇到的坑
1. 数据划分不当导致输出结果为nan
划分数据到不同的gpu上的代码片段如下:
# Placeholders receive the whole batch; each GPU then takes its own slice.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, image_size, image_size, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])
with tf.variable_scope(tf.get_variable_scope()):
    for gpu_index in range(num_gpus):
        with tf.device('/gpu:{}'.format(gpu_index)):
            # Hand this GPU a contiguous slice of current_batch_size samples.
            start = gpu_index * current_batch_size
            stop = (gpu_index + 1) * current_batch_size
            _inputs = inputs[start:stop]
            _labels = labels[start:stop]
其中current_batch_size为每块gpu上分到的数据的batch大小,应等于训练时总batch_size除以gpu数量(本文使用2块gpu,即总batch_size的一半)。如果该值设得过大(例如大于等于实际读到的batch大小),排在后面的gpu的切片起始下标就会超出数据范围,划分出来的四维数组的第一个维度为0;对空张量求均值得到的loss为nan,最终导致训练输出nan。
2. 数据集最后一个batch过小导致划分出错
我们知道模型是一个batch一个batch分批次读取数据的,当样本总数不能被batch_size整除时,最后一个batch的样本数会小于预定的值。所以当上述代码中的current_batch_size为固定值时,最后一个batch的数据划分会出错(排在后面的gpu可能分到空数据),导致输出结果为nan。
解决方案是将current_batch_size设置为一个占位符
# Fed at run time with the number of samples each GPU should take.
current_batch_size = tf.placeholder(dtype=tf.int32)
然后每次训练时将其设为当前batch实际样本数的一半(即实际样本数除以gpu数量),验证时同理:
batch_x, batch_y = sess.run([image_batch, label_batch])
# Per-GPU share of the batch that was actually read. Floor division keeps the
# value an int and follows num_gpus instead of a hard-coded 2.
c_batch_size = batch_x.shape[0] // num_gpus
train_dict = {
    inputs: batch_x,
    labels: batch_y,
    is_training: True,
    keep_prob: 0.6,
    current_batch_size: c_batch_size,
}
最后附上训练(含验证)过程的代码:
# -*- coding: utf-8 -*-
import tensorflow as tf
from utils.preprocess import preprocess_for_train, preprocess_for_test
from utils.parse_functions import parser_tfrecords, parse_list
from utils.generate_list import images_and_labels_list
from tensorflow.contrib.slim import nets
import os
import numpy as np
import time
from prepared_functions import average_gradients
slim = tf.contrib.slim
# Select the GPUs used for parallel training. The device list is the single
# source of truth: num_gpus is derived from it so the two cannot drift apart,
# and the environment variable is written as a plain comma-separated list.
gpu_ids = [0, 1]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in gpu_ids)

# ---- net parameters ----
num_gpus = len(gpu_ids)
num_classes = 10
image_size = 32
batch_size = 256
valid_batch_size = 256
shuffle_buffer = 50000
NUM_EPOCHS = 200
starter_learning_rate = 0.001
# Piecewise-constant schedule: values[k] applies between boundaries[k-1] and
# boundaries[k] (global steps), so len(values) == len(boundaries) + 1.
boundaries = [10000, 25000]
values = [starter_learning_rate, 0.0001, 0.00001]
"""
""path parameters
"""
# Pre-trained weights to restore, checkpoint prefix for saving, log folders
# and the TFRecord files read by the input pipelines.
resnet_model_path = './resnet_v2_50.ckpt'
models_path = './models/temporary/train-model.ckpt'  # checkpoint prefix passed to saver.save
logs_dir = "./logs/temporary"
logs_train_dir = os.path.join(logs_dir, "train")
logs_valid_dir = os.path.join(logs_dir, "valid")
train_tfrecords = ["./tfrecords/cifar10/train.tfrecords"]
test_tfrecords = ["./tfrecords/cifar10/test.tfrecords"]

# os.mkdir cannot create "./logs/temporary" when "./logs" itself is missing,
# so create the whole chain with os.makedirs. The checkpoint directory is
# created as well so that saving does not fail on a missing parent directory.
for dir_name in [logs_dir, logs_train_dir, logs_valid_dir, os.path.dirname(models_path)]:
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)
"""
""Datasets
"""
# Training pipeline: parse records, preprocess, shuffle, batch, then repeat
# for NUM_EPOCHS passes over the data.
# NOTE(review): this pipeline applies preprocess_for_test although
# preprocess_for_train is imported at the top of the file -- confirm whether
# training-time preprocessing was meant to be used here.
dataset = (
    tf.data.TFRecordDataset(train_tfrecords)
    .map(parser_tfrecords)
    .map(lambda image, label: (preprocess_for_test(image), label))
    .shuffle(shuffle_buffer)
    .batch(batch_size)
    .repeat(NUM_EPOCHS)
)
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()
# Labels are turned into one-hot rows of width num_classes.
label_batch = tf.one_hot(tf.cast(label_batch, tf.int32), num_classes)

# Validation pipeline: same parsing and preprocessing, no shuffling and a
# single pass (the iterator is re-initialised before every evaluation).
val_dataset = (
    tf.data.TFRecordDataset(test_tfrecords)
    .map(parser_tfrecords)
    .map(lambda image, label: (preprocess_for_test(image), label))
    .batch(valid_batch_size)
)
val_iterator = val_dataset.make_initializable_iterator()
val_image_batch, val_label_batch = val_iterator.get_next()
val_label_batch = tf.one_hot(tf.cast(val_label_batch, tf.int32), num_classes)
"""
""placeholder
"""
# Whole-batch inputs; the tower loop slices them per GPU.
inputs = tf.placeholder(dtype=tf.float32, shape=[None, image_size, image_size, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])
# Keep probability handed to the dropout layer.
keep_prob = tf.placeholder(dtype=tf.float32)
# Training/inference switch passed to the network.
is_training = tf.placeholder(dtype=tf.bool)
# Number of samples each GPU takes from the fed batch.
current_batch_size = tf.placeholder(dtype=tf.int32)
"""
""Learning rate
"""
# Step counter driving the piecewise-constant learning-rate schedule.
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)

# One optimizer shared by every tower.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

tower_grads = []    # per-tower (gradient, variable) lists
tower_losses = []   # per-sample losses of every tower
tower_correct = []  # per-sample 0/1 correctness of every tower
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:{}'.format(i)):
            # Slice this tower's share of the batch.
            start = i * current_batch_size
            stop = (i + 1) * current_batch_size
            _inputs = inputs[start:stop]
            _labels = labels[start:stop]

            # ---- inference ----
            with slim.arg_scope(nets.resnet_v2.resnet_arg_scope()):
                net, end_points = nets.resnet_v2.resnet_v2_50(
                    _inputs, num_classes=None, is_training=is_training)
            with tf.variable_scope('Logits'):
                net = tf.squeeze(net, axis=[1, 2])
                net = slim.dropout(net, keep_prob=keep_prob, scope='scope')
                logits = slim.fully_connected(net, num_outputs=num_classes,
                                              activation_fn=None, scope='fc')
            # Later towers reuse the variables created by the first one.
            tf.get_variable_scope().reuse_variables()

            # ---- loss ----
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=_labels)
            tower_loss = tf.reduce_mean(losses)
            # Gradients are taken from this tower's own loss only.
            tower_grads.append(optimizer.compute_gradients(tower_loss))
            tower_losses.append(losses)

            # ---- accuracy ----
            correct_prediction = tf.equal(
                tf.cast(tf.argmax(logits, 1), tf.int32),
                tf.cast(tf.argmax(_labels, 1), tf.int32))
            tower_correct.append(tf.cast(correct_prediction, tf.float32))

# Report metrics over the samples of ALL towers. Previously `loss` came from
# the last tower and `accuracy` from the first one, so each metric covered
# only one tower's share of every batch.
loss = tf.reduce_mean(tf.concat(tower_losses, axis=0))
accuracy = tf.reduce_mean(tf.concat(tower_correct, axis=0))

# ---- optimizer ----
# average_gradients comes from prepared_functions; presumably it averages the
# per-tower gradients variable by variable -- confirm against that module.
grads = average_gradients(tower_grads)
# Run the ops registered in UPDATE_OPS together with the training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_step = optimizer.apply_gradients(grads, global_step)
"""
""Summary
"""
# Register one scalar summary per monitored tensor, then merge them into a
# single op that can be evaluated in one sess.run call.
for tag, tensor in (("lr", learning_rate), ("accuracy", accuracy), ("loss", loss)):
    tf.summary.scalar(tag, tensor)
summary_op = tf.summary.merge_all()
"""
" Restore resnet50
"""
# Comma-separated scope prefixes that must NOT be restored from the
# pre-trained checkpoint (here the freshly created classifier under 'Logits').
checkpoint_exclude_scopes = 'Logits'
# Always a list: an empty setting used to leave this as None and made the
# filtering loop raise TypeError. Blank entries (e.g. from a trailing comma)
# are dropped because an empty prefix would match every variable.
exclusions = []
if checkpoint_exclude_scopes:
    exclusions = [
        scope.strip()
        for scope in checkpoint_exclude_scopes.split(',')
        if scope.strip()
    ]
# str.startswith accepts a tuple of prefixes and returns False for an empty
# tuple, so with no exclusions every model variable is restored.
variables_to_restore = [
    var for var in slim.get_model_variables()
    if not var.op.name.startswith(tuple(exclusions))
]
# saver_restore loads only the selected variables; saver covers the default
# variable set and is used for writing training checkpoints.
saver_restore = tf.train.Saver(var_list=variables_to_restore)
saver = tf.train.Saver()
"""
" Open Session
"""
# Run training with periodic validation: initialise variables, restore the
# pre-trained weights, then consume the training iterator until exhausted.
init = tf.global_variables_initializer()
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(init)
    # Restore after init so the pre-trained values overwrite the initial ones.
    saver_restore.restore(sess, resnet_model_path)
    train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)  # training log
    valid_writer = tf.summary.FileWriter(logs_valid_dir, sess.graph)  # validation log
    # Initialise the training data iterator.
    sess.run(iterator.initializer)
    step = 0
    best_acc = 0.0
    start_time = time.time()
    try:
        while True:
            try:
                batch_x, batch_y = sess.run([image_batch, label_batch])
            except tf.errors.OutOfRangeError:
                # Training data exhausted: store the final model and stop.
                saver.save(sess=sess, save_path=models_path, global_step=step)
                break

            # Per-GPU share of the batch that was actually read.
            c_batch_size = batch_x.shape[0] // num_gpus
            if c_batch_size == 0:
                # Fewer samples than GPUs: a tower would receive an empty
                # slice and the mean over it would be NaN, so skip the batch.
                continue
            # Count only batches that are really trained on.
            step += 1
            train_dict = {
                inputs: batch_x,
                labels: batch_y,
                is_training: True,
                keep_prob: 0.6,
                current_batch_size: c_batch_size,
            }
            summary_str, _, loss_t, acc_t = sess.run(
                [summary_op, train_step, loss, accuracy], feed_dict=train_dict)
            if step % 10 == 0:
                print('Step: {},Train_Acc: {:.4f},Loss: {:.8f}'.format(step, acc_t, loss_t))
                train_writer.add_summary(summary_str, step)

            if step % 100 == 0:  # periodic validation
                sess.run(val_iterator.initializer)
                acc_reg = []
                loss_reg = []
                size_reg = []
                while True:
                    try:
                        batch_x, batch_y = sess.run([val_image_batch, val_label_batch])
                    except tf.errors.OutOfRangeError:
                        break
                    c_batch_size = batch_x.shape[0] // num_gpus
                    if c_batch_size == 0:
                        continue
                    valid_dict = {
                        inputs: batch_x,
                        labels: batch_y,
                        is_training: False,
                        keep_prob: 1.,
                        current_batch_size: c_batch_size,
                    }
                    loss_v, acc_v = sess.run([loss, accuracy], feed_dict=valid_dict)
                    acc_reg.append(acc_v)
                    loss_reg.append(loss_v)
                    size_reg.append(c_batch_size)

                if acc_reg:
                    # Weight each batch by its per-tower sample count so a
                    # short final batch does not skew the average.
                    avg_acc = np.average(acc_reg, weights=size_reg)
                    avg_loss = np.average(loss_reg, weights=size_reg)
                    # Write ONE summary per validation run instead of one per
                    # batch at the same step. The tags are meant to match the
                    # training scalars ("accuracy", "loss") -- confirm in
                    # TensorBoard.
                    valid_summary = tf.Summary(value=[
                        tf.Summary.Value(tag="accuracy", simple_value=float(avg_acc)),
                        tf.Summary.Value(tag="loss", simple_value=float(avg_loss)),
                    ])
                    valid_writer.add_summary(valid_summary, step)
                    print('------------------------------------------------------')
                    print('Valid-----> ,Valid_Acc: {:.4f}, Valid_Loss: {:.7f}'.format(avg_acc, avg_loss))
                    print('------------------------------------------------------')

                    # Keep a checkpoint whenever validation accuracy improves.
                    if avg_acc > best_acc:
                        best_acc = avg_acc
                        saver.save(sess=sess, save_path=models_path, global_step=step)
                        print("模型保存成功")
                        print("Save the best model with val_acc %0.4f" % best_acc)
                    else:
                        print("Val_acc stay with val_acc %0.4f" % best_acc)
    finally:
        # Close the event files even when training aborts with an error.
        train_writer.close()
        valid_writer.close()
end_time = time.time()
print("总共用时:", end_time - start_time)
print('Ended......')