Fixing low TensorFlow GPU utilization caused by LSTM

The final solution was to replace the original LSTM with CudnnLSTM.

I noticed that some of the models I was running had fairly low GPU utilization. After breaking the model apart and running the pieces separately, I finally determined that the LSTM was dragging GPU utilization down.
[Screenshot: old GPU utilization]
The new utilization is close to 100%.
The code I ended up rewriting is in the appendix.

The original code is also in the appendix. After digging through some references, I felt there was nothing wrong with how it was written.

  • How TensorFlow can make efficient use of the GPU for RNNs
    https://www.zhihu.com/question/299843655
  • Creating a bidirectional LSTM
    https://riptutorial.com/zh-CN/tensorflow/example/17004/%E5%88%9B%E5%BB%BA%E5%8F%8C%E5%90%91lstm

After some searching, I found that CuDNNLSTM can be used instead:

  • https://blog.csdn.net/ssswill/article/details/89889395
  • https://stackoverflow.com/questions/49987261/what-is-the-difference-between-cudnnlstm-and-lstm-in-keras
  • Official documentation
    https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/contrib/cudnn_rnn/CudnnLSTM
  • A usage demo
    https://gist.github.com/protoget/9b45881f23c96e201a90581c8f4b692d
    I've pasted this code in the appendix. In actual use I ran into Fail to find the dnn implementation.
    https://github.com/tensorflow/tensorflow/issues/20067
    https://github.com/keras-team/keras/issues/10634
    But after I pinned a specific GPU, the problem went away.

If you run into

tensorflow.python.framework.errors_impl.UnknownError: Fail to find the dnn implementation.
         [[node cudnn_lstm_1/CudnnRNNCanonicalToParams (defined at lstm.py:596) ]]

check whether the GPU you are using is already occupied by someone else. If it is (even if it still has free memory, that is not enough), switch to a different GPU.
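In my case, pinning the process to an idle GPU before building the graph was what fixed it. A minimal sketch, assuming GPU 1 happens to be free (the index and the allow_growth option are illustrative, not part of my original code):

import os

# Expose only one idle GPU to this process; pick an index nobody else is using.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import tensorflow as tf

# Optionally let TensorFlow allocate GPU memory on demand instead of grabbing it all at once.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    pass  # build and run the CudnnLSTM graph here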

The rewritten version

bilstm = my_rnn.CudnnLSTM(
    num_layers=1, num_units=parent_hidden_size // 2,
    direction='bidirectional',
    dropout=0.3,
    dtype=tf.float32
)
# data reshaping omitted (getting the inputs time-major; see the sketch below)
bilstm.build(inputsPath.get_shape())
# [time_len, batch_size, input_size] -> [time_len, batch_size, num_dirs * num_units]
my_rnn_outputs, _ = bilstm(inputsPath, training=is_training)
# keep only the last time step
root_path_output = my_rnn_outputs[-1, :, :]
# remaining operations omitted
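The omitted data transform mainly gets the input into the time-major layout that CudnnLSTM consumes. A minimal sketch, assuming inputsPath arrives batch-major as [batch_size, time_len, input_size] (the placeholder shape is just an example; the reshaping in my real model is more involved):

import tensorflow as tf

# Hypothetical batch-major input: [batch_size, time_len, input_size]
inputsPath = tf.placeholder(tf.float32, shape=[None, 50, 300], name='inputsPath')

# CudnnLSTM expects time-major tensors:
# [batch_size, time_len, input_size] -> [time_len, batch_size, input_size]
inputsPath = tf.transpose(inputsPath, [1, 0, 2])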

The original bidirectional LSTM


def encode_par_path(embedding_inputs, parent_hidden_size, rnn_layers=1, keep_prob=0.7, bi_lstm=True):
    with tf.variable_scope('path_encoder') as encoder_scope:
        def build_cell(hidden_size):
            def get_single_cell(hidden_size, keep_prob):
                cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
                if keep_prob < 1:
                    cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell(
                [get_single_cell(hidden_size, keep_prob) for _ in range(rnn_layers)])

            return cell

        if not bi_lstm:
            encoder_cell = build_cell(parent_hidden_size)
            encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
                encoder_cell, embedding_inputs,
                # sequence_length=self.par_seq_len,
                dtype=tf.float32, scope=encoder_scope)
            return encoder_outputs, encoder_final_state
        else:
            encoder_cell = build_cell(parent_hidden_size // 2)
            bw_encoder_cell = build_cell(parent_hidden_size // 2)
            encoder_outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                encoder_cell, bw_encoder_cell,
                embedding_inputs,
                # sequence_length=self.par_seq_len,
                dtype=tf.float32, scope=encoder_scope)

            state = []
            for i in range(rnn_layers):
                fs = fw_state[i]
                bs = bw_state[i]
                encoder_final_state_c = tf.concat((fs.c, bs.c), 1)
                encoder_final_state_h = tf.concat((fs.h, bs.h), 1)
                encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(
                    c=encoder_final_state_c,
                    h=encoder_final_state_h)
                state.append(encoder_final_state)
            encoder_final_state = tuple(state)

            encoder_outputs = tf.concat([encoder_outputs[0], encoder_outputs[1]], -1)
            return encoder_outputs, encoder_final_state


# Path2root
root_path = []
with tf.variable_scope("RNN"):
    for time_step in range(num_steps):
        if time_step > 0:
            tf.get_variable_scope().reuse_variables()
        path_output, path_state = encode_par_path(
            inputsPath[:, time_step, :, :], parent_hidden_size)  # [bz, parent_len, hidden]
        root_path.append(path_output[:, -1, :])  # [seq_len, bz, hidden]

root_path_output = tf.stack(axis=0, values=root_path)  # [seq_len, bz, hidden]

Using CudnnLSTM (the demo from the gist)

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import tensorflow as tf


shape = [2, 2, 2]
n_cell_dim = 2

def init_vars(sess):
  sess.run(tf.global_variables_initializer())


def train_graph():
  with tf.Graph().as_default(), tf.device('/gpu:0'):
    with tf.Session() as sess:
      is_training = True

      inputs = tf.random_uniform(shape, dtype=tf.float32)

      lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
          num_layers=1,
          num_units=n_cell_dim,
          direction='bidirectional',
          dtype=tf.float32)
      lstm.build(inputs.get_shape())
      outputs, output_states = lstm(inputs, training=is_training)

      with tf.device('/cpu:0'):
        saver = tf.train.Saver()

      init_vars(sess)
      saver.save(sess, '/tmp/model')


def inf_graph():
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    with tf.Session() as sess:
      single_cell = lambda: tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(
          n_cell_dim, reuse=tf.get_variable_scope().reuse)

      inputs = tf.random_uniform(shape, dtype=tf.float32)
      lstm_fw_cell = [single_cell() for _ in range(1)]
      lstm_bw_cell = [single_cell() for _ in range(1)]
      (outputs, output_state_fw,
       output_state_bw) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
           lstm_fw_cell,
           lstm_bw_cell,
           inputs,
           dtype=tf.float32,
           time_major=True)
      saver = tf.train.Saver()

      saver.restore(sess, '/tmp/model')
      print(sess.run(outputs))


def main(unused_argv):
  train_graph()
  inf_graph()


if __name__ == '__main__':
  tf.app.run(main)
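Before restoring on the CPU, it can be worth checking what the CudnnLSTM training graph actually wrote to the checkpoint. A small sketch, assuming the /tmp/model path used in the demo above:

import tensorflow as tf

# List the variables stored in the checkpoint written by train_graph();
# the CudnnLSTM weights are saved in a form that CudnnCompatibleLSTMCell can restore.
for name, shape in tf.train.list_variables('/tmp/model'):
    print(name, shape)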

An example on MNIST

import tensorflow as tf
import numpy as np
from tqdm import tqdm
from tensorflow.examples.tutorials.mnist import input_data
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

mnist = input_data.read_data_sets('/tmp/data', one_hot=True)
element_size = 28
time_steps = 28
num_classes = 10
batch_size = 64
hidden_layer_size = 128
LOG_DIR = 'culstm'
_inputs = tf.placeholder(tf.float32, shape=[batch_size, time_steps, element_size], name='inputs')
y = tf.placeholder(tf.float32, shape=[None, num_classes], name='labels')

with tf.name_scope('rnn'):
    # CudnnLSTM expects time-major input: [time_steps, batch_size, element_size]
    rnn_input = tf.transpose(_inputs, [1, 0, 2])

    is_training = True
    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers=1, num_units=hidden_layer_size,
        # UnknownError (see above for traceback): CUDNN_STATUS_EXECUTION_FAILED
        # dropout=0.3,
        dtype=tf.float32
    )
    lstm.build(rnn_input.get_shape())
    # [time_len, batch_size, input_size] -> [time_len, batch_size, num_dirs * num_units]
    outputs, _ = lstm(rnn_input, training=is_training)
    # outputs is time-major, so outputs[-1] is the last time step: [batch_size, num_units]
    output = outputs[-1]
with tf.name_scope('fc'):
    w = tf.Variable(tf.truncated_normal([hidden_layer_size, num_classes], mean=0, stddev=0.01), dtype=tf.float32)
    b = tf.Variable(tf.truncated_normal([num_classes], mean=0, stddev=0.01), dtype=tf.float32)
    y_pred = tf.matmul(output, w) + b

loss = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y)
loss = tf.reduce_mean(loss)
optimizer = tf.train.RMSPropOptimizer(0.001, 0.9)
train = optimizer.minimize(loss)
correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

test_data = mnist.test.images[:batch_size].reshape(-1, time_steps, element_size)
test_label = mnist.test.labels[:batch_size]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    pbar = tqdm(range(10000))
    pbar.set_description(
        f'Train loss:       , '
        f'accuracy      '
        f'Test loss:      , '
        f'accuracy        ')
    print()
    for i in pbar:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        batch_x = batch_x.reshape(-1, time_steps, element_size)

        _, loss_np, accuracy_np = sess.run([train, loss, accuracy], feed_dict={_inputs: batch_x, y: batch_y})
        if i % 100 == 99:
            test_loss_np, test_accuracy_np = sess.run([loss, accuracy], feed_dict={_inputs: test_data, y: test_label})
            pbar.set_description(
                f'Train loss: {loss_np:.4f}, '
                f'accuracy {accuracy_np:.4f} '
                f'Test loss: {test_loss_np:.4f}, '
                f'accuracy {test_accuracy_np:.4f}')
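As far as I understand, cuDNN applies RNN dropout between stacked layers, so with num_layers=1 the dropout argument would not do much even when it does not crash. If output dropout is wanted, one workaround sketch is to apply it after the CudnnLSTM call in the example above (the 0.7 keep probability is just an example):

# Replace the call inside the 'rnn' scope with output dropout applied manually.
outputs, _ = lstm(rnn_input, training=is_training)
if is_training:
    # Standard TF 1.x dropout on the time-major outputs.
    outputs = tf.nn.dropout(outputs, keep_prob=0.7)
output = outputs[-1]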
