tensorflow 性能优化

最新推荐文章于 2024-08-23 06:30:00 发布

青盏

最新推荐文章于 2024-08-23 06:30:00 发布

阅读量2.1k

点赞数

分类专栏： DL tools

原文链接：https://blog.csdn.net/qq_16234613

版权

DL tools 专栏收录该内容

54 篇文章 1 订阅

订阅专栏

TF-Serving使用Batch提升性能
 OPTIMIZING, PROFILING, AND TUNING TENSORFLOW + GPUS
Save a model for inference that was trained with Horovod

环境变量TF_GPU_THREAD_MODE
latency收益：当 TF_GPU_THREAD_MODE 设置为 gpu_shared 、gpu_private 时，warm-up 无变化，avg 延时有明显收益。TF_GPU_THREAD_COUNT 增加到 4、8 时，表现没有提高，甚至有所下降。
cpu收益：当 TF_GPU_THREAD_MODE 设置为 gpu_shared 、gpu_private 时, cpu 利用率有明显下降。

frozen&trim&trt-optimized
frozen&trim graph 的平均用时与 trt_optimized_graph 相同，意味着主要的收益来自于 graph → frozen&trim graph 的变化。这应该和此过程中减少了很多 node 有关。目前的 graph 中能被 trt 优化的子图部分占比太小，所以效果不够显著。

python bin/relevance_scripts/freeze_graph.py --input_saved_model_dir=/mnt/1564989211/ --output_graph=saved_model.pb --output_node_names=loss/out_query_emb,loss/out_image_emb

tf tensorrt
Speed up TensorFlow Inference on GPUs with TensorRT

重新编译
如何将TensorFlow Serving的性能提高超过70%？
基于TensorFlow Serving的深度学习在线预估

优化流程
Optimizing TensorFlow Models for Serving
github
graph_transforms

import os
import numpy as np
from datetime import datetime
import sys

import tensorflow as tf
from tensorflow import data
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.tools import freeze_graph
from tensorflow.python import ops
from tensorflow.tools.graph_transforms import TransformGraph

def get_size(model_dir, model_file='saved_model.pb'):
  """Print the on-disk size of a model file and its variable files, in KB.

  Args:
    model_dir: Directory that holds the model file and, optionally, a
      `variables/` sub-directory.
    model_file: Name of the graph file inside `model_dir`.

  Returns:
    None. The sizes are only printed.
  """
  graph_path = os.path.join(model_dir, model_file)
  print(graph_path, '')
  graph_bytes = os.path.getsize(graph_path)

  data_path = os.path.join(
      model_dir, 'variables/variables.data-00000-of-00001')
  index_path = os.path.join(model_dir, 'variables/variables.index')
  # The index file is only measured when the single data shard is present.
  if os.path.exists(data_path):
    variable_bytes = os.path.getsize(data_path) + os.path.getsize(index_path)
  else:
    variable_bytes = 0

  def to_kb(num_bytes):
    # Bytes -> kilobytes, rounded to three decimals for display.
    return round(num_bytes / (1024.0), 3)

  print('Model size: {} KB'.format(to_kb(graph_bytes)))
  print('Variables size: {} KB'.format(to_kb(variable_bytes)))
  print('Total Size: {} KB'.format(to_kb(graph_bytes + variable_bytes)))

def describe_graph(graph_def, show_nodes=False):
  """Print a summary of the nodes contained in a graph definition.

  The summary lists placeholder ("input feature") nodes, nodes whose name
  contains 'unused', 'predictions' / 'loss/out' (reported as outputs) or
  'quant', and counts of Const, Variable* and Identity ops, followed by the
  total node count.

  Args:
    graph_def: Object exposing a `node` sequence whose items have `name`
      and `op` string attributes (e.g. a GraphDef).
    show_nodes: When truthy, additionally print one line per node with its
      op type and name.

  Returns:
    None. Everything is written to stdout.
  """
  nodes = graph_def.node

  def names_where(predicate):
    # Names of all nodes matching `predicate`, in graph order.
    return [node.name for node in nodes if predicate(node)]

  def count_where(predicate):
    # Number of nodes matching `predicate`.
    return sum(1 for node in nodes if predicate(node))

  print('Input Feature Nodes: {}'.format(
      names_where(lambda node: node.op == 'Placeholder')))
  print('')
  print('Unused Nodes: {}'.format(
      names_where(lambda node: 'unused' in node.name)))
  print('')
  print('Output Nodes: {}'.format(
      names_where(lambda node: 'predictions' in node.name
                  or 'loss/out' in node.name)))
  print('')
  print('Quantization Nodes: {}'.format(
      names_where(lambda node: 'quant' in node.name)))
  print('')
  print('Constant Count: {}'.format(
      count_where(lambda node: node.op == 'Const')))
  print('')
  print('Variable Count: {}'.format(
      count_where(lambda node: 'Variable' in node.op)))
  print('')
  print('Identity Count: {}'.format(
      count_where(lambda node: node.op == 'Identity')))
  print('', 'Total nodes: {}'.format(len(nodes)), '')

  # Plain truthiness instead of `== True`, so any truthy flag enables the
  # per-node listing.
  if show_nodes:
    for node in nodes:
      print('Op:{} - Name: {}'.format(node.op, node.name))

def get_graph_def_from_file(graph_filepath):
  """Read a binary serialized GraphDef from `graph_filepath`.

  Args:
    graph_filepath: Path of the `.pb` file, opened through `tf.gfile`.

  Returns:
    The parsed `tf.GraphDef` message.
  """
  with ops.Graph().as_default():
    parsed_graph_def = tf.GraphDef()
    with tf.gfile.GFile(graph_filepath, 'rb') as pb_file:
      serialized = pb_file.read()
      parsed_graph_def.ParseFromString(serialized)
    return parsed_graph_def

def get_graph_def_from_saved_model(saved_model_dir):
  """Load a SavedModel tagged for serving and return its GraphDef.

  Args:
    saved_model_dir: Directory containing the exported SavedModel.

  Returns:
    The `graph_def` of the MetaGraphDef that was loaded with the SERVING tag.
  """
  # Load into a private graph rather than the process-wide default graph, so
  # repeated calls (or other graph-building code in the same process) do not
  # accumulate or collide with the imported nodes. This mirrors the
  # `tf.Session(graph=tf.Graph())` used by convert_graph_def_to_saved_model.
  with tf.Session(graph=tf.Graph()) as session:
    meta_graph_def = tf.saved_model.loader.load(
        session,
        tags=[tag_constants.SERVING],
        export_dir=saved_model_dir,
    )
  # The MetaGraphDef is a plain protobuf and stays valid after the session
  # has been closed.
  return meta_graph_def.graph_def


def freeze_model(saved_model_dir, output_node_names, output_filename):
  """Freeze a SavedModel into a single graph file via `freeze_graph`.

  Args:
    saved_model_dir: Directory of the SavedModel to freeze; the frozen graph
      is written into this same directory.
    output_node_names: Output node names, passed through to `freeze_graph`
      unchanged.
    output_filename: File name of the frozen graph inside `saved_model_dir`.
  """
  freeze_kwargs = dict(
      # What to read and where to write.
      input_saved_model_dir=saved_model_dir,
      saved_model_tags=tag_constants.SERVING,
      output_graph=os.path.join(saved_model_dir, output_filename),
      output_node_names=output_node_names,
      initializer_nodes='',
      # Remaining freeze_graph options are passed as empty/None/False values.
      input_graph=None,
      input_saver=False,
      input_binary=False,
      input_checkpoint=None,
      input_meta_graph=False,
      restore_op_name=None,
      filename_tensor_name=None,
      clear_devices=False,
  )
  freeze_graph.freeze_graph(**freeze_kwargs)
  print('graph freezed!')


def optimize_graph(model_dir, graph_filename, transforms, output_node, output_filename):
  """Apply graph transforms to a graph and write the result as a binary file.

  Args:
    model_dir: Directory the graph is read from and the result is written to.
    graph_filename: Name of a graph file inside `model_dir`; when None the
      graph is taken from the SavedModel stored in `model_dir` instead.
    transforms: Transform specifications handed to `TransformGraph`.
    output_node: Output node names handed to `TransformGraph`.
    output_filename: File name of the transformed graph inside `model_dir`.
  """
  if graph_filename is not None:
    source_graph_def = get_graph_def_from_file(
        os.path.join(model_dir, graph_filename))
  else:
    source_graph_def = get_graph_def_from_saved_model(model_dir)

  # No explicit input names are supplied to the transform tool.
  no_inputs = []
  transformed_graph_def = TransformGraph(
      source_graph_def, no_inputs, output_node, transforms)

  tf.train.write_graph(
      transformed_graph_def,
      logdir=model_dir,
      name=output_filename,
      as_text=False)
  print('Graph optimized!')


def convert_graph_def_to_saved_model(export_dir, graph_filepath):
  """Wrap a serialized GraphDef file into a SavedModel with two signatures.

  The export carries the 'serve' tag and defines:
    * "serving_default": every Placeholder except "image_list" as input,
      'loss/out_query_emb:0' as output 'query_emb'.
    * "serving_cnn": the "image_list" Placeholder as input,
      'loss/out_image_emb:0' as output 'image_emb'.

  Args:
    export_dir: Target directory; removed first if it already exists.
    graph_filepath: Path of the binary GraphDef file to convert.
  """
  # Start from a clean export location: any existing directory is removed.
  if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)

  graph_def = get_graph_def_from_file(graph_filepath)

  with tf.Session(graph=tf.Graph()) as session:
    tf.import_graph_def(graph_def, name='')
    builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

    def tensor_info(tensor_name):
      # TensorInfo for a tensor looked up by name in the imported graph.
      return tf.saved_model.utils.build_tensor_info(
          session.graph.get_tensor_by_name(tensor_name))

    def placeholder_inputs(accept_name):
      # Signature inputs: first output of each accepted Placeholder node,
      # keyed by node name.
      return {
          node.name: tensor_info('{}:0'.format(node.name))
          for node in graph_def.node
          if node.op == 'Placeholder' and accept_name(node.name)
      }

    def predict_signature(inputs, outputs):
      return tf.saved_model.signature_def_utils.build_signature_def(
          inputs=inputs,
          outputs=outputs,
          method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

    signature_map = {
        "serving_default": predict_signature(
            placeholder_inputs(lambda name: name != "image_list"),
            {'query_emb': tensor_info('loss/out_query_emb:0')}),
        "serving_cnn": predict_signature(
            placeholder_inputs(lambda name: name == "image_list"),
            {'image_emb': tensor_info('loss/out_image_emb:0')}),
    }
    builder.add_meta_graph_and_variables(session, ['serve'], signature_map)

    builder.save()
    print('Optimized graph converted to SavedModel!')


# Driver script: freeze the SavedModel, run graph transforms on the frozen
# graph, then wrap the optimized graph back into a SavedModel. All
# intermediate files are written into `saved_model_dir`.
# NOTE(review): the path below is hard-coded to the author's environment.
saved_model_dir = "/mnt/cephfs_hl/common/ad/search_ad/huangqingkang/optim/"
# Comma-separated form is passed to freeze_model; the list form is passed to
# optimize_graph.
output_node_names = "loss/out_query_emb,loss/out_image_emb"
output_node_list = output_node_names.split(",")

# Step 1: freeze the model into <saved_model_dir>/frozen_model.pb.
print("### starting freeze ###")
frozen_filename = "frozen_model.pb"
frozen_filepath = os.path.join(saved_model_dir, frozen_filename)

freeze_model(saved_model_dir, output_node_names, frozen_filename)

# Print node statistics and file size of the frozen graph.
describe_graph(get_graph_def_from_file(frozen_filepath))
get_size(saved_model_dir, frozen_filename)


# Step 2: optimize the frozen graph into <saved_model_dir>/optimized_model.pb.
print("### starting optimize ###")
# Transform specifications handed to TransformGraph, in this order.
transforms = [
 "remove_nodes(op=Identity)",
 # "merge_duplicate_nodes",
 "strip_unused_nodes",
 "fold_constants(ignore_errors=true)",
 "fold_batch_norms"
]
optimized_filename = "optimized_model.pb"
optimized_filepath = os.path.join(saved_model_dir, optimized_filename)

optimize_graph(saved_model_dir, frozen_filename , transforms, output_node_list, optimized_filename)

# Print the same statistics again for the optimized graph.
describe_graph(get_graph_def_from_file(optimized_filepath))
get_size(saved_model_dir, optimized_filename)

# Step 3: convert the optimized graph back into a SavedModel under
# <saved_model_dir>/export (an existing export directory is deleted first).
print("### starting convert back ###")
export_dir = os.path.join(saved_model_dir, "export")
convert_graph_def_to_saved_model(export_dir, optimized_filepath)

一、通用优化方法

1、输入流优化

（1）将预处理过程在cpu上执行

（2）使用tf.data API代替feed_dict操作，尤其是大批量输入时

（3）对于图像编码和裁剪，可使用函数tf.image.decode_and_crop_jpeg

（4）使用大文件输入，来防止io瓶颈，将输入文件转换为TFRecord文件。若内存允许，最好将整个数据集加载进内存

2、数据格式

使用NCHW代替NHWC，其中N代表batch大小，H代表图像的高，W代表图像的宽，C代表通道。在CPU中NHWC格式数据更快，但在GPU上NCHW更快。默认是NHWC

3、使用融合操作符

如tf.layers.batch_normalization

4、RNN性能

（1）tf.nn.rnn_cell.BasicLSTMCell：不到迫不得已不要使用这个函数
（2）tf.nn.static_rnn与tf.nn.dynamic_rnn相比，运行时间相差无几，但static_rnn展开后的图更庞大，编译时间更长

（3）在GPU上可以使用tf.contrib.cudnn_rnn函数，但不支持layer normalization

（4）tf.contrib.rnn.LSTMBlockCell与tf.nn.rnn_cell.BasicLSTMCell相比，少用3-4倍内存

5、从源码安装tensorflow

从源码安装tensorflow，使其更加适用于本地cpu

二、针对cpu的优化

1.增加线程数

# Build a session configuration that sets both thread-pool sizes explicitly.
config = tf.ConfigProto()
# Parallelism inside a single operation. 44 is the author's example value;
# presumably it matches the core count of their machine -- adjust as needed.
config.intra_op_parallelism_threads = 44
# Number of operations that may run in parallel.
config.inter_op_parallelism_threads = 44
# NOTE(review): the session is created but not bound to a name here.
tf.Session(config=config)
intra_op_parallelism_threads表示操作内部的并行数，
inter_op_parallelism_threads表示能并行运算的操作数
默认线程数是等于逻辑cores数，可选的优化措施是等于物理cores数

2.MKL优化

（1）从源码安装tensorflow

tensorflow 1.2.0

./configure

Pick the desired options

bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package
tensorflow 1.2.0
./configure
Do you wish to build TensorFlow with MKL support? [y/N] Y
Do you wish to download MKL LIB from the web? [Y/n] Y