Model Deployment: Converting to a Static Graph and the Inference Engine

# Excerpt from a predict method of a class wrapping the inference Predictor:
# feed the inputs, run the engine, fetch the logits
self.input_handles[0].copy_from_cpu(input_ids)
# RNN-family networks take the real sequence lengths as an additional input
if network in [
        "lstm", "bilstm", "gru", "bigru", "rnn", "birnn",
        "bilstm_attn"
]:
    self.input_handles[1].copy_from_cpu(seq_lens)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()

The input and output handles serve a role much like converting numpy arrays into the tensors fed to the model and back again: internally, the model operates entirely on tensors.

Inference Deployment

With training finished, the next step is to deploy the model for inference. The dynamic-graph mode used during training has many advantages, including a Pythonic programming experience (especially noticeable for networks containing control flow, such as RNNs) and a friendly interactive debugging workflow. However, the Python dynamic-graph mode cannot fully satisfy the performance requirements of the inference stage, and it also constrains the deployment environment.

Static graphs are the form usually adopted for inference deployment. Because the network structure is defined in advance, a static graph avoids the relatively expensive execution of Python code that a dynamic graph requires; moreover, the fixed graph structure makes graph-based optimizations possible, which can effectively improve inference performance. Common graph-based optimization strategies include memory reuse and operator fusion, both of which need support from the inference engine. A typical example of operator fusion is replacing the matmul -> add bias -> relu sequence in a Transformer block's FFN with a single fused operator, as sketched below:
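
As a rough illustration of what such a fusion means numerically, here is a minimal numpy sketch. The ffn_fused function is a hypothetical stand-in for the engine's native fused kernel (it is not Paddle's actual fused operator); the point is simply that one call can replace three separate ops and their intermediate tensors.

import numpy as np

def ffn_unfused(x, w, b):
    # Three separate ops, each materializing an intermediate result
    t = np.matmul(x, w)        # matmul
    t = t + b                  # add bias
    return np.maximum(t, 0.0)  # relu

def ffn_fused(x, w, b):
    # Hypothetical fused operator: the same math in one call,
    # saving intermediate tensors and kernel launches
    return np.maximum(np.matmul(x, w) + b, 0.0)

x = np.random.rand(4, 128).astype("float32")
w = np.random.rand(128, 512).astype("float32")
b = np.random.rand(512).astype("float32")
assert np.allclose(ffn_unfused(x, w, b), ffn_fused(x, w, b))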
High-performance inference deployment therefore requires two things: exporting a static-graph model and an inference engine. Each is introduced below.

Exporting the Model via Dynamic-to-Static Conversion

Static-graph-based inference deployment requires converting the dynamic-graph model into a static-graph model (the network structure plus the parameter weights).
In Paddle, a static-graph model (a network composed of variables and operators) is stored in a Program. How a Program is built can be illustrated with Paddle's static-graph mode: under static-graph mode, each API executed while constructing the network adds its input/output variables and the operators it uses to the Program.

import paddle
# Dynamic-graph mode is the default; switch to static-graph mode here
paddle.enable_static()
# Define an input variable. Under static graph it is only a symbolic placeholder and,
# unlike a dynamic-graph Tensor, holds no actual data
x = paddle.static.data(shape=[None, 128], dtype='float32', name='x')
linear = paddle.nn.Linear(128, 256, bias_attr=False)
# Build the network; inputs and outputs are symbolic as well
y = linear(x)
# Print the program
print(paddle.static.default_main_program())
# Switch back to dynamic-graph mode
paddle.disable_static()

{ // block 0
    var x : LOD_TENSOR.shape(-1, 128).dtype(float32).stop_gradient(True)
    persist trainable param linear_0.w_0 : LOD_TENSOR.shape(128, 256).dtype(float32).stop_gradient(False)
    var linear_1.tmp_0 : LOD_TENSOR.shape(-1, 256).dtype(float32).stop_gradient(False)
    persist trainable param linear_2.w_0 : LOD_TENSOR.shape(128, 256).dtype(float32).stop_gradient(False)
    var linear_3.tmp_0 : LOD_TENSOR.shape(-1, 256).dtype(float32).stop_gradient(False)

    {Out=['linear_1.tmp_0']} = matmul(inputs={X=['x'], Y=['linear_0.w_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, alpha = 1.0, force_fp32_output = False, fused_reshape_Out = [], fused_reshape_X = [], fused_reshape_Y = [], fused_transpose_Out = [], fused_transpose_X = [], fused_transpose_Y = [], head_number = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], transpose_X = False, transpose_Y = False, use_mkldnn = False, use_quantizer = False)
    {Out=['linear_3.tmp_0']} = matmul(inputs={X=['x'], Y=['linear_2.w_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, alpha = 1.0, force_fp32_output = False, fused_reshape_Out = [], fused_reshape_X = [], fused_reshape_Y = [], fused_transpose_Out = [], fused_transpose_X = [], fused_transpose_Y = [], head_number = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], transpose_X = False, transpose_Y = False, use_mkldnn = False, use_quantizer = False)
}

Building on its static-graph mechanism, Paddle provides the ability to convert a dynamic-graph model and export it in static-graph form (network structure and parameter weights), which is done via jit.to_static and jit.save.

paddle.jit.to_static performs the conversion from a dynamic-graph model to a static-graph model.
Network structure: the dynamic-graph model's forward function is transcribed (most importantly, Python control flow is rewritten into calls to the corresponding Paddle APIs) and then executed in static-graph mode to generate the Program.
Parameter weights: the dynamic-graph model's parameters are mapped onto the corresponding variables when the Program is generated.
The conversion also needs InputSpec to describe the model inputs (shape, dtype, and name) so that shapes and data types are correct while the Program is being built.

# To log the transcribed code, uncomment the following line
# paddle.jit.set_code_level(100)

# Load the trained dynamic-graph weights
param_state_dict = paddle.load("best_model_7380_933.pdparams")
student.set_state_dict(param_state_dict)

# Dynamic-to-static conversion. `input_spec` describes the inputs the model expects;
# None in a shape means that dimension is variable, similar to `paddle.static.data`
# in the static-graph example above
model = paddle.jit.to_static(
    student,
    input_spec=[
        paddle.static.InputSpec(
            shape=[None, None], dtype="int64"),  # input_ids: [batch_size, max_seq_len]
        paddle.static.InputSpec(
            shape=[None], dtype="int64")  # length: [batch_size]
    ])

Print the Program produced by the conversion, together with its input variables:

print(model.forward.concrete_program.main_program)
print(model.forward.inputs)

Print the model's parameter weights:

print(model.forward.concrete_program.parameters[0].name)
print(model.forward.concrete_program.parameters[0].value)
{ // block 0
var x : LOD_TENSOR.shape(-1, -1).dtype(int64).stop_gradient(False)
var seq_len : LOD_TENSOR.shape(-1,).dtype(int64).stop_gradient(False)
persist trainable param embedding_3.w_0 : LOD_TENSOR.shape(29496, 300).dtype(float32).stop_gradient(False)
var embedding_0.tmp_0 : LOD_TENSOR.shape(-1, -1, 300).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_0.w_0 : LOD_TENSOR.shape(1200, 300).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_0.w_1 : LOD_TENSOR.shape(1200, 300).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_0.b_0 : LOD_TENSOR.shape(1200,).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_0.b_1 : LOD_TENSOR.shape(1200,).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_1.w_0 : LOD_TENSOR.shape(1200, 300).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_1.w_1 : LOD_TENSOR.shape(1200, 300).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_1.b_0 : LOD_TENSOR.shape(1200,).dtype(float32).stop_gradient(False)
persist trainable param lstm_cell_1.b_1 : LOD_TENSOR.shape(1200,).dtype(float32).stop_gradient(False)
var lstm_0._generated_var_0 : LOD_TENSOR.shape().dtype(uint8).stop_gradient(True)
var fill_constant_batch_size_like_0.tmp_0 : LOD_TENSOR.shape(2, -1, 300).dtype(float32).stop_gradient(True)
var fill_constant_batch_size_like_1.tmp_0 : LOD_TENSOR.shape(2, -1, 300).dtype(float32).stop_gradient(True)
var transpose_0.tmp_0 : LOD_TENSOR.shape(-1, -1, 300).dtype(float32).stop_gradient(False)
var transpose_0.tmp_1 : LOD_TENSOR.shape(0, -1, -1, 300).dtype(float32).stop_gradient(False)
var lstm_0.tmp_0 : LOD_TENSOR.shape(-1, -1, 600).dtype(float32).stop_gradient(False)
var lstm_0.tmp_1 : LOD_TENSOR.shape(2, -1, 300).dtype(float32).stop_gradient(False)
var lstm_0.tmp_2 : LOD_TENSOR.shape(2, -1, 300).dtype(float32).stop_gradient(False)
var lstm_0.tmp_3 : LOD_TENSOR.shape().dtype(uint8).stop_gradient(True)
var transpose_1.tmp_0 : LOD_TENSOR.shape(-1, -1, 600).dtype(float32).stop_gradient(False)
var transpose_1.tmp_1 : LOD_TENSOR.shape(0, -1, -1, 600).dtype(float32).stop_gradient(False)
var lstm_0.tmp_1_slice_0 : LOD_TENSOR.shape(-1, 300).dtype(float32).stop_gradient(False)
var lstm_0.tmp_1_slice_1 : LOD_TENSOR.shape(-1, 300).dtype(float32).stop_gradient(False)
var concat_0.tmp_0 : LOD_TENSOR.shape(-1, 600).dtype(float32).stop_gradient(False)
persist trainable param linear_20.w_0 : LOD_TENSOR.shape(600, 300).dtype(float32).stop_gradient(False)
persist trainable param linear_20.b_0 : LOD_TENSOR.shape(300,).dtype(float32).stop_gradient(False)
var linear_0.tmp_0 : LOD_TENSOR.shape(-1, 300).dtype(float32).stop_gradient(False)
var linear_0.tmp_1 : LOD_TENSOR.shape(-1, 300).dtype(float32).stop_gradient(False)
var tanh_0.tmp_0 : LOD_TENSOR.shape(-1, 300).dtype(float32).stop_gradient(False)
persist trainable param linear_21.w_0 : LOD_TENSOR.shape(300, 2).dtype(float32).stop_gradient(False)
persist trainable param linear_21.b_0 : LOD_TENSOR.shape(2,).dtype(float32).stop_gradient(False)
var linear_1.tmp_0 : LOD_TENSOR.shape(-1, 2).dtype(float32).stop_gradient(False)
var linear_1.tmp_1 : LOD_TENSOR.shape(-1, 2).dtype(float32).stop_gradient(False)

{Out=['embedding_0.tmp_0']} = lookup_table_v2(inputs={Ids=['x'], W=['embedding_3.w_0']}, epmap = [], height_sections = [], is_distributed = False, is_sparse = False, op_device = , op_namescope = /, op_role = 0, op_role_var = [], padding_idx = 0, remote_prefetch = False, table_names = [], trainer_id = 0)
{Out=['fill_constant_batch_size_like_0.tmp_0']} = fill_constant_batch_size_like(inputs={Input=['embedding_0.tmp_0']}, dtype = 5, force_cpu = False, input_dim_idx = 0, op_device = , op_namescope = /, op_role = 0, op_role_var = [], output_dim_idx = 1, shape = [2, -1, 300], str_value = 0.0, value = 0.0)
{Out=['fill_constant_batch_size_like_1.tmp_0']} = fill_constant_batch_size_like(inputs={Input=['embedding_0.tmp_0']}, dtype = 5, force_cpu = False, input_dim_idx = 0, op_device = , op_namescope = /, op_role = 0, op_role_var = [], output_dim_idx = 1, shape = [2, -1, 300], str_value = 0.0, value = 0.0)
{Out=['transpose_0.tmp_0'], XShape=['transpose_0.tmp_1']} = transpose2(inputs={X=['embedding_0.tmp_0']}, axis = [1, 0, 2], data_format = AnyLayout, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_mkldnn = False, use_quantizer = False)
{DropoutState=['lstm_0._generated_var_0'], Out=['lstm_0.tmp_0'], Reserve=['lstm_0.tmp_3'], State=['lstm_0.tmp_1', 'lstm_0.tmp_2']} = rnn(inputs={Input=['transpose_0.tmp_0'], PreState=['fill_constant_batch_size_like_0.tmp_0', 'fill_constant_batch_size_like_1.tmp_0'], SequenceLength=['seq_len'], WeightList=['lstm_cell_0.w_0', 'lstm_cell_0.w_1', 'lstm_cell_1.w_0', 'lstm_cell_1.w_1', 'lstm_cell_0.b_0', 'lstm_cell_0.b_1', 'lstm_cell_1.b_0', 'lstm_cell_1.b_1']}, dropout_prob = 0.10000000149011612, hidden_size = 300, input_size = 300, is_bidirec = True, is_test = False, mode = LSTM, num_layers = 1, op_device = , op_namescope = /, op_role = 0, op_role_var = [], seed = 0)
{Out=['transpose_1.tmp_0'], XShape=['transpose_1.tmp_1']} = transpose2(inputs={X=['lstm_0.tmp_0']}, axis = [1, 0, 2], data_format = AnyLayout, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_mkldnn = False, use_quantizer = False)
{Out=['lstm_0.tmp_1_slice_0']} = slice(inputs={EndsTensor=[], EndsTensorList=[], Input=['lstm_0.tmp_1'], StartsTensor=[], StartsTensorList=[]}, axes = [0], decrease_axis = [0], ends = [-1], infer_flags = [1], op_device = , op_namescope = /, op_role = 0, op_role_var = [], starts = [-2])
{Out=['lstm_0.tmp_1_slice_1']} = slice(inputs={EndsTensor=[], EndsTensorList=[], Input=['lstm_0.tmp_1'], StartsTensor=[], StartsTensorList=[]}, axes = [0], decrease_axis = [0], ends = [10000000], infer_flags = [1], op_device = , op_namescope = /, op_role = 0, op_role_var = [], starts = [-1])
{Out=['concat_0.tmp_0']} = concat(inputs={AxisTensor=[], X=['lstm_0.tmp_1_slice_0', 'lstm_0.tmp_1_slice_1']}, axis = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_mkldnn = False, use_quantizer = False)
{Out=['linear_0.tmp_0']} = matmul(inputs={X=['concat_0.tmp_0'], Y=['linear_20.w_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, alpha = 1.0, force_fp32_output = False, fused_reshape_Out = [], fused_reshape_X = [], fused_reshape_Y = [], fused_transpose_Out = [], fused_transpose_X = [], fused_transpose_Y = [], head_number = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], transpose_X = False, transpose_Y = False, use_mkldnn = False, use_quantizer = False)
{Out=['linear_0.tmp_1']} = elementwise_add(inputs={X=['linear_0.tmp_0'], Y=['linear_20.b_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, axis = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_mkldnn = False, use_quantizer = False, x_data_format = , y_data_format = )
{Out=['tanh_0.tmp_0']} = tanh(inputs={X=['linear_0.tmp_1']}, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_cudnn = False, use_mkldnn = False)
{Out=['linear_1.tmp_0']} = matmul(inputs={X=['tanh_0.tmp_0'], Y=['linear_21.w_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, alpha = 1.0, force_fp32_output = False, fused_reshape_Out = [], fused_reshape_X = [], fused_reshape_Y = [], fused_transpose_Out = [], fused_transpose_X = [], fused_transpose_Y = [], head_number = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], transpose_X = False, transpose_Y = False, use_mkldnn = False, use_quantizer = False)
{Out=['linear_1.tmp_1']} = elementwise_add(inputs={X=['linear_1.tmp_0'], Y=['linear_21.b_0']}, Scale_out = 1.0, Scale_x = 1.0, Scale_y = 1.0, axis = 1, mkldnn_data_type = float32, op_device = , op_namescope = /, op_role = 0, op_role_var = [], use_mkldnn = False, use_quantizer = False, x_data_format = , y_data_format = )

}

[var x : LOD_TENSOR.shape(-1, -1).dtype(int64).stop_gradient(False), var seq_len : LOD_TENSOR.shape(-1,).dtype(int64).stop_gradient(False)]
embedding_3.w_0
<bound method PyCapsule.value of Parameter containing:
Tensor(shape=[29496, 300], dtype=float32, place=CPUPlace, stop_gradient=False,
[[ 0. , 0. , 0. , …, 0. , 0. , 0. ],
[ 0.11659081, -0.04384097, -0.04026616, …, -0.05641207, -0.23784846, -0.09599022],
[-0.01071034, -0.00254965, 0.01612471, …, -0.00325200, -0.02263010, -0.00263259],
…,
[-0.00839648, 0.00858942, 0.00404084, …, -0.01468238, -0.00842635, 0.01307028],
[ 0.01002004, -0.00883020, 0.00299168, …, -0.00572370, 0.01096750, -0.00613657],
[-0.02629061, 0.04087983, -0.06212689, …, -0.03645510, 0.02588729, -0.01578910]])>

paddle.jit.save serializes and saves the static-graph model (network structure and parameter weights).

Network structure: a file with the .pdmodel extension, which can be visualized with VisualDL.
Parameter weights: a file with the .pdiparams extension.


import os

# Save the converted model, producing the files infer_model/model.pdmodel and infer_model/model.pdiparams
paddle.jit.save(model, "infer_model/model")
os.listdir("infer_model/")
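
As an optional sanity check (not part of the original walkthrough), the exported model can be loaded back with paddle.jit.load and called like an ordinary Layer. This is a minimal sketch; the token ids and length below are made-up placeholders rather than real tokenized input.

import numpy as np
import paddle

# Load the exported static-graph model as a callable Layer
loaded = paddle.jit.load("infer_model/model")
loaded.eval()

# Placeholder inputs: token ids of shape [batch_size, max_seq_len] and the real lengths
input_ids = paddle.to_tensor(np.array([[1, 2, 3, 4, 0, 0]], dtype="int64"))
seq_lens = paddle.to_tensor(np.array([4], dtype="int64"))

print(loaded(input_ids, seq_lens))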

Inference with the Paddle Inference Library

Once we have the static-graph model, we use Paddle Inference for deployment. Paddle Inference is PaddlePaddle's native inference library, targeted at servers and the cloud, and provides high-performance inference.

Paddle Inference performs prediction through a Predictor. The Predictor is a high-performance inference engine: by analyzing the computation graph it applies a series of optimizations (operator fusion, memory/GPU-memory optimization, support for low-level acceleration libraries such as MKL-DNN and TensorRT), which can greatly improve inference performance. Paddle Inference also provides APIs in multiple languages, including Python, C++, and Go, so you can choose whichever fits your environment; for ease of demonstration we use the Python API here, which is bundled with the installed Paddle package and can be used directly. Developing a Python inference program with Paddle Inference takes only the following steps:

import paddle.inference as paddle_infer

# 1. Create the config object and set the paths of the inference model
config = paddle_infer.Config("infer_model/model.pdmodel", "infer_model/model.pdiparams")
# To run on GPU instead: reserve 100 MB of initial GPU memory on device 0
# config.enable_use_gpu(100, 0)
config.disable_gpu()
# 2. Create the inference engine (Predictor) from the config
predictor = paddle_infer.create_predictor(config)
# 3. Set the input data
# Get the input handles
input_handles = [
    predictor.get_input_handle(name)
    for name in predictor.get_input_names()
]
# Prepare a batch of input data
data = dev_batchify_fn([dev_ds[0]])
# Copy the inputs into the input handles
for input_field, input_handle in zip(data, input_handles):
    input_handle.copy_from_cpu(input_field)

# 4. Run inference
predictor.run()

# 5. Fetch the prediction results
# Get the output handles
output_handles = [
    predictor.get_output_handle(name)
    for name in predictor.get_output_names()
]
# Copy the results out of the output handles
output = [output_handle.copy_to_cpu() for output_handle in output_handles]
# Print the prediction results
print(output)

# For comparison, print the result of running the dynamic-graph model directly
print(student(*data[:-1]).numpy())

# Compare inference speed: Predictor vs. the dynamic-graph model
import time
start_time = time.time()
for i in range(100):
    for input_field, input_handle in zip(data, input_handles):
        input_handle.copy_from_cpu(input_field)
    predictor.run()
    output = [output_handle.copy_to_cpu() for output_handle in output_handles]
print("Predictor inference time: ", time.time() - start_time)

start_time = time.time()
for i in range(100):
    output = student(*data[:-1]).numpy()
print("Dygraph model inference time: ", time.time() - start_time)
[array([[-1.6722751,  1.4945102]], dtype=float32)]
[[-1.6722751  1.4945102]]
Predictor inference time:  1.464249610900879
Dygraph model inference time:  1.6330127716064453
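
Beyond the basic CPU configuration used above, the Config object exposes switches for the engine-level optimizations mentioned earlier (graph passes such as operator fusion, memory reuse, and the MKL-DNN backend on CPU). The following is only a hedged sketch; the exact set of Config methods can vary between Paddle releases, so check the API reference of your installed version.

import paddle.inference as paddle_infer

# Sketch: optional engine-level optimizations on top of the basic CPU config
config = paddle_infer.Config("infer_model/model.pdmodel", "infer_model/model.pdiparams")
config.disable_gpu()
config.switch_ir_optim(True)                # graph-level passes, e.g. operator fusion
config.enable_memory_optim()                # memory reuse between tensors
config.enable_mkldnn()                      # oneDNN (MKL-DNN) kernels on CPU
config.set_cpu_math_library_num_threads(4)  # CPU math library threads
predictor = paddle_infer.create_predictor(config)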