[NLP Acceleration] Accelerating BERT Inference with TensorRT: Implementation and Notes

Accelerating BERT inference with TensorRT

Note: this article assumes you have already installed CUDA, cuDNN and TensorRT on your server or personal machine; installation is not covered here.



Introduction

This came out of a need at work: BERT inference on the server side was slow, especially on long texts, and could not meet users' real-time requirements, so I spent some time looking into accelerating BERT inference on the GPU. Most of the TensorRT material out there comes from the CV community; articles on the NLP side are fairly rare. Having worked through it, I am writing down my notes here.


Note: my environment is Windows 10 with CUDA 10.2 and a GTX-series GPU, and the model is implemented in PyTorch. The workflow is therefore: first export the model to ONNX, then use TensorRT's trtexec.exe tool to convert the ONNX model into a TensorRT engine, and finally call the TensorRT Python API for inference.

1. Converting the PyTorch model to ONNX

# coding:utf-8
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch


class MyModel(nn.Module):

    def __init__(self, pretrain_path):
        super(MyModel, self).__init__()
        self.bert = BertModel.from_pretrained(pretrain_path)
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
            outputs = outputs[1]
            return nn.Softmax(dim=-1)(self.classifier(outputs)).argmax(dim=-1)


pretrain_path = 'bert-ext-chinese'
pt_model_path = 'cls.pt'
onnx_model_path = 'cls.onnx'
trt_model_path = 'cls.trt'

x = '我们组老哥加班太猛了'
# , '前天看的电影还挺好看的。'

tokenizer = BertTokenizer.from_pretrained(pretrain_path)
inputs = tokenizer.encode_plus(x, return_tensors='pt', add_special_tokens=True, padding='max_length',
                               max_length=20,
                               truncation=True)
input_ids, attention_mask, token_type_ids = inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids']

# save to pt model
model = MyModel(pretrain_path)
output = model(input_ids, attention_mask, token_type_ids)
torch.save(model.state_dict(), pt_model_path)

# convert to onnx

torch.onnx.export(model,  # model being run
                  (input_ids, attention_mask, token_type_ids),  # model input (or a tuple for multiple inputs)
                  onnx_model_path,  # where to save the model (can be a file or file-like object)
                  export_params=True,  # store the trained parameter weights inside the model file
                  opset_version=10,  # the ONNX version to export the model to
                  do_constant_folding=True,  # whether to execute constant folding for optimization
                  input_names=['input_ids','attention_mask','token_type_ids'],  # the model's input names
                  output_names=['output'],  # the model's output names
                  dynamic_axes={'input_ids': {0: 'batch_size', 1: 'seq_len'},  # variable length axes
                                'attention_mask': {0: 'batch_size', 1: 'seq_len'},  # variable length axes
                                'token_type_ids': {0: 'batch_size', 1: 'seq_len'},  # variable length axes
                                'output': {0: 'batch_size'}})

# validate the exported ONNX model
import onnx

onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

# inference with ONNX Runtime
import onnxruntime

ort_session = onnxruntime.InferenceSession(onnx_model_path)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# compute the ONNX Runtime outputs
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(input_ids),
              ort_session.get_inputs()[1].name: to_numpy(attention_mask),
              ort_session.get_inputs()[2].name: to_numpy(token_type_ids)}
ort_outs = ort_session.run(None, ort_inputs)
print(ort_outs)
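
As a quick sanity check (my own addition, not from the original script), the ONNX Runtime prediction can be compared against the PyTorch output computed above; since the exported graph ends in an argmax, the two label arrays should match exactly:

import numpy as np

# The exported graph ends in argmax, so both frameworks should return the same label.
np.testing.assert_array_equal(to_numpy(output), ort_outs[0])
print("PyTorch and ONNX Runtime predictions match:", ort_outs[0])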

2. Converting the exported ONNX model into a TensorRT engine

Use the trtexec.exe tool to do the conversion.
--onnx is the original ONNX model.
--saveEngine is the path where the converted engine is saved.
--minShapes, --optShapes and --maxShapes are the important ones: they define the size range of the dynamic inputs, and getting them wrong will make inference fail. The shapes are batch_size x seq_len and must match the dynamic_axes declared in torch.onnx.export.
The command below is written for the Windows cmd shell.

trtexec ^
--onnx=cls.onnx ^
--saveEngine=cls.engine ^
--workspace=10000 ^
--minShapes=input_ids:1x1,attention_mask:1x1,token_type_ids:1x1 ^
--optShapes=input_ids:8x20,attention_mask:8x20,token_type_ids:8x20 ^
--maxShapes=input_ids:8x20,attention_mask:8x20,token_type_ids:8x20
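
If you prefer to stay inside Python, the same engine can also be built with the TensorRT builder API. The sketch below is my own (assuming TensorRT 8.x; the helper name build_engine_from_onnx is not from the original post) and mirrors the workspace and shape options of the trtexec command above.

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def build_engine_from_onnx(onnx_path, engine_path):
    builder = trt.Builder(TRT_LOGGER)
    # Explicit-batch network, required for ONNX models with dynamic shapes.
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse " + onnx_path)

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # plays the same role as --workspace

    # One optimization profile covering the three dynamic inputs,
    # mirroring --minShapes / --optShapes / --maxShapes.
    profile = builder.create_optimization_profile()
    for name in ("input_ids", "attention_mask", "token_type_ids"):
        profile.set_shape(name, (1, 1), (8, 20), (8, 20))
    config.add_optimization_profile(profile)

    serialized_engine = builder.build_serialized_network(network, config)
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)


build_engine_from_onnx("cls.onnx", "cls.engine")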

3. Running inference through the TensorRT Python API

import numpy as np
from transformers import BertTokenizer
import tensorrt as trt
import trtutils as trtu

"""
a、获取 engine,建立上下文
"""
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def get_engine(engine_file_path):
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        return engine


engine_model_path = "cls.engine"
# Deserialize the TensorRT engine.
engine = get_engine(engine_model_path)
# Contexts are used to perform inference.
context = engine.create_execution_context()


"""
b、从engine中获取inputs, outputs, bindings, stream 的格式以及分配缓存
"""


def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


x = '我们组老哥加班太猛了'
# , '前天看的电影还挺好看的。'

pretrain_path = 'bert-ext-chinese'
tokenizer = BertTokenizer.from_pretrained(pretrain_path)
inputs = tokenizer.encode_plus(x, return_tensors='pt', add_special_tokens=True, padding='max_length',
                               max_length=20,
                               truncation=True)

# Whether the engine was built with FP16 (e.g. trtexec --fp16); this must match how the engine
# was exported. It only changes the precision inside the engine, so it matters for float
# inputs/outputs, not for the integer token inputs below.
USE_FP16 = False
target_dtype = np.float16 if USE_FP16 else np.float32

# The engine built from this ONNX export has INT32 input bindings (TensorRT casts the
# ONNX INT64 token inputs down to INT32), so cast the inputs to np.int32, not to a float type.
input_ids = to_numpy(inputs['input_ids']).astype(np.int32)
attention_mask = to_numpy(inputs['attention_mask']).astype(np.int32)
token_type_ids = to_numpy(inputs['token_type_ids']).astype(np.int32)

context.active_optimization_profile = 0
origin_inputshape = context.get_binding_shape(0)  # (1,-1)
origin_inputshape[0], origin_inputshape[1] = input_ids.shape  # (batch_size, max_sequence_length)
context.set_binding_shape(0, (origin_inputshape))
context.set_binding_shape(1, (origin_inputshape))
context.set_binding_shape(2, (origin_inputshape))
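
# (A small addition of my own, not in the original post) Print the binding names, dtypes
# and resolved shapes to confirm what the engine expects; the token inputs of this engine
# are INT32, which is why the arrays above are cast to np.int32.
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i), engine.get_binding_dtype(i),
          context.get_binding_shape(i),
          "input" if engine.binding_is_input(i) else "output")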

"""
c、输入数据填充
"""
inputs, outputs, bindings, stream = trtu.allocate_buffers_v2(engine, context)
inputs[0].host = input_ids
inputs[1].host = attention_mask
inputs[2].host = token_type_ids

"""
d、tensorrt推理
"""
trt_outputs = trtu.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
# The exported graph already ends in an argmax, so the engine output is the predicted label itself.
preds = trt_outputs[0]
print("====preds====:", preds)

The trtutils code (the helper module imported above; save it as trtutils.py next to the inference script):


import argparse
import os
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

try:
    # Sometimes python does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)


def GiB(val):
    return val * 1 << 30


def add_help(description):
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args, _ = parser.parse_known_args()


def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
    '''
    Parses sample arguments.
    Args:
        description (str): Description of the sample.
        subfolder (str): The subfolder containing data relevant to this sample
        find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
    Returns:
        str: Path of data directory.
    '''

    # Standard command-line arguments for all samples.
    kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--datadir",
                        help="Location of the TensorRT sample data directory, and any additional data directories.",
                        action="append", default=[kDEFAULT_DATA_ROOT])
    args, _ = parser.parse_known_args()

    def get_data_path(data_dir):
        # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
        data_path = os.path.join(data_dir, subfolder)
        if not os.path.exists(data_path):
            if data_dir != kDEFAULT_DATA_ROOT:
                print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
            data_path = data_dir
        # Make sure data directory exists.
        if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
                data_path))
        return data_path

    data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
    return data_paths, locate_files(data_paths, find_files, err_msg)


def locate_files(data_paths, filenames, err_msg=""):
    """
    Locates the specified files in the specified data directories.
    If a file exists in multiple data directories, the first directory is used.
    Args:
        data_paths (List[str]): The data directories.
        filename (List[str]): The names of the files to find.
    Returns:
        List[str]: The absolute paths of the files.
    Raises:
        FileNotFoundError if a file could not be located.
    """
    found_files = [None] * len(filenames)
    for data_path in data_paths:
        # Find all requested files.
        for index, (found, filename) in enumerate(zip(found_files, filenames)):
            if not found:
                file_path = os.path.abspath(os.path.join(data_path, filename))
                if os.path.exists(file_path):
                    found_files[index] = file_path

    # Check that all files were found
    for f, filename in zip(found_files, filenames):
        if not f or not os.path.exists(f):
            raise FileNotFoundError(
                "Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg))
    return found_files


# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size  # max_batch_size=1
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # nbytes is the total number of bytes consumed by the array's data
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers_v2(engine, context):
    """
    Allocates host and device buffer for TRT engine inference.
    This function is similiar to the one in ../../common.py, but
    converts network outputs (which are np.float32) appropriately
    before writing them to Python buffer. This is needed, since
    TensorRT plugins doesn't support output type description, and
    in our particular case, we use NMS plugin as network output.
    Args:
        engine (trt.ICudaEngine): TensorRT engine
    Returns:
        inputs [HostDeviceMem]: engine input memory
        outputs [HostDeviceMem]: engine output memory
        bindings [int]: buffer to device bindings
        stream (cuda.Stream): cuda stream for engine inference synchronization
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        # binding names here: input_ids, attention_mask, token_type_ids, output
        # context.get_binding_shape(i) returns the resolved shape, e.g. (1, 20)
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))  # numpy dtype matching the binding's TensorRT dtype
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. batch_size = 1
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

Summary

Note: this is only a simple demo, although the topic itself is fairly advanced. The hardest part is really the environment setup; installing TensorRT on Windows 10 is quite painful and I stepped into a few pitfalls, but in the end the whole pipeline runs. The next step is to install TensorRT on the server, which is much easier on Ubuntu. ^-^!

This demo is a binary-classification task based on bert base chinese, which is fairly simple; the model and the task can be rewritten further to run on batches, as sketched below.
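
A rough sketch of the batched version (my own adaptation, not from the original post): only the tokenization step changes, and since the engine above was built with maxShapes=8x20, up to 8 sentences of length 20 fit into a single call.

texts = ['我们组老哥加班太猛了', '前天看的电影还挺好看的。']
batch = tokenizer(texts, return_tensors='np', padding='max_length',
                  max_length=20, truncation=True)
input_ids = batch['input_ids'].astype(np.int32)        # shape (2, 20)
attention_mask = batch['attention_mask'].astype(np.int32)
token_type_ids = batch['token_type_ids'].astype(np.int32)
# then run the same set_binding_shape / allocate_buffers_v2 / do_inference_v2 steps as in section 3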
One more thing: when installing CUDA, cuDNN and TensorRT, the versions must correspond to each other. This is a huge minefield, so be careful!
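
A quick way to double-check what is actually installed (a small sketch of my own) is to print the versions from Python and compare them against the combination listed in NVIDIA's support matrix:

import torch
import tensorrt as trt

print("CUDA seen by PyTorch:", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("TensorRT:", trt.__version__)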
