深度学习系列19：tensorRT基础

最新推荐文章于 2024-04-29 15:04:52 发布

IE06

最新推荐文章于 2024-04-29 15:04:52 发布

阅读量586

点赞数

分类专栏：深度学习系列

本文链接：https://blog.csdn.net/kittyzc/article/details/117108211

版权

深度学习系列专栏收录该内容

70 篇文章 155 订阅

订阅专栏

tensorrt官方库：https://github.com/NVIDIA/TensorRT, git clone一下即可

1. onnx转tensorRT

首先放一张对比图：
在这里插入图片描述
使用官方的tensorrt包，编译出trtexec文件，按照下面的语句执行：

trtexec --onnx=onnx-modifier/result.onnx --batch=1 --saveEngine=onnx-modifier/result.trt --workspace=8196

即可获得tensorRT的trt模型。
也可以用python来进行转换：

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

import os
import numpy as np

# Convert an ONNX model into a serialized TensorRT engine file.
batch_size=1
onnx_file_path="....onnx"
# NOTE: the file name says "int8", but the mode enabled below is FP16.
engine_file_path="model_int8.trt"
G_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(G_LOGGER)
# Same value as the literal 1 used before, spelled out: explicit-batch network.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, G_LOGGER)
builder.max_batch_size = batch_size
builder.max_workspace_size = 48263040
builder.fp16_mode = True
# To build an INT8 engine instead, enable the two lines below and provide
# a calibrator class named MyCalibrator.
#builder.int8_mode = True
#builder.int8_calibrator = MyCalibrator()
with open(onnx_file_path, 'rb') as model:
    parsed = parser.parse(model.read())
if not parsed:
    # Surface the parser diagnostics instead of failing later on a None engine.
    for error_idx in range(parser.num_errors):
        print(parser.get_error(error_idx))
    raise RuntimeError("Failed to parse ONNX file: {}".format(onnx_file_path))
engine = builder.build_cuda_engine(network)
if engine is None:
    raise RuntimeError("Failed to build TensorRT engine from: {}".format(onnx_file_path))
print("Created engine success! ")
with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())
print('Engine file has already saved to {}!'.format(engine_file_path))

2. 使用onnxruntime进行推理

import onnxruntime as rt
import numpy as np
import cv2
# Open an ONNX Runtime session and look up the name of its first input.
sess = rt.InferenceSession("/....onnx")
first_input = sess.get_inputs()[0]
input_name = first_input.name

# Read the image, resize it with dsize (672, 384) and convert it to float32.
raw_img = cv2.imread("....png")
img = cv2.resize(raw_img, (672, 384)).astype(np.float32)

# Move the channel axis first (HWC -> CHW) and wrap it in a batch of one.
chw_img = np.transpose(img, (2, 0, 1))
X = np.array([chw_img])

# Passing None as the output list requests every model output.
feeds = {input_name: X}
pred_onnx = sess.run(None, feeds)

3. 不转模型，使用tensorRT推理

文档参考：https://github.com/onnx/onnx-tensorrt
代码如下：

import onnx
import onnx_tensorrt.backend as backend
import numpy as np

# Load the ONNX graph and let the onnx-tensorrt backend prepare it on device CUDA:1.
model = onnx.load("/path/to/model.onnx")
engine = backend.prepare(model, device='CUDA:1')

# Random float32 batch shaped (32, 3, 224, 224) used as dummy input.
input_shape = (32, 3, 224, 224)
input_data = np.random.random(size=input_shape).astype(np.float32)

# Run the backend and keep the first returned output.
all_outputs = engine.run(input_data)
output_data = all_outputs[0]
print(output_data)
print(output_data.shape)

4. 使用tensorRT进行推理

import tensorrt as trt
import os
import pycuda.driver as cuda
import cv2
import numpy as np
import pycuda.autoinit
class TensorRTInference(object):
    """Run inference with a serialized TensorRT engine through PyCUDA.

    The engine file is deserialized once in the constructor and kept on
    ``self.engine``. Each ``infer_once`` call creates its own execution
    context, device buffers and CUDA stream.
    """

    def __init__(self, engine_file_path, input_shape):
        """Remember the settings and load the engine.

        Args:
            engine_file_path: Path of the serialized engine file.
            input_shape: Stored on ``self.shape``; the methods of this class
                do not read it.
        """
        self.engine_file_path = engine_file_path
        self.shape = input_shape
        self.engine = self.load_engine()

    def load_engine(self):
        """Deserialize and return the engine stored at ``self.engine_file_path``."""
        assert os.path.exists(self.engine_file_path), \
            "engine file not found: {}".format(self.engine_file_path)
        with open(self.engine_file_path, 'rb') as f, trt.Runtime(trt.Logger()) as runtime:
            engine_data = f.read()
            return runtime.deserialize_cuda_engine(engine_data)

    def infer_once(self, img):
        """Run one forward pass and return the output host buffer.

        The engine must expose an input binding named ``'input'``. Only one
        input buffer and one output buffer are tracked: if the engine has
        several outputs, only the last one iterated is copied back.

        Args:
            img: Array shaped (c, h, w) or (b, c, h, w). A 3-D array is
                treated as a batch of one.

        Returns:
            The host array allocated with ``cuda.pagelocked_empty(size, dtype)``
            for the output binding, i.e. the output in flattened form.

        Raises:
            ValueError: If ``img`` is neither 3- nor 4-dimensional.
        """
        engine = self.engine
        if len(img.shape) == 4:
            b, c, h, w = img.shape
        elif len(img.shape) == 3:
            c, h, w = img.shape
            b = 1
        else:
            raise ValueError(
                "img must have shape (c, h, w) or (b, c, h, w), got {}".format(img.shape))
        with engine.create_execution_context() as context:
            context.set_binding_shape(engine.get_binding_index('input'), (b, c, h, w))
            bindings = []
            input_buffer = input_memory = None
            output_buffer = output_memory = None
            for binding in engine:
                binding_idx = engine.get_binding_index(binding)
                size = trt.volume(context.get_binding_shape(binding_idx))
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                if engine.binding_is_input(binding):
                    # Host copy in the dtype the binding reports; the device
                    # allocation is sized from this buffer, not from `img`.
                    input_buffer = np.ascontiguousarray(img, dtype=dtype)
                    input_memory = cuda.mem_alloc(input_buffer.nbytes)
                    bindings.append(int(input_memory))
                else:
                    output_buffer = cuda.pagelocked_empty(size, dtype)
                    # The device-side output buffer was never allocated before,
                    # which made every call fail with NameError.
                    output_memory = cuda.mem_alloc(output_buffer.nbytes)
                    bindings.append(int(output_memory))
            stream = cuda.Stream()
            cuda.memcpy_htod_async(input_memory, input_buffer, stream)
            # NOTE(review): for explicit-batch engines the TensorRT docs point to
            # execute_async_v2 -- confirm against the installed TensorRT version.
            context.execute_async(bindings=bindings, stream_handle=stream.handle)
            cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
            # Wait for the copies and the inference queued on the stream.
            stream.synchronize()
        return output_buffer
# Demo: load one image, convert it to CHW float32 and run it through the engine.
INPUT_SHAPE = (224, 224)
engine_file_path = '***.trt'
img_path = 's1.png'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("cannot read image: {}".format(img_path))
img = cv2.resize(img, INPUT_SHAPE) # hwc
img = np.transpose(img, (2,0,1)).astype(np.float32) # chw
# The constructor already loads the engine, so it is not loaded a second time here.
trt_infer = TensorRTInference(engine_file_path, INPUT_SHAPE)
output = trt_infer.infer_once(img)

5. int8量化

首先给出某个模型的对比结果：
onnxruntime-gpu：163ms
tensorrt，float16：45.8ms
tensorrt，int8：34.3ms

参考这篇：https://github.com/qq995431104/Pytorch2TensorRT
int8量化和第一节的操作方法很类似，但是需要一个额外的校准文件。这里使用python来生成：

import os

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
class MyCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator fed from a text file of image paths.

    The list file holds one image path per line. The paths are shuffled once
    and served in batches; each image is resized to (Width, Height) and
    transposed from HWC to CHW before being copied to the device.
    """

    def __init__(self, files_path='calib.csv', batch_size=1, channel=3,
                 height=384, width=672, cache_file='MyNet.cache'):
        """Read the image list and allocate the device input buffer.

        Args:
            files_path: Text file with one image path per line.
            batch_size: Number of images per calibration batch.
            channel: Number of image channels.
            height: Image height after resizing.
            width: Image width after resizing.
            cache_file: Path used to read/write the calibration cache.
        """
        trt.IInt8EntropyCalibrator2.__init__(self)

        self.cache_file = cache_file
        self.batch_size = batch_size
        self.Channel = channel
        self.Height = height
        self.Width = width
        # Read everything up front so the file handle is closed immediately.
        with open(files_path, 'r') as txt_file:
            self._lines = txt_file.readlines()
        np.random.shuffle(self._lines)
        # strip() also drops '\r' from Windows line endings; blank lines are skipped.
        self.imgs = [line.strip() for line in self._lines if line.strip()]

        self.batch_idx = 0
        # A trailing partial batch is never served.
        self.max_batch_idx = len(self.imgs)//self.batch_size
        self.data_size = trt.volume([self.batch_size, self.Channel,self.Height, self.Width]) * trt.float32.itemsize
        self.device_input = cuda.mem_alloc(self.data_size)

    def next_batch(self):
        """Return the next batch as a contiguous float32 (N, C, H, W) array.

        Returns an empty array once all full batches have been served.

        Raises:
            ValueError: If an image in the batch cannot be read.
        """
        if self.batch_idx >= self.max_batch_idx:
            return np.array([])
        start = self.batch_idx * self.batch_size
        batch_files = self.imgs[start:start + self.batch_size]
        batch_imgs = np.zeros((self.batch_size, self.Channel, self.Height, self.Width),
                              dtype=np.float32)
        for i, f in enumerate(batch_files):
            print(f)
            img = cv2.imread(f)
            if img is None:
                # cv2.imread returns None instead of raising on a bad path.
                raise ValueError("cannot read calibration image: {}".format(f))
            img = cv2.resize(img, (self.Width, self.Height))
            batch_imgs[i] = np.transpose(img, (2,0,1))
        self.batch_idx += 1
        print("batch:[{}/{}]".format(self.batch_idx, self.max_batch_idx))
        return np.ascontiguousarray(batch_imgs)

    def get_batch_size(self):
        """Return the calibration batch size."""
        return self.batch_size

    def get_batch(self, names, p_str=None):
        """Copy the next batch to the device and return its pointer in a list.

        Returns None when no full batch is left or when preparing the batch
        fails; the failure is printed so it is not silently hidden.
        """
        try:
            batch_imgs = self.next_batch()
            expected_size = self.batch_size*self.Channel*self.Height*self.Width
            if batch_imgs.size == 0 or batch_imgs.size != expected_size:
                return None
            cuda.memcpy_htod(self.device_input, batch_imgs.astype(np.float32))
            return [int(self.device_input)]
        except Exception as err:
            # Best effort: keep returning None, but report why the batch failed.
            print("calibration batch failed, stopping calibration: {}".format(err))
            return None

    def read_calibration_cache(self):
        """Return the cached calibration data, or None when no cache file exists."""
        # If there is a cache, use it instead of calibrating again.
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache bytes to ``self.cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

calib.csv中是用于校准的图片路径列表（每行一个图片路径），一般1000张左右。生成的cache文件大致长这样：
在这里插入图片描述
保存的trt文件可以直接用第4节的代码进行推理。

IE06

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
深度学习系列19：tensorRT基础

tensorrt官方库：https://github.com/NVIDIA/TensorRT, git clone一下即可1. onnx转tensorRT首先放一张对比图：使用官方的tensorrt包，找到里面的trtexec文件，按照下面的语句执行：trtexec --onnx=onnx-modifier/result.onnx --batch=1 --saveEngine=onnx-modifier/result.trt --workspace=8196即可获得tensorRT的trt模型
复制链接

扫一扫

专栏目录