tensorrt官方库:https://github.com/NVIDIA/TensorRT, git clone一下即可
1. onnx转tensorRT
首先放一张对比图:
使用官方的tensorrt包,编译出trtexec文件,按照下面的语句执行:
trtexec --onnx=onnx-modifier/result.onnx --batch=1 --saveEngine=onnx-modifier/result.trt --workspace=8196
即可获得tensorRT的trt模型。
也可以用python来进行转换:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
import numpy as np
# Build a TensorRT engine from an ONNX file and serialize it to disk.
# NOTE(review): max_batch_size / max_workspace_size / fp16_mode /
# build_cuda_engine are the older builder attributes used by the original
# snippet; confirm they exist in the installed TensorRT version.
batch_size = 1
onnx_file_path = "....onnx"
engine_file_path = "model_int8.trt"

G_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(G_LOGGER)
# Named form of the explicit-batch flag (the original passed the literal 1).
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, G_LOGGER)

builder.max_batch_size = batch_size
builder.max_workspace_size = 48263040
builder.fp16_mode = True
# For INT8, enable the two lines below; MyCalibrator must be defined first.
#builder.int8_mode = True
#builder.int8_calibrator = MyCalibrator()

with open(onnx_file_path, 'rb') as model:
    # parse() reports failure through its return value, so check it instead
    # of continuing with a partially populated network.
    if not parser.parse(model.read()):
        for error_idx in range(parser.num_errors):
            print(parser.get_error(error_idx))
        raise RuntimeError(
            "Failed to parse ONNX file: {}".format(onnx_file_path))

engine = builder.build_cuda_engine(network)
# A failed build yields None; stop here rather than failing on serialize().
if engine is None:
    raise RuntimeError("Failed to build the TensorRT engine")
print("Created engine success! ")

with open(engine_file_path, "wb") as f:
    f.write(engine.serialize())
print('Engine file has already saved to {}!'.format(engine_file_path))
2. 使用onnxruntime进行推理
import onnxruntime as rt
import numpy as np
import cv2
# Run a single image through the ONNX model with onnxruntime.
sess = rt.InferenceSession("/....onnx")
first_input = sess.get_inputs()[0]
input_name = first_input.name

# cv2.resize takes (width, height); the resized image is still HWC.
img = cv2.imread("....png")
img = cv2.resize(img, (672, 384))
img = img.astype(np.float32)

# HWC -> CHW, then prepend a batch axis of size 1.
chw = np.transpose(img, (2, 0, 1))
X = np.ascontiguousarray(chw[np.newaxis, ...])

pred_onnx = sess.run(None, {input_name: X})
3. 不转模型,使用tensorRT推理
文档参考:https://github.com/onnx/onnx-tensorrt
代码如下:
import onnx
import onnx_tensorrt.backend as backend
import numpy as np
# Run an ONNX model through the onnx-tensorrt backend, without first
# exporting a serialized engine file.
model = onnx.load("/path/to/model.onnx")
engine = backend.prepare(model, device='CUDA:1')

# Random float32 batch used as dummy input.
input_data = np.random.random(size=(32, 3, 224, 224)).astype(np.float32)

outputs = engine.run(input_data)
output_data = outputs[0]
for item in (output_data, output_data.shape):
    print(item)
4. 使用tensorRT进行推理
import tensorrt as trt
import os
import pycuda.driver as cuda
import cv2
import numpy as np
import pycuda.autoinit
class TensorRTInference(object):
    """Run inference with a serialized TensorRT engine file.

    Args:
        engine_file_path: path of the serialized engine file to load.
        input_shape: stored on ``self.shape``; no method of this class
            reads it.
    """

    def __init__(self, engine_file_path, input_shape):
        self.engine_file_path = engine_file_path
        self.shape = input_shape
        self.engine = self.load_engine()

    def load_engine(self):
        """Deserialize and return the engine at ``self.engine_file_path``."""
        assert os.path.exists(self.engine_file_path), \
            "engine file not found: {}".format(self.engine_file_path)
        with open(self.engine_file_path, 'rb') as f, \
                trt.Runtime(trt.Logger()) as runtime:
            engine_data = f.read()
            engine = runtime.deserialize_cuda_engine(engine_data)
        return engine

    def infer_once(self, img):
        """Run one forward pass and return the host output buffer.

        Args:
            img: array shaped (c, h, w) or (b, c, h, w); a 3-D input is
                treated as a batch of one.

        Returns:
            The flat page-locked host array holding the output. If the
            engine has several output bindings, only the last one iterated
            is copied back and returned.

        Raises:
            ValueError: if ``img`` is neither 3-D nor 4-D.
        """
        engine = self.engine
        if len(img.shape) == 4:
            b, c, h, w = img.shape
        elif len(img.shape) == 3:
            c, h, w = img.shape
            b = 1
        else:
            raise ValueError(
                "img must be 3-D (c, h, w) or 4-D (b, c, h, w), got shape "
                "{}".format(img.shape))

        with engine.create_execution_context() as context:
            # The input binding is looked up by the fixed name 'input'.
            context.set_binding_shape(
                engine.get_binding_index('input'), (b, c, h, w))
            bindings = []
            for binding in engine:
                binding_idx = engine.get_binding_index(binding)
                size = trt.volume(context.get_binding_shape(binding_idx))
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                if engine.binding_is_input(binding):
                    input_buffer = np.ascontiguousarray(
                        img, dtype).astype(np.float32)
                    # Size the device buffer from what is actually copied,
                    # not from the caller's array (which may be another dtype).
                    input_memory = cuda.mem_alloc(input_buffer.nbytes)
                    bindings.append(int(input_memory))
                else:
                    output_buffer = cuda.pagelocked_empty(size, dtype)
                    # Device-side buffer for this output; the original never
                    # allocated it and failed with NameError.
                    output_memory = cuda.mem_alloc(output_buffer.nbytes)
                    bindings.append(int(output_memory))

            stream = cuda.Stream()
            cuda.memcpy_htod_async(input_memory, input_buffer, stream)
            # NOTE(review): engines built with an explicit batch dimension may
            # need execute_async_v2 instead; confirm for the TensorRT version.
            context.execute_async(
                bindings=bindings, stream_handle=stream.handle)
            cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
            # Wait for the queued copies and execution before reading results.
            stream.synchronize()
        return output_buffer
# Load one image, convert it to CHW float32 and run it through the engine.
INPUT_SHAPE = (224, 224)
engine_file_path = '***.trt'
img_path = 's1.png'

img = cv2.imread(img_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError("cannot read image: {}".format(img_path))
img = cv2.resize(img, INPUT_SHAPE)  # hwc
img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # chw

trt_infer = TensorRTInference(engine_file_path, INPUT_SHAPE)
# The constructor already deserialized the engine; reuse it instead of
# loading the file a second time.
engine = trt_infer.engine
output = trt_infer.infer_once(img)
5. int8量化
首先给出某个模型的对比结果:
onnxruntime-gpu:163ms
tensorrt,float16:45.8ms
tensorrt,int8:34.3ms
参考这篇:https://github.com/qq995431104/Pytorch2TensorRT
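int8量化和第1节的操作方法很类似:把第1节脚本中注释掉的 builder.int8_mode = True 和 builder.int8_calibrator = MyCalibrator() 两行打开即可,但需要额外提供一个校准器(calibrator),由它读取校准图片并生成校准缓存(cache)文件。这里使用python来实现: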
import os

import cv2
import numpy as np
import pycuda.autoinit  # imported for its side effect of initializing CUDA
import pycuda.driver as cuda
import tensorrt as trt
class MyCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator fed from a text file of image paths.

    Args:
        files_path: text file with one calibration image path per line.
        batch_size: number of images per calibration batch.
        channel: channel count of one batch element.
        height: image height after resizing.
        width: image width after resizing.
        cache_file: path where the calibration cache is read and written.
    """

    def __init__(self, files_path='calib.csv', batch_size=1, channel=3,
                 height=384, width=672, cache_file='MyNet.cache'):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.Channel = channel
        self.Height = height
        self.Width = width

        # Read the list once and close the file right away.
        with open(files_path, 'r') as txt_file:
            self._lines = txt_file.readlines()
        np.random.shuffle(self._lines)
        # Drop blank lines and surrounding whitespace so they do not turn
        # into unreadable image paths.
        self.imgs = [line.strip() for line in self._lines if line.strip()]

        self.batch_idx = 0
        # Images that do not fill a whole batch are left out.
        self.max_batch_idx = len(self.imgs) // self.batch_size
        self.data_size = trt.volume(
            [self.batch_size, self.Channel, self.Height, self.Width]
        ) * trt.float32.itemsize
        self.device_input = cuda.mem_alloc(self.data_size)

    def next_batch(self):
        """Return the next batch as a float32 NCHW array, or an empty array.

        Raises:
            IOError: if an image in the batch cannot be read.
        """
        if self.batch_idx >= self.max_batch_idx:
            return np.array([])

        start = self.batch_idx * self.batch_size
        batch_files = self.imgs[start:start + self.batch_size]
        batch_imgs = np.zeros(
            (self.batch_size, self.Channel, self.Height, self.Width),
            dtype=np.float32)
        for i, f in enumerate(batch_files):
            print(f)
            img = cv2.imread(f)
            # cv2.imread returns None for an unreadable file; name the file
            # instead of failing later inside cv2.resize.
            if img is None:
                raise IOError(
                    "cannot read calibration image: {}".format(f))
            img = cv2.resize(img, (self.Width, self.Height))
            batch_imgs[i] = np.transpose(img, (2, 0, 1))  # hwc -> chw
        self.batch_idx += 1
        print("batch:[{}/{}]".format(self.batch_idx, self.max_batch_idx))
        return np.ascontiguousarray(batch_imgs)

    def get_batch_size(self):
        """Return the calibration batch size."""
        return self.batch_size

    def get_batch(self, names, p_str=None):
        """Copy the next batch to the device and return its pointer list.

        Returns None when no batches remain, when the batch has an
        unexpected size, or when loading the batch fails.
        """
        expected_size = (
            self.batch_size * self.Channel * self.Height * self.Width)
        try:
            batch_imgs = self.next_batch()
            if batch_imgs.size == 0 or batch_imgs.size != expected_size:
                return None
            cuda.memcpy_htod(
                self.device_input,
                np.ascontiguousarray(batch_imgs, dtype=np.float32))
            return [int(self.device_input)]
        except Exception as err:
            # Still best-effort (returns None), but report why the batch
            # failed instead of swallowing the error silently.
            print("calibration batch failed: {}".format(err))
            return None

    def read_calibration_cache(self):
        """Return the cached calibration data, or None if no cache exists."""
        # If there is a cache, use it instead of calibrating again.
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache to ``self.cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
calib.csv中是用于校准的图片地址集合(每行一个图片路径),一般1000张左右。生成的cache文件大致长这样:
保存的trt文件可以直接用第4节的代码进行推理。