Deploying a PyTorch Model with TensorRT


   TensorRT is NVIDIA's framework for accelerating model inference. This article documents the process of deploying a PyTorch model with TensorRT.

1. Converting the PyTorch Model to ONNX

There are two routes from a PyTorch model to TensorRT: one first converts the PyTorch .pt/.pth model to ONNX and then converts the ONNX model to TensorRT; the other converts the PyTorch model to TensorRT directly. This article takes the first route, starting with the PyTorch-to-ONNX conversion.

import torch

print(torch.__version__)

# torch  -->  onnx
input_name = ['input']
output_name = ['output']
# Dummy input; its shape (1, 3, 224, 224) is baked into the exported graph
input = torch.randn(1, 3, 224, 224).cuda()
# model = torchvision.models.resnet50(pretrained=True).cuda()
model = torch.load('resnet50.pth', map_location="cuda:0")
model.eval()  # export in inference mode (freezes dropout/batchnorm behavior)
torch.onnx.export(model, input, 'resnet50.onnx', input_names=input_name, output_names=output_name, verbose=True)
# Visualize the model (optional):
# netron.start('resnet50.onnx')

   After the conversion finishes, you can open the exported model in the netron visualization tool to check that the structure is as expected.
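   Beyond eyeballing the graph, the onnx package can also validate it programmatically. A minimal sketch using the ONNX checker (the file name resnet50.onnx comes from the export step above):

import onnx

# Load the exported model and run ONNX's structural checks;
# check_model raises an exception if the graph is malformed.
onnx_model = onnx.load('resnet50.onnx')
onnx.checker.check_model(onnx_model)
print(onnx.helper.printable_graph(onnx_model.graph))  # human-readable graph summary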
   

2. Converting the ONNX Model to a TensorRT Engine

The most convenient approach here is to use the trtexec executable under /bin/ in the TensorRT installation directory to convert the model:

trtexec --onnx=<model.onnx> --saveEngine=<engine.trt>
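For the ResNet50 model exported above, a typical invocation would look like the following (--explicitBatch is required by the TensorRT 7 ONNX parser; adding --fp16 enables half precision if the GPU supports it):

trtexec --onnx=resnet50.onnx --saveEngine=resnet50.trt --explicitBatch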

3. Testing the Forward Pass

import os
import time

import cv2
import numpy as np
import pycuda.autoinit  # creates and activates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt

filename = 'xxx/pics/2/5.jpg'
max_batch_size = 1
onnx_model_path = 'resnet50.onnx'
TRT_LOGGER = trt.Logger()  # This logger is required to build an engine

def softmax(x):
    # Subtract the per-row max before exponentiating for numerical stability
    x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
    # axis=1 because each row holds one sample's logits (use axis=0 for column vectors)
    x_sum = np.sum(x_exp, axis=1, keepdims=True)
    s = x_exp / x_sum
    return s

def get_img_np_nchw(filename):
    """Read an image and preprocess it into a normalized NCHW array."""
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
    image_cv = cv2.resize(image_cv, (224, 224))
    # ImageNet mean/std, matching the normalization used at training time
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])                   # HWC -> CHW
    img_np_nchw = np.expand_dims(img_np_t, axis=0)   # add batch dim -> NCHW
    return img_np_nchw

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """host_mem is page-locked CPU memory; device_mem is the matching GPU buffer."""
        self.host = host_mem
        self.device = device_mem
    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=True):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        # The TensorRT 7 ONNX parser only supports networks created with an
        # explicit batch dimension.
        explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            builder.max_workspace_size = 1 << 30  # 1 GiB of builder scratch space
            builder.max_batch_size = max_batch_size
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # INT8 requires a calibrator; not implemented here
                raise NotImplementedError
            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))

            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    # Surface parser errors instead of failing silently later
                    for i in range(parser.num_errors):
                        print(parser.get_error(i))
                    quit('Failed to parse the ONNX file')
            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            if engine is None:
                quit('Failed to build the engine')
            print("Completed creating Engine")
            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)

def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data from the host (CPU) to the device (GPU).
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference; execute_async_v2 is the call for explicit-batch engines
    # (the batch size is fixed in the engine itself).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs

# Whether FP16/INT8 are usable depends on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = '/home/yinliang/software/TensorRT-7.0.0.11/bin/resnet50.trt'
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)

# Create the context for this engine
context = engine.create_execution_context()
# Allocate page-locked host buffers and device buffers for inputs and outputs
inputs, outputs, bindings, stream = allocate_buffers(engine)

# Do inference
img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)
shape_of_output = (max_batch_size, 2)  # the binary classifier outputs 2 logits per image
# Load data to the buffer
inputs[0].host = img_np_nchw.reshape(-1)

# inputs[1].host = ... for multiple input
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
result = softmax(feat)
score, index = np.max(result, axis=1), np.argmax(result, axis=1)
print(score[0], index[0])
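
To sanity-check the engine, it is worth comparing its output against the original PyTorch model on the same input. A minimal sketch, assuming resnet50.pth from step 1 and the variables above (the tolerance you accept is up to you; FP16 engines need a looser one):

import torch

model = torch.load('resnet50.pth', map_location="cuda:0")
model.eval()
with torch.no_grad():
    torch_out = model(torch.from_numpy(img_np_nchw).cuda()).cpu().numpy()
# Element-wise agreement between the TensorRT and PyTorch logits
print('max abs diff:', np.abs(torch_out - feat).max())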

4. Test Conclusions

Running ResNet50 as a binary classifier over 1000 images: the PyTorch Python API takes 18 s, the PyTorch C++ API takes 15 s, and the TensorRT Python API takes 10 s. Compared with the PyTorch Python API, TensorRT takes only about 0.55x the time.
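For reference, the PyTorch baseline could be timed along these lines; a minimal sketch, assuming the preprocessing helper from section 3 and a hypothetical image_files list of 1000 paths:

import time
import numpy as np
import torch

model = torch.load('resnet50.pth', map_location="cuda:0")
model.eval()
t0 = time.time()
with torch.no_grad():
    for f in image_files:  # hypothetical list of 1000 image paths
        x = torch.from_numpy(get_img_np_nchw(f).astype(np.float32)).cuda()
        out = model(x)
torch.cuda.synchronize()  # wait for queued GPU work before reading the clock
print('PyTorch total: {:.1f}s'.format(time.time() - t0))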
Test environment:
PyTorch 1.2.0
CUDA 10.2
TensorRT 7.0

GitHub: Image_Classify_TRT
