Choosing versions of PyTorch, CUDA, and TensorRT

VS Code version

The latest version of VS Code on Linux may not be able to debug Python.
Download version 1.85 instead: https://code.visualstudio.com/updates/v1_85

CUDA version


https://developer.nvidia.com/Cuda-Toolkit-archive

Because we are limited to TensorRT 8.6 (explained below), the highest usable CUDA version is 12.1, and the PyTorch builds only support 12.1 or 11.8.
After reinstalling the CUDA toolkit, PyTorch must be reinstalled as well.

PyTorch version

https://pytorch.org/get-started/locally/

Select the CUDA 12.1 build:

conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
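
To confirm that the installed build actually matches CUDA 12.1, a minimal check (assuming only that the conda environment created above is active) is to query PyTorch directly:

import torch

# CUDA version this PyTorch build was compiled against; expected "12.1" for the command above
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)
# True only when the driver and the PyTorch build are compatible
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))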

TensorRT version

https://developer.nvidia.com/tensorrt/download
Limited by CentOS 7, the highest usable TensorRT version is 8.6.

The post-install message, with the paths to configure and the uninstall commands:

Please make sure that
 -   PATH includes /usr/local/cuda-12.0/bin
 -   LD_LIBRARY_PATH includes /usr/local/cuda-12.0/lib64, or, add /usr/local/cuda-12.0/lib64 to /etc/ld.so.conf and run ldconfig as root

To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-12.0/bin
To uninstall the NVIDIA Driver, run nvidia-uninstall
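
Once PATH and LD_LIBRARY_PATH are set, a quick sanity check (a minimal sketch, assuming the TensorRT 8.6 Python wheel is installed in the same environment) is to import the bindings and create a builder:

import tensorrt as trt

# should print 8.6.x if the installation above succeeded
print("TensorRT:", trt.__version__)

# creating a Logger and a Builder also verifies that the TensorRT shared
# libraries on LD_LIBRARY_PATH can be loaded
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
print("builder created:", builder is not None)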

Errors

Segmentation fault when running nn.Conv1d

Using the nn.Conv1d operator in PyTorch crashes with:
[1] 17446 segmentation fault python test.py

This problem is hard to find an answer for online; it is caused by mismatched versions. Reinstall CUDA and PyTorch. A quick way to verify the fix is shown below.
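
A minimal sanity check (with hypothetical layer sizes, assuming any CUDA-capable GPU is present): if the versions match, it prints the output shape instead of crashing.

import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
# a small Conv1d and one forward pass; this is what segfaults when CUDA and PyTorch versions mismatch
conv = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3).to(device)
x = torch.randn(1, 4, 16, device=device)  # (batch, channels, length)
y = conv(x)
print("Conv1d ok, output shape:", tuple(y.shape))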

Verifying that the TensorRT environment works

Below is a complete example with a loop; run it directly to check whether TensorRT works.

import numpy as np
import tensorrt as trt
from trt_inference import TRTInference  # the wrapper class shown at the end of this post, saved as trt_inference.py


logger = trt.Logger(trt.Logger.WARNING)
# class MyLogger(trt.ILogger):
#     def __init__(self):
#        trt.ILogger.__init__(self)

#     def log(self, severity, msg):
#         pass # Your custom logging implementation here
# logger = MyLogger()

builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))


num_iterations = 3
trip_limit = network.add_constant(shape=(), weights=trt.Weights(np.array([num_iterations], dtype=np.dtype("i"))))
accumaltor_value = network.add_input("input1", dtype=trt.float32, shape=(2, 3))
accumaltor_added_value = network.add_input("input2", dtype=trt.float32, shape=(2, 3))
loop = network.add_loop()
# setting the ITripLimit layer to stop after `num_iterations` iterations
loop.add_trip_limit(trip_limit.get_output(0), trt.TripLimit.COUNT)
# initializing the IRecurrenceLayer with an initial value
rec = loop.add_recurrence(accumaltor_value)
# eltwise inputs are 'accumaltor_added_value', and the IRecurrenceLayer output.
eltwise = network.add_elementwise(accumaltor_added_value, rec.get_output(0), op=trt.ElementWiseOperation.SUM)
# wiring the IRecurrenceLayer with the output of eltwise.
# The IRecurrenceLayer output would now be `accumaltor_value` for the first iteration, and the eltwise output for any other iteration
rec.set_input(1, eltwise.get_output(0))
# marking the IRecurrenceLayer output as the Loop output
loop_out = loop.add_loop_output(rec.get_output(0), trt.LoopOutput.LAST_VALUE)
# marking the Loop output as the network output
network.mark_output(loop_out.get_output(0))


inputs = {}
outputs = {}
expected = {}

inputs[accumaltor_value.name] = np.array(
    [
        [2.7, -4.9, 23.34],
        [8.9, 10.3, -19.8],
    ])
inputs[accumaltor_added_value.name] = np.array(
    [
        [1.1, 2.2, 3.3],
        [-5.7, 1.3, 4.6],
    ])

outputs[loop_out.get_output(0).name] = eltwise.get_input(0).shape
expected[loop_out.get_output(0).name] = inputs[accumaltor_value.name] + inputs[accumaltor_added_value.name] * num_iterations
print("Expected:", expected)

builder_config = builder.create_builder_config()
builder_config.set_flag(trt.BuilderFlag.VERSION_COMPATIBLE)
builder_config.set_flag(trt.BuilderFlag.EXCLUDE_LEAN_RUNTIME)
plan = builder.build_serialized_network(network, builder_config)

# v10_runtime = trt.Runtime(logger)
# v8_shim_runtime = v10_runtime.load_runtime('/home/mark.yj/TensorRT-8.6.1.6/bin/trtexec')
# engine = v10_runtime.deserialize_cuda_engine(plan)
trtInfer = TRTInference(plan)
r = trtInfer.infer(inputs, outputs)
print("Prediction:", r)

TRTInference used in the code above is a wrapper class (save it as trt_inference.py so the import in the example works):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # note: this import is required; it creates the CUDA context
import tensorrt as trt
from collections import OrderedDict

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem, dtype, size):
        self.host = host_mem
        self.device = device_mem
        self.dtype = dtype
        self.size = size

def allocate_buffers(engine, context, input_data):
    # adjust the binding shapes according to input_data
    for key, value in input_data.items():
        r = context.set_binding_shape(engine.get_binding_index(key), value.shape)
        if not r:
            print(f"set binding shape False: {key}")
    
    inputs = OrderedDict()
    outputs = OrderedDict()
    bindings = []

    for binding_idx, binding_name in enumerate(engine):
        size = trt.volume(context.get_binding_shape(binding_idx))
        # size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding_idx))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding_idx):
            inputs[binding_name] = HostDeviceMem(host_mem, device_mem, dtype, size)
        else:
            outputs[binding_name] = HostDeviceMem(host_mem, device_mem, dtype, size)
    return inputs, outputs, bindings

# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be dicts of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs: dict, outputs: dict, stream):
    inputs, outputs = inputs.values(), outputs.values()
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

class TRTInference(object):
    def __init__(self, plan):
        TRT_LOGGER = trt.Logger()
        self.trt_runtime = trt.Runtime(TRT_LOGGER)
        if isinstance(plan, str):
            plan = open(plan, "rb").read()
        self.trt_engine = self.trt_runtime.deserialize_cuda_engine(plan)
        self.context = self.trt_engine.create_execution_context()
    
    def infer(self, input_data: dict, output_shapes: dict):
        # This allocates memory for network inputs/outputs on both CPU and GPU
        self.inputs, self.outputs, self.bindings = \
            allocate_buffers(self.trt_engine, self.context, input_data)
        
        self.stream  = cuda.Stream()
        
        for binding_name, mem in self.inputs.items():
            input_type = mem.dtype
            input_fix = np.ascontiguousarray(input_data[binding_name].astype(input_type))
            mem.host = input_fix
            # input = np.array(input_data[binding_name], dtype=mem.dtype, order='C')
            # np.copyto(mem.host, input.ravel())

        # Fetch output from the model
        res = do_inference_v2(
            self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream)

        # Before doing post-processing, we need to reshape the outputs as the common.do_inference will
        # give us flat arrays.
        outputs_reshape = []
        for binding_name, shape in output_shapes.items():
            ot = self.outputs[binding_name]
            outputs_reshape.append(ot.host.reshape(shape))
        # And return results
        return outputs_reshape