Exporting an ONNX file to a TensorRT engine in FP16 format
The script below uses the TensorRT Python API (written against TensorRT 8.x) to parse the ONNX model and serialize an FP16 engine.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
# Paths for the input ONNX model and the output engine
onnx_file_path = 'model.onnx'
engine_file_path = 'model_tesfp16.trt'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
# Create an explicit-batch network (flag value 1 == EXPLICIT_BATCH bit)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)
# Parse the ONNX file, reporting parser errors on failure
with open(onnx_file_path, 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('Failed to parse the ONNX file')
# Build the TensorRT engine with FP16 enabled
builder_config = builder.create_builder_config()
builder_config.max_workspace_size = 4 * (1 << 30)  # 4 GiB workspace
builder_config.set_flag(trt.BuilderFlag.FP16)
engine = builder.build_engine(network, builder_config)
if engine is None:
    raise RuntimeError('Failed to build the engine')
# Serialize the engine to disk
with open(engine_file_path, 'wb') as f:
    f.write(engine.serialize())
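As a quick sanity check, the file just written can be deserialized and its I/O bindings listed. A minimal sketch, using the same TensorRT 8.x Python API as above:
# Reload the serialized engine and print its input/output bindings
runtime = trt.Runtime(TRT_LOGGER)
with open(engine_file_path, 'rb') as f:
    check_engine = runtime.deserialize_cuda_engine(f.read())
for i in range(check_engine.num_bindings):
    kind = 'input' if check_engine.binding_is_input(i) else 'output'
    print(kind, check_engine.get_binding_name(i), check_engine.get_binding_shape(i))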
Next, run inference on the CycleGAN network, first in Python and then in C++. The Python version:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
import cv2
# Load the serialized TRT engine
engine_file_path = 'model_tesfp16.trt'
with open(engine_file_path, 'rb') as f:
    engine_data = f.read()
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)
# Create the execution context
context = engine.create_execution_context()
# Allocate host-side input and output buffers
input_shape = (1, 1, 512, 512)   # input shape; use (1, 3, 512, 512) for 3-channel input
output_shape = (1, 1, 512, 512)  # output shape; use (1, 3, 512, 512) for 3-channel output
# input_data = np.random.randn(*input_shape).astype(np.float32)  # random input for testing
input_data = cv2.imread("image1644.png", 0)  # read as grayscale; assumed to be 512x512
input_data = input_data.reshape(input_shape).astype(np.float32)
output_data = np.empty(output_shape, dtype=np.float32)
# Allocate device memory
d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)
# Create a CUDA stream
stream = cuda.Stream()
# Copy the input from host to device asynchronously
cuda.memcpy_htod_async(d_input, input_data, stream)
# Run TensorRT inference and time it
T1 = time.time()
bindings = [int(d_input), int(d_output)]
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Copy the output back from device to host
cuda.memcpy_dtoh_async(output_data, d_output, stream)
# Wait for inference to finish
stream.synchronize()
T2 = time.time()
print('Inference time: %s ms' % ((T2 - T1) * 1000))
# Post-process and save the result
print(type(output_data))
a_sque = np.squeeze(output_data)
a_sque = -a_sque * 255
a_sque = 255 - a_sque
cv2.imwrite("tensorrt_ilubuntu.jpg", a_sque)
print("output_data = ", output_data)
For the C++ version, first configure the CMakeLists.txt:
cmake_minimum_required(VERSION 3.10)  # enable_language(CUDA) requires CMake >= 3.8
project(cycle_gan)
add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Release)
find_package(CUDA REQUIRED)
enable_language(CUDA)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/usr/local/cuda-11.6/include)
link_directories(/usr/local/cuda-11.6/lib64)
# tensorrt
include_directories(/home/mao/bag/TensorRT-8.2.5.1/include/)
link_directories(/home/mao/bag/TensorRT-8.2.5.1/lib/)
# include_directories(/usr/include/x86_64-linux-gnu/)
# link_directories(/usr/lib/x86_64-linux-gnu/)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})
cuda_add_executable(cycle_gan main.cpp )
target_link_libraries(cycle_gan nvonnxparser nvinfer nvinfer_plugin)
target_link_libraries(cycle_gan cudart)
target_link_libraries(cycle_gan ${OpenCV_LIBS})
if(UNIX)
add_definitions(-O2 -pthread)
endif(UNIX)
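With the CUDA and TensorRT paths above adapted to your installation, the project builds with the usual out-of-source CMake workflow (mkdir build && cd build && cmake .. && make). The inference code itself goes in main.cpp: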
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "NvInferPlugin.h"
#include "opencv2/opencv.hpp"
#include <time.h>
// Path to the serialized engine file
const std::string enginePath = "/home/mao/code/style/GAN/model_tesfp16.trt";
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // Customize the log handling per severity as needed
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR:
            std::cerr << "INTERNAL_ERROR: " << msg << std::endl;
            break;
        case Severity::kERROR:
            std::cerr << "ERROR: " << msg << std::endl;
            break;
        case Severity::kWARNING:
            std::cerr << "WARNING: " << msg << std::endl;
            break;
        case Severity::kINFO:
            std::cout << "INFO: " << msg << std::endl;
            break;
        default:
            break;
        }
    }
};
using namespace nvinfer1;
static Logger gLogger;
int main(int argc, char** argv)
{
    initLibNvInferPlugins(&gLogger, "");
    // Create the TensorRT runtime object
    IRuntime* runtime = createInferRuntime(gLogger);
    // Read the serialized engine from disk
    std::ifstream engineFile(enginePath, std::ios::binary);
    if (!engineFile)
    {
        std::cerr << "Failed to open the engine file for reading." << std::endl;
        return 1;
    }
    engineFile.seekg(0, std::ios::end);
    const size_t fileSize = engineFile.tellg();
    engineFile.seekg(0, std::ios::beg);
    std::vector<char> engineData(fileSize);
    engineFile.read(engineData.data(), fileSize);
    engineFile.close();
    // Deserialize the engine (the IPluginFactory overload was removed in TensorRT 8)
    ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), fileSize);
    if (!engine)
    {
        std::cerr << "Failed to deserialize the engine." << std::endl;
        return 1;
    }
    // Create the execution context
    IExecutionContext* context = engine->createExecutionContext();
    // Allocate device memory for input and output
    const int batchSize = 1;
    const int inputSize = 512 * 512;   // number of input elements
    const int outputSize = 512 * 512;  // number of output elements
    void* deviceInput;
    void* deviceOutput;
    cudaMalloc(&deviceInput, batchSize * inputSize * sizeof(float));
    cudaMalloc(&deviceOutput, batchSize * outputSize * sizeof(float));
    std::vector<void*> bindings = {deviceInput, deviceOutput};
    // Create a CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Prepare the input and copy it to device memory
    std::vector<float> input(batchSize * inputSize);
    // Read the input image as grayscale
    cv::Mat image = cv::imread("/home/mao/code/style/GAN/0.png", 0);
    // Resize, convert to float in [0, 1], and copy into the input tensor
    cv::Mat resizedImage;
    cv::resize(image, resizedImage, cv::Size(512, 512));
    cv::Mat floatImage;
    resizedImage.convertTo(floatImage, CV_32F, 1.0 / 255.0);
    std::memcpy(input.data(), floatImage.data, inputSize * sizeof(float));
    cudaMemcpyAsync(deviceInput, input.data(), batchSize * inputSize * sizeof(float), cudaMemcpyHostToDevice, stream);
    // Run inference (explicit-batch engines must use enqueueV2, not enqueue)
    auto t0 = std::chrono::high_resolution_clock::now();
    context->enqueueV2(bindings.data(), stream, nullptr);
    // Copy the output back from device to host
    std::vector<float> output(batchSize * outputSize);
    cudaMemcpyAsync(output.data(), deviceOutput, batchSize * outputSize * sizeof(float), cudaMemcpyDeviceToHost, stream);
    // Wait for inference and the copy to finish
    cudaStreamSynchronize(stream);
    auto t1 = std::chrono::high_resolution_clock::now();
    std::cout << "Inference time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
              << " ms" << std::endl;
    // The output is a flat 1-D array; reshape it to 2-D yourself as needed
    std::cout << "output[0] = " << output[0] << std::endl;
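    // Optional sketch, mirroring the Python post-processing above: wrap the flat
    // output in a cv::Mat and save it as an image. The 255.0 scale assumes the
    // network outputs values in [0, 1]; adjust it (and the hypothetical output
    // file name) to match your model.
    cv::Mat result(512, 512, CV_32F, output.data());
    cv::Mat result8u;
    result.convertTo(result8u, CV_8U, 255.0);
    cv::imwrite("tensorrt_cpp_out.jpg", result8u);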
    // Release resources
    cudaStreamDestroy(stream);
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}