I. ONNX Conversion Steps
TensorRT inference requires an NVIDIA GPU environment. The overall workflow is:
- Set the batch size
- Set the precision
- Convert the model
- Verify accuracy before and after conversion
II. Converting ONNX to a TensorRT Engine
1. Using the command-line tool trtexec
Convert to a TensorRT engine:
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine_intro.trt --explicitBatch
# Flag explanations
--maxBatch: sets an upper bound on the batch size; used with implicit-batch models whose input batch size is not fixed
--explicitBatch: builds the engine with the explicit batch dimension defined by the ONNX model structure (explicit batch is the default for ONNX models in recent TensorRT versions)
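For an ONNX model exported with dynamic input shapes, trtexec can also take an optimization profile. A sketch (the tensor name "input" and the shapes are placeholders; check trtexec --help on your TensorRT version for the exact flags):
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine_dynamic.trt --minShapes=input:1x3x224x224 --optShapes=input:8x3x224x224 --maxShapes=input:32x3x224x224 --fp16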
2. Using the TensorRT Python API
import tensorrt as trt

def generate_engine(onnx_path, engine_path):
    # 1. Create the TensorRT logger
    logger = trt.Logger(trt.Logger.WARNING)
    # Initialize plugins
    trt.init_libnvinfer_plugins(logger, namespace="")
    # 2. Create a builder, passing in the logger
    builder = trt.Builder(logger)
    # 3. Create a builder config, which controls how TensorRT optimizes the model
    config = builder.create_builder_config()
    # Set the workspace memory limit
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1 MiB
    # Set the precision
    config.set_flag(trt.BuilderFlag.FP16)
    # INT8 additionally requires calibration
    # 4. Create a network. EXPLICIT_BATCH: the batch dimension is explicit in the
    #    network definition (required for ONNX models)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    # Create the ONNX parser
    parser = trt.OnnxParser(network, logger)
    # Parse the ONNX model and populate the network
    success = parser.parse_from_file(onnx_path)
    if not success:
        for idx in range(parser.num_errors):
            print(parser.get_error(idx))
        raise RuntimeError("Failed to parse ONNX model")
    # 5. Serialize the engine, i.e. produce the .engine model
    serialized_engine = builder.build_serialized_network(network, config)
    # Save the serialized engine for later use. The engine is not portable:
    # it is tied to the TensorRT version and GPU type it was built with.
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)
    # 6. To deserialize the engine later (i.e. load it for inference), use the Runtime API:
    # runtime = trt.Runtime(logger)
    # with open(engine_path, "rb") as f:
    #     serialized_engine = f.read()
    # engine = runtime.deserialize_cuda_engine(serialized_engine)
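A minimal usage sketch (the engine filename is a placeholder; adjust both paths to your model):

if __name__ == "__main__":
    generate_engine("resnet50/model.onnx", "resnet_engine_api.trt")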
III. Deploying the TensorRT Engine
1. TensorRT engine inference
yolo.h:
// yolo.h
class YOLO
{
public:
    // 1. Load the engine model and initialize
    YOLO(std::string engine_file_path);
    // Virtual destructor: when deleting a derived object through a base-class
    // pointer, the derived destructor runs first, then the base destructor.
    // Without virtual, only the base destructor would run, so declare the
    // destructor virtual whenever a class is used as a base class.
    virtual ~YOLO();
    // 2. Load image + preprocessing + inference + postprocessing
    void detect_img(std::string image_path);
    void detect_video(std::string video_path);
    // Resize image + letterbox (rectangular inference)
    cv::Mat static_resize(cv::Mat& img);
    // Image normalization
    float* blobFromImage(cv::Mat& img);
    // 3. Inference
    void doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape);
private:
    static const int INPUT_W = 640;
    static const int INPUT_H = 640;
    // Names of the engine's input and output bindings
    const char* INPUT_BLOB_NAME = "image_arrays";
    const char* OUTPUT_BLOB_NAME = "outputs";
    float* prob;
    int output_size = 1;
    // The runtime creates the engine; the engine creates the context
    IRuntime* runtime;
    ICudaEngine* engine;
    IExecutionContext* context;
};
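Taken together, the flow is: the constructor deserializes the engine; detect_img then chains static_resize (letterbox), blobFromImage (normalization), doInference, and the postprocessing shown below.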
yolo.cpp:
YOLO::YOLO(std::string engine_file_path)
{
    size_t size{0};
    char *trtModelStream{nullptr};
    // Read the serialized engine file
    std::ifstream file(engine_file_path, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
    std::cout << "engine init finished" << std::endl;
    runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    // Compute the output element count from the output binding's dimensions
    // (binding 0 is assumed to be the input, binding 1 the output)
    auto out_dims = engine->getBindingDimensions(1);
    for (int j = 0; j < out_dims.nbDims; j++) {
        this->output_size *= out_dims.d[j];
    }
    this->prob = new float[this->output_size];
}
YOLO::~YOLO()
{
    std::cout << "yolo destroy" << std::endl;
    this->context->destroy();
    this->engine->destroy();
    this->runtime->destroy();
    // Free the host output buffer allocated in the constructor
    delete[] this->prob;
}
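Note: destroy() has been deprecated since TensorRT 8; on recent versions the objects can instead be released with plain delete (delete context; delete engine; delete runtime;).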
// Preprocessing: letterbox
cv::Mat YOLO::static_resize(cv::Mat& img) {
    float r = std::min(this->INPUT_W / (img.cols * 1.0), INPUT_H / (img.rows * 1.0));
    int unpad_w = r * img.cols;
    int unpad_h = r * img.rows;
    cv::Mat re(unpad_h, unpad_w, CV_8UC3);
    cv::resize(img, re, re.size());
    // cv::Mat takes (rows, cols); pad the border with the conventional value 114
    cv::Mat out(this->INPUT_H, this->INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
    return out;
}
// Preprocessing: normalization, HWC BGR uint8 -> CHW RGB float in [0, 1]
float* YOLO::blobFromImage(cv::Mat& img){
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    float* blob = new float[img.total() * 3];
    int channels = 3;
    int img_h = img.rows;
    int img_w = img.cols;
    for (int c = 0; c < channels; c++)
    {
        for (int h = 0; h < img_h; h++)
        {
            for (int w = 0; w < img_w; w++)
            {
                blob[c * img_w * img_h + h * img_w + w] =
                    (((float)img.at<cv::Vec3b>(h, w)[c]) / 255.0f);
            }
        }
    }
    return blob;
}
// Inference
void YOLO::doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // To bind the buffers, we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    // Create GPU buffers on the device
    CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
    // Create a stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to the device, infer asynchronously, and DMA output back to the host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueueV2 is the call for explicit-batch engines; enqueue(batch, ...) is only for implicit batch
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release the stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
// Combine preprocessing, inference, and postprocessing
void YOLO::detect_img(std::string image_path)
{
    cv::Mat img = cv::imread(image_path);
    int img_w = img.cols;
    int img_h = img.rows;
    // Letterbox
    cv::Mat pr_img = this->static_resize(img);
    std::cout << "blob image" << std::endl;
    // Normalization
    float* blob;
    blob = blobFromImage(pr_img);
    // Letterbox scale factor, needed to map boxes back to the original image
    float scale = std::min(this->INPUT_W / (img.cols * 1.0), this->INPUT_H / (img.rows * 1.0));
    // Run inference
    auto start = std::chrono::system_clock::now();
    doInference(*context, blob, this->prob, output_size, pr_img.size());
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    // Postprocess the results
    std::vector<Object> objects;
    decode_outputs(this->prob, this->output_size, objects, scale, img_w, img_h);
    draw_objects(img, objects, image_path);
    delete[] blob;  // allocated with new[] in blobFromImage
}
// Postprocessing: decode raw output into boxes, run NMS, then map back to the original image
static void decode_outputs(float* prob, int output_size, std::vector<Object>& objects, float scale, const int img_w, const int img_h) {
    std::vector<Object> proposals;
    generate_yolo_proposals(prob, output_size, BBOX_CONF_THRESH, proposals);
    std::cout << "num of boxes before nms: " << proposals.size() << std::endl;
    qsort_descent_inplace(proposals);
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, NMS_THRESH);
    int count = picked.size();
    std::cout << "num of boxes: " << count << std::endl;
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];
        // Undo the letterbox scaling to get coordinates in the original, unpadded image
        float x0 = (objects[i].rect.x) / scale;
        float y0 = (objects[i].rect.y) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
        // Clip to the image bounds
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }
}
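The helpers generate_yolo_proposals, qsort_descent_inplace, nms_sorted_bboxes, and draw_objects are standard YOLO decoding, sorting, NMS, and visualization routines; full implementations are in the repository linked under the references.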
2. Verifying accuracy before and after conversion
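A straightforward check is to feed the same input to ONNX Runtime and to the TensorRT engine and compare the outputs. Below is a minimal sketch, assuming onnxruntime and numpy are installed; "model.onnx" and the input name are placeholders (the YOLO engine above uses "image_arrays"), and trt_out stands for the output collected from the TensorRT side (e.g. the prob buffer filled by doInference):

import numpy as np
import onnxruntime as ort

def compare_outputs(onnx_out, trt_out, atol=1e-3):
    # Element-wise difference and cosine similarity between the two backends
    diff = np.abs(onnx_out - trt_out)
    cos = np.dot(onnx_out.ravel(), trt_out.ravel()) / (
        np.linalg.norm(onnx_out) * np.linalg.norm(trt_out) + 1e-12)
    print(f"max abs diff: {diff.max():.6f}, mean abs diff: {diff.mean():.6f}, cosine: {cos:.6f}")
    return bool(np.allclose(onnx_out, trt_out, atol=atol))

# Reference output from ONNX Runtime on a fixed input
dummy_input = np.random.rand(1, 3, 640, 640).astype(np.float32)
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
onnx_out = sess.run(None, {"image_arrays": dummy_input})[0]
# compare_outputs(onnx_out, trt_out)

With FP16 enabled, expect small numeric differences rather than exact equality; a cosine similarity very close to 1 is the usual sign that the conversion is sound. NVIDIA's Polygraphy tool can automate the same comparison (e.g. polygraphy run model.onnx --trt --onnxrt).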
References:
Full code: https://github.com/Linaom1214/TensorRT-For-YOLO-Series
TensorRT Quick Start Guide: https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html